diff --git a/.github/workflows/pypi_wheel.yml b/.github/workflows/pypi_wheel.yml
index e350cd40..bf0565af 100644
--- a/.github/workflows/pypi_wheel.yml
+++ b/.github/workflows/pypi_wheel.yml
@@ -28,7 +28,7 @@ jobs:
ls ${{ github.workspace }}/wheelhouse
- name: Publish to PyPI
run: |
- pip install twine
+ pip install twine==6.0.1
export TWINE_USERNAME=__token__
export TWINE_PASSWORD="${{ secrets.PYPI_API_TOKEN }}"
twine upload --verbose "${{ github.workspace }}/wheelhouse/*"
@@ -51,7 +51,7 @@ jobs:
ls ${{ github.workspace }}/wheelhouse
- name: Publish to PyPI
run: |
- pip install twine
+ pip install twine==6.0.1
export TWINE_USERNAME=__token__
export TWINE_PASSWORD="${{ secrets.PYPI_API_TOKEN }}"
twine upload --verbose "${{ github.workspace }}/wheelhouse/*"
@@ -66,7 +66,7 @@ jobs:
python3 setup.py sdist
- name: Publish to PyPI
run: |
- pip install twine
+ pip install twine==6.0.1
export TWINE_USERNAME=__token__
export TWINE_PASSWORD="${{ secrets.PYPI_API_TOKEN }}"
twine upload --verbose "${{ github.workspace }}/dist/*"
diff --git a/CHANGELOG b/CHANGELOG
index 7f747686..a95f5108 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -1,3 +1,12 @@
+v1.3.1 (2025-02-04)
+-------------------
+* New Features
+ - Analytical Hessian for PCM solvent model
+  - Driver for 3c methods (wB97X-3c, r2SCAN-3c, B97-3c, etc.)
+* Improvements
+  - Preconditioner and computational efficiency of Davidson iterations for TDDFT
+
+
v1.3.0 (2025-01-07)
-------------------
* New Features
diff --git a/examples/40-all_electron_scf.py b/examples/40-all_electron_scf.py
new file mode 100644
index 00000000..a33f2953
--- /dev/null
+++ b/examples/40-all_electron_scf.py
@@ -0,0 +1,50 @@
+#!/usr/bin/env python
+# Copyright 2025 The PySCF Developers. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+'''
+All-electron Gamma-point and k-point Hartree-Fock/DFT using the density fitting approximation
+'''
+
+import numpy as np
+import pyscf
+
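+# Conventional cubic cell of diamond (a = 3.5668 A) containing 8 carbon atoms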
+cell = pyscf.M(
+ a = np.eye(3)*3.5668,
+ atom = '''C 0. 0. 0.
+ C 0.8917 0.8917 0.8917
+ C 1.7834 1.7834 0.
+ C 2.6751 2.6751 0.8917
+ C 1.7834 0. 1.7834
+ C 2.6751 0.8917 2.6751
+ C 0. 1.7834 1.7834
+ C 0.8917 2.6751 2.6751''',
+ basis = 'ccpvdz',
+ verbose = 5,
+)
+
+#
+# Gamma point HF and DFT
+#
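+# to_gpu() moves the calculation to the GPU; density_fit() enables the density fitting approximation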
+mf = cell.RHF().to_gpu().density_fit().run()
+
+mf = cell.RKS(xc='pbe0').to_gpu().density_fit().run()
+
+#
+# K-point sampled HF and DFT
+#
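+# make_kpts builds a uniform 2x2x2 k-point mesh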
+kpts = cell.make_kpts([2,2,2])
+kmf = cell.KRHF(kpts=kpts).to_gpu().density_fit().run()
+
+kmf = cell.KRKS(xc='pbe0', kpts=kpts).to_gpu().density_fit().run()
diff --git a/gpu4pyscf/__config__.py b/gpu4pyscf/__config__.py
index 1bb12f85..bfd8d8d8 100644
--- a/gpu4pyscf/__config__.py
+++ b/gpu4pyscf/__config__.py
@@ -14,11 +14,11 @@
import cupy
-_num_devices = cupy.cuda.runtime.getDeviceCount()
+num_devices = cupy.cuda.runtime.getDeviceCount()
# TODO: switch to non_blocking stream (currently blocked by libxc)
-_streams = [None] * _num_devices
-for device_id in range(_num_devices):
+_streams = [None] * num_devices
+for device_id in range(num_devices):
with cupy.cuda.Device(device_id):
_streams[device_id] = cupy.cuda.stream.Stream(non_blocking=False)
@@ -38,11 +38,16 @@
mem_fraction = 0.9
cupy.get_default_memory_pool().set_limit(fraction=mem_fraction)
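+# Prefer the opt-in shared memory limit when the device offers more than the default 64 KB per block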
+if props['sharedMemPerBlockOptin'] > 65536:
+ shm_size = props['sharedMemPerBlockOptin']
+else:
+ shm_size = props['sharedMemPerBlock']
+
# Check P2P data transfer is available
_p2p_access = True
-if _num_devices > 1:
- for src in range(_num_devices):
- for dst in range(_num_devices):
+if num_devices > 1:
+ for src in range(num_devices):
+ for dst in range(num_devices):
if src != dst:
can_access_peer = cupy.cuda.runtime.deviceCanAccessPeer(src, dst)
_p2p_access &= can_access_peer
diff --git a/gpu4pyscf/__init__.py b/gpu4pyscf/__init__.py
index af2c2982..4cd95fbc 100644
--- a/gpu4pyscf/__init__.py
+++ b/gpu4pyscf/__init__.py
@@ -12,6 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
-__version__ = '1.3.0'
+__version__ = '1.3.1'
from . import lib, grad, hessian, solvent, scf, dft, tdscf
diff --git a/gpu4pyscf/df/df.py b/gpu4pyscf/df/df.py
index da61804c..c58c1428 100644
--- a/gpu4pyscf/df/df.py
+++ b/gpu4pyscf/df/df.py
@@ -25,7 +25,7 @@
from gpu4pyscf.df import int3c2e, df_jk
from gpu4pyscf.lib import logger
from gpu4pyscf import __config__
-from gpu4pyscf.__config__ import _streams, _num_devices
+from gpu4pyscf.__config__ import _streams, num_devices
MIN_BLK_SIZE = getattr(__config__, 'min_ao_blksize', 128)
ALIGNED = getattr(__config__, 'ao_aligned', 32)
@@ -218,7 +218,7 @@ def cholesky_eri_gpu(intopt, mol, auxmol, cd_low,
# CDERI will be equally distributed to the devices
# Other devices usually have more memory available than Device 0
# CDERI will use up to 40% of the available memory
- use_gpu_memory = naux * npairs * 8 < 0.4 * avail_mem * _num_devices
+ use_gpu_memory = naux * npairs * 8 < 0.4 * avail_mem * num_devices
if use_gpu_memory:
log.debug("Saving CDERI on GPU")
@@ -226,9 +226,9 @@ def cholesky_eri_gpu(intopt, mol, auxmol, cd_low,
log.debug("Saving CDERI on CPU")
_cderi = {}
- aux_blksize = (naux + _num_devices - 1) // _num_devices
+ aux_blksize = (naux + num_devices - 1) // num_devices
aux_blksize = (aux_blksize + ALIGNED - 1) // ALIGNED * ALIGNED
- for device_id in range(_num_devices):
+ for device_id in range(num_devices):
p0 = min(aux_blksize*device_id, naux)
p1 = min(aux_blksize*(device_id+1), naux)
#for device_id, (p0,p1) in enumerate(lib.prange(0, naux, aux_blksize)):
@@ -246,16 +246,16 @@ def cholesky_eri_gpu(intopt, mol, auxmol, cd_low,
npairs_per_ctr = np.array(npairs_per_ctr)
total_task_list = np.argsort(npairs_per_ctr)
task_list_per_device = []
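+    # Distribute the size-sorted tasks round-robin over the devices to balance the workload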
- for device_id in range(_num_devices):
- task_list_per_device.append(total_task_list[device_id::_num_devices])
+ for device_id in range(num_devices):
+ task_list_per_device.append(total_task_list[device_id::num_devices])
cd_low_f = cupy.array(cd_low, order='F', copy=False)
cd_low_f = tag_array(cd_low_f, tag=cd_low.tag)
cupy.cuda.get_current_stream().synchronize()
futures = []
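+    # One worker thread per device; each thread evaluates its share of the CDERI tensor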
- with ThreadPoolExecutor(max_workers=_num_devices) as executor:
- for device_id in range(_num_devices):
+ with ThreadPoolExecutor(max_workers=num_devices) as executor:
+ for device_id in range(num_devices):
task_list = task_list_per_device[device_id]
future = executor.submit(_cderi_task, intopt, cd_low_f, task_list, _cderi, aux_blksize,
omega=omega, sr_only=sr_only, device_id=device_id)
@@ -352,7 +352,7 @@ def _cderi_task(intopt, cd_low, task_list, _cderi, aux_blksize,
for slice_id, (p0,p1) in enumerate(lib.prange(0, naux, aux_blksize)):
tmp = cupy.array(cderi_block[p0:p1], order='C', copy=True)
copy_array(tmp, _cderi[slice_id][:p1-p0,ij0:ij1])
- elif _num_devices > 1:
+ elif num_devices > 1:
        # Multi-GPU case: copy data to the other devices
for dev_id, (p0,p1) in enumerate(lib.prange(0, naux, aux_blksize)):
# Making a copy for contiguous data transfer
diff --git a/gpu4pyscf/df/df_jk.py b/gpu4pyscf/df/df_jk.py
index 5561cf9c..66f1dd49 100644
--- a/gpu4pyscf/df/df_jk.py
+++ b/gpu4pyscf/df/df_jk.py
@@ -26,7 +26,7 @@
from gpu4pyscf.dft import rks, uks, numint
from gpu4pyscf.scf import hf, uhf
from gpu4pyscf.df import df, int3c2e
-from gpu4pyscf.__config__ import _streams, _num_devices
+from gpu4pyscf.__config__ import _streams, num_devices
def _pin_memory(array):
mem = cupy.cuda.alloc_pinned_memory(array.nbytes)
@@ -453,8 +453,8 @@ def get_jk(dfobj, dms_tag, hermi=0, with_j=True, with_k=True, direct_scf_tol=1e-
mo_coeff = intopt.sort_orbitals(mo_coeff, axis=[1])
futures = []
- with ThreadPoolExecutor(max_workers=_num_devices) as executor:
- for device_id in range(_num_devices):
+ with ThreadPoolExecutor(max_workers=num_devices) as executor:
+ for device_id in range(num_devices):
future = executor.submit(
_jk_task_with_mo,
dfobj, dms, mo_coeff, mo_occ,
@@ -474,8 +474,8 @@ def get_jk(dfobj, dms_tag, hermi=0, with_j=True, with_k=True, direct_scf_tol=1e-
mo1s = [intopt.sort_orbitals(mo1, axis=[1]) for mo1 in mo1s]
futures = []
- with ThreadPoolExecutor(max_workers=_num_devices) as executor:
- for device_id in range(_num_devices):
+ with ThreadPoolExecutor(max_workers=num_devices) as executor:
+ for device_id in range(num_devices):
future = executor.submit(
_jk_task_with_mo1,
dfobj, dms, mo1s, occ_coeffs,
@@ -486,8 +486,8 @@ def get_jk(dfobj, dms_tag, hermi=0, with_j=True, with_k=True, direct_scf_tol=1e-
# general K matrix with density matrix
else:
futures = []
- with ThreadPoolExecutor(max_workers=_num_devices) as executor:
- for device_id in range(_num_devices):
+ with ThreadPoolExecutor(max_workers=num_devices) as executor:
+ for device_id in range(num_devices):
future = executor.submit(
_jk_task_with_dm, dfobj, dms,
hermi=hermi, device_id=device_id,
diff --git a/gpu4pyscf/df/grad/jk.py b/gpu4pyscf/df/grad/jk.py
index 2bbf9d9e..4595af65 100644
--- a/gpu4pyscf/df/grad/jk.py
+++ b/gpu4pyscf/df/grad/jk.py
@@ -18,7 +18,7 @@
from gpu4pyscf.df.int3c2e import get_int3c2e_ip_jk, VHFOpt, _split_tasks
from gpu4pyscf.lib.cupy_helper import contract, concatenate, reduce_to_device
from gpu4pyscf.lib import logger
-from gpu4pyscf.__config__ import _streams, _num_devices
+from gpu4pyscf.__config__ import _streams, num_devices
def _jk_task(with_df, dm, orbo, with_j=True, with_k=True, device_id=0):
''' # (L|ij) -> rhoj: (L), rhok: (L|oo)
@@ -61,8 +61,8 @@ def get_rhojk(with_df, dm, orbo, with_j=True, with_k=True):
'''
futures = []
cupy.cuda.get_current_stream().synchronize()
- with ThreadPoolExecutor(max_workers=_num_devices) as executor:
- for device_id in range(_num_devices):
+ with ThreadPoolExecutor(max_workers=num_devices) as executor:
+ for device_id in range(num_devices):
future = executor.submit(
_jk_task, with_df, dm, orbo,
with_j=with_j, with_k=with_k, device_id=device_id)
@@ -161,12 +161,12 @@ def get_grad_vjk(with_df, mol, auxmol, rhoj_cart, dm_cart, rhok_cart, orbo_cart,
aux_ao_loc = np.array(intopt.aux_ao_loc)
loads = aux_ao_loc[1:] - aux_ao_loc[:-1]
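+    # loads = auxiliary functions per shell block; _split_tasks balances them across devices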
- task_list = _split_tasks(loads, _num_devices)
+ task_list = _split_tasks(loads, num_devices)
futures = []
cupy.cuda.get_current_stream().synchronize()
- with ThreadPoolExecutor(max_workers=_num_devices) as executor:
- for device_id in range(_num_devices):
+ with ThreadPoolExecutor(max_workers=num_devices) as executor:
+ for device_id in range(num_devices):
future = executor.submit(
_jk_ip_task, intopt, rhoj_cart, dm_cart, rhok_cart, orbo_cart, task_list[device_id],
with_j=with_j, with_k=with_k, device_id=device_id, omega=omega)
diff --git a/gpu4pyscf/df/hessian/jk.py b/gpu4pyscf/df/hessian/jk.py
index 40ab3bfd..5baff1d0 100644
--- a/gpu4pyscf/df/hessian/jk.py
+++ b/gpu4pyscf/df/hessian/jk.py
@@ -23,7 +23,7 @@
from gpu4pyscf.hessian.jk import _ao2mo
from gpu4pyscf.lib import logger
from gpu4pyscf.lib.cupy_helper import contract, cart2sph, reduce_to_device
-from gpu4pyscf.__config__ import _streams, _num_devices
+from gpu4pyscf.__config__ import _streams, num_devices
NROOT_ON_GPU = 7
@@ -171,8 +171,8 @@ def get_jk(dfobj, dms_tag, mo_coeff, mocc, hermi=0,
mo_coeff = [intopt.sort_orbitals(mo, axis=[0]) for mo in mo_coeff]
futures = []
- with ThreadPoolExecutor(max_workers=_num_devices) as executor:
- for device_id in range(_num_devices):
+ with ThreadPoolExecutor(max_workers=num_devices) as executor:
+ for device_id in range(num_devices):
future = executor.submit(
_jk_task_with_mo1,
dfobj, dms, mo_coeff, mo1s, occ_coeffs,
@@ -415,12 +415,12 @@ def get_int3c2e_hjk(intopt, rhoj, rhok, dm0_tag, with_j=True, with_k=True,
ncp_ij = len(intopt.log_qs)
tasks = np.array(list(itertools.product(range(ncp_k), range(ncp_ij))))
task_list = []
- for device_id in range(_num_devices):
- task_list.append(tasks[device_id::_num_devices])
+ for device_id in range(num_devices):
+ task_list.append(tasks[device_id::num_devices])
cupy.cuda.get_current_stream().synchronize()
- with ThreadPoolExecutor(max_workers=_num_devices) as executor:
- for device_id in range(_num_devices):
+ with ThreadPoolExecutor(max_workers=num_devices) as executor:
+ for device_id in range(num_devices):
future = executor.submit(
_int3c2e_ipip_tasks, intopt, task_list[device_id],
rhoj, rhok, dm0_tag, orbo, with_j=with_j, with_k=with_k,
diff --git a/gpu4pyscf/df/hessian/rks.py b/gpu4pyscf/df/hessian/rks.py
index e0d5cd90..321c9654 100644
--- a/gpu4pyscf/df/hessian/rks.py
+++ b/gpu4pyscf/df/hessian/rks.py
@@ -46,8 +46,8 @@ def partial_hess_elec(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None,
mocc = mo_coeff[:,mo_occ>0]
dm0 = numpy.dot(mocc, mocc.T) * 2
- if mf.nlc != '':
- raise NotImplementedError
+ if mf.do_nlc():
+ raise NotImplementedError("2nd derivative of NLC is not implemented.")
omega, alpha, hyb = mf._numint.rsh_and_hybrid_coeff(mf.xc, spin=mol.spin)
with_k = mf._numint.libxc.is_hybrid_xc(mf.xc)
diff --git a/gpu4pyscf/df/hessian/uks.py b/gpu4pyscf/df/hessian/uks.py
index 059f571c..99661740 100644
--- a/gpu4pyscf/df/hessian/uks.py
+++ b/gpu4pyscf/df/hessian/uks.py
@@ -48,8 +48,8 @@ def partial_hess_elec(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None,
moccb = mo_coeff[1][:,mo_occ[1]>0]
dm0a = numpy.dot(mocca, mocca.T)
dm0b = numpy.dot(moccb, moccb.T)
- if mf.nlc != '':
- raise NotImplementedError
+ if mf.do_nlc():
+ raise NotImplementedError("2nd derivative of NLC is not implemented.")
omega, alpha, hyb = mf._numint.rsh_and_hybrid_coeff(mf.xc, spin=mol.spin)
with_k = mf._numint.libxc.is_hybrid_xc(mf.xc)
diff --git a/gpu4pyscf/df/int3c2e.py b/gpu4pyscf/df/int3c2e.py
index e77e30ca..28e7e49e 100644
--- a/gpu4pyscf/df/int3c2e.py
+++ b/gpu4pyscf/df/int3c2e.py
@@ -24,7 +24,7 @@
reduce_to_device, copy_array, transpose_sum)
from gpu4pyscf.lib import logger
from gpu4pyscf.gto.mole import basis_seg_contraction
-from gpu4pyscf.__config__ import _num_devices, _streams
+from gpu4pyscf.__config__ import num_devices, _streams
LMAX_ON_GPU = 8
FREE_CUPY_CACHE = True
@@ -824,11 +824,11 @@ def get_int3c2e_jk(mol, auxmol, dm0_tag, with_k=True, omega=None):
futures = []
aux_ao_loc = np.array(intopt.aux_ao_loc)
loads = aux_ao_loc[1:] - aux_ao_loc[:-1]
- task_list = _split_tasks(loads, _num_devices)
+ task_list = _split_tasks(loads, num_devices)
cupy.cuda.get_current_stream().synchronize()
- with ThreadPoolExecutor(max_workers=_num_devices) as executor:
- for device_id in range(_num_devices):
+ with ThreadPoolExecutor(max_workers=num_devices) as executor:
+ for device_id in range(num_devices):
future = executor.submit(
_int3c2e_jk_task, intopt, task_list[device_id],
dm0_tag, orbo, device_id=device_id, omega=omega)
@@ -935,11 +935,11 @@ def get_int3c2e_ip1_vjk(intopt, rhoj, rhok, dm0_tag, aoslices, with_j=True,
aux_ao_loc = np.array(intopt.aux_ao_loc)
loads = aux_ao_loc[1:] - aux_ao_loc[:-1]
- task_list = _split_tasks(loads, _num_devices)
+ task_list = _split_tasks(loads, num_devices)
cupy.cuda.get_current_stream().synchronize()
- with ThreadPoolExecutor(max_workers=_num_devices) as executor:
- for device_id in range(_num_devices):
+ with ThreadPoolExecutor(max_workers=num_devices) as executor:
+ for device_id in range(num_devices):
future = executor.submit(
_int3c2e_ip1_vjk_task, intopt, task_list[device_id],
rhoj, rhok, dm0_tag, orbo, with_j=with_j, with_k=with_k,
@@ -1033,11 +1033,11 @@ def get_int3c2e_ip2_vjk(intopt, rhoj, rhok, dm0_tag, auxslices,
aux_ao_loc = np.array(intopt.aux_ao_loc)
loads = aux_ao_loc[1:] - aux_ao_loc[:-1]
- task_list = _split_tasks(loads, _num_devices)
+ task_list = _split_tasks(loads, num_devices)
cupy.cuda.get_current_stream().synchronize()
- with ThreadPoolExecutor(max_workers=_num_devices) as executor:
- for device_id in range(_num_devices):
+ with ThreadPoolExecutor(max_workers=num_devices) as executor:
+ for device_id in range(num_devices):
future = executor.submit(
_int3c2e_ip2_vjk_task, intopt, task_list[device_id],
rhoj, rhok, dm0_tag, orbo, with_j=with_j,
@@ -1096,7 +1096,7 @@ def get_int3c2e_ip1_wjk(intopt, dm0_tag, with_k=True, omega=None):
aux_ao_loc = np.array(intopt.aux_ao_loc)
loads = aux_ao_loc[1:] - aux_ao_loc[:-1]
- task_list = _split_tasks(loads, _num_devices)
+ task_list = _split_tasks(loads, num_devices)
nao = intopt.mol.nao
naux = intopt.auxmol.nao
@@ -1107,8 +1107,8 @@ def get_int3c2e_ip1_wjk(intopt, dm0_tag, with_k=True, omega=None):
wk = np.ndarray([naux,nao,nocc,3], dtype=np.float64, order='C', buffer=mem)
cupy.cuda.get_current_stream().synchronize()
- with ThreadPoolExecutor(max_workers=_num_devices) as executor:
- for device_id in range(_num_devices):
+ with ThreadPoolExecutor(max_workers=num_devices) as executor:
+ for device_id in range(num_devices):
future = executor.submit(
_int3c2e_ip1_wjk_task, intopt, task_list[device_id],
dm0_tag, orbo, wk, with_k=with_k, device_id=device_id, omega=omega)
@@ -1156,11 +1156,11 @@ def get_int3c2e_ip2_wjk(intopt, dm0_tag, with_k=True, omega=None):
aux_ao_loc = np.array(intopt.aux_ao_loc)
loads = aux_ao_loc[1:] - aux_ao_loc[:-1]
- task_list = _split_tasks(loads, _num_devices)
+ task_list = _split_tasks(loads, num_devices)
cupy.cuda.get_current_stream().synchronize()
- with ThreadPoolExecutor(max_workers=_num_devices) as executor:
- for device_id in range(_num_devices):
+ with ThreadPoolExecutor(max_workers=num_devices) as executor:
+ for device_id in range(num_devices):
future = executor.submit(
_int3c2e_ip2_wjk, intopt, task_list[device_id],
dm0_tag, orbo, with_k=with_k, device_id=device_id, omega=omega)
diff --git a/gpu4pyscf/dft/gen_grid.py b/gpu4pyscf/dft/gen_grid.py
index 2908b9a3..9dd1d813 100644
--- a/gpu4pyscf/dft/gen_grid.py
+++ b/gpu4pyscf/dft/gen_grid.py
@@ -30,9 +30,10 @@
import cupy
from pyscf import lib
from pyscf import gto
+from pyscf.dft import gen_grid as gen_grid_cpu
+from gpu4pyscf.lib import utils
from pyscf.gto.eval_gto import BLKSIZE, NBINS, CUTOFF, make_screen_index
from pyscf import __config__
-from cupyx.scipy.spatial.distance import cdist
from gpu4pyscf.lib import logger
from gpu4pyscf.dft import radi
from gpu4pyscf.lib.cupy_helper import load_library
@@ -72,13 +73,17 @@ def sg1_prune(nuc, rads, n_ang, radii=radi.SG1RADII):
'''
    # In SG1, the angular grids for the five radial regions
    # are 6, 38, 86, 194, and 86 points, respectively
- leb_ngrid = cupy.array([6, 38, 86, 194, 86])
- alphas = cupy.array((
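+    # SG1 is parameterized only up to Ar; heavier elements fall back to the largest (194-point) angular grid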
+ if nuc >= 19:
+ return 194 * numpy.ones_like(rads, dtype=numpy.int64)
+
+ leb_ngrid = numpy.array([6, 38, 86, 194, 86], dtype=numpy.int64)
+ alphas = numpy.array((
(0.25 , 0.5, 1.0, 4.5),
(0.1667, 0.5, 0.9, 3.5),
(0.1 , 0.4, 0.8, 2.5)))
+
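+    # The tiny constant guards against division by zero in rads/r_atom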
r_atom = radii[nuc] + 1e-200
- rads = cupy.asarray(rads)
+ rads = numpy.asarray(rads)
if nuc <= 2: # H, He
place = ((rads/r_atom).reshape(-1,1) > alphas[0]).sum(axis=1)
elif nuc <= 10: # Li - Ne
@@ -463,8 +468,6 @@ def _load_conf(mod, name, default):
else:
return var
-from pyscf.dft import gen_grid
-from gpu4pyscf.lib import utils
class Grids(lib.StreamObject):
from gpu4pyscf.lib.utils import to_gpu, device
@@ -481,9 +484,10 @@ class Grids(lib.StreamObject):
level = getattr(__config__, 'dft_gen_grid_Grids_level', 3)
alignment = ALIGNMENT_UNIT
cutoff = CUTOFF
- _keys = gen_grid.Grids._keys
+ _keys = gen_grid_cpu.Grids._keys
- __init__ = gen_grid.Grids.__init__
+ __init__ = gen_grid_cpu.Grids.__init__
+ dump_flags = gen_grid_cpu.Grids.dump_flags
def __setattr__(self, key, val):
if key in ('atom_grid', 'atomic_radii', 'radii_adjust', 'radi_method',
@@ -581,12 +585,12 @@ def prune_by_density_(self, rho, threshold=0):
return self
def to_cpu(self):
- grids = gen_grid.Grids(self.mol)
+ grids = gen_grid_cpu.Grids(self.mol)
utils.to_cpu(self, out=grids)
return grids
-_default_rad = gen_grid._default_rad
-RAD_GRIDS = gen_grid.RAD_GRIDS
-_default_ang = gen_grid._default_ang
-ANG_ORDER = gen_grid.ANG_ORDER
-_padding_size = gen_grid._padding_size
+_default_rad = gen_grid_cpu._default_rad
+RAD_GRIDS = gen_grid_cpu.RAD_GRIDS
+_default_ang = gen_grid_cpu._default_ang
+ANG_ORDER = gen_grid_cpu.ANG_ORDER
+_padding_size = gen_grid_cpu._padding_size
diff --git a/gpu4pyscf/dft/numint.py b/gpu4pyscf/dft/numint.py
index bf6c65c9..bb98e857 100644
--- a/gpu4pyscf/dft/numint.py
+++ b/gpu4pyscf/dft/numint.py
@@ -28,7 +28,7 @@
from gpu4pyscf.dft import xc_deriv, xc_alias, libxc
from gpu4pyscf import __config__
from gpu4pyscf.lib import logger
-from gpu4pyscf.__config__ import _streams, _num_devices
+from gpu4pyscf.__config__ import _streams, num_devices
LMAX_ON_GPU = 6
BAS_ALIGNED = 1
@@ -395,7 +395,7 @@ def gen_grid_range(ngrids, device_id, blksize=MIN_BLK_SIZE):
'''
    Calculate the range of grids assigned to the given device
'''
- ngrids_per_device = (ngrids + _num_devices - 1) // _num_devices
+ ngrids_per_device = (ngrids + num_devices - 1) // num_devices
ngrids_per_device = (ngrids_per_device + blksize - 1) // blksize * blksize
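+    # Round each device's share up to a whole number of blocks; the min() below clamps the range to ngrids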
grid_start = min(device_id * ngrids_per_device, ngrids)
grid_end = min((device_id + 1) * ngrids_per_device, ngrids)
@@ -523,8 +523,8 @@ def nr_rks(ni, mol, grids, xc_code, dms, relativity=0, hermi=1,
release_gpu_stack()
cupy.cuda.get_current_stream().synchronize()
futures = []
- with ThreadPoolExecutor(max_workers=_num_devices) as executor:
- for device_id in range(_num_devices):
+ with ThreadPoolExecutor(max_workers=num_devices) as executor:
+ for device_id in range(num_devices):
future = executor.submit(
_nr_rks_task,
ni, mol, grids, xc_code, dms, mo_coeff, mo_occ,
@@ -914,8 +914,8 @@ def nr_uks(ni, mol, grids, xc_code, dms, relativity=0, hermi=1,
release_gpu_stack()
cupy.cuda.get_current_stream().synchronize()
futures = []
- with ThreadPoolExecutor(max_workers=_num_devices) as executor:
- for device_id in range(_num_devices):
+ with ThreadPoolExecutor(max_workers=num_devices) as executor:
+ for device_id in range(num_devices):
future = executor.submit(
_nr_uks_task,
ni, mol, grids, xc_code, (dma,dmb), mo_coeff, mo_occ,
@@ -1026,7 +1026,7 @@ def _nr_rks_fxc_task(ni, mol, grids, xc_code, fxc, dms, mo1, occ_coeff,
ao_deriv = 1
ngrids_glob = grids.coords.shape[0]
- ngrids_per_device = (ngrids_glob + _num_devices - 1) // _num_devices
+ ngrids_per_device = (ngrids_glob + num_devices - 1) // num_devices
ngrids_per_device = (ngrids_per_device + MIN_BLK_SIZE - 1) // MIN_BLK_SIZE * MIN_BLK_SIZE
grid_start = min(device_id * ngrids_per_device, ngrids_glob)
grid_end = min((device_id + 1) * ngrids_per_device, ngrids_glob)
@@ -1108,8 +1108,8 @@ def nr_rks_fxc(ni, mol, grids, xc_code, dm0=None, dms=None, relativity=0, hermi=
futures = []
cupy.cuda.get_current_stream().synchronize()
- with ThreadPoolExecutor(max_workers=_num_devices) as executor:
- for device_id in range(_num_devices):
+ with ThreadPoolExecutor(max_workers=num_devices) as executor:
+ for device_id in range(num_devices):
future = executor.submit(
_nr_rks_fxc_task,
ni, mol, grids, xc_code, fxc, dms, mo1, occ_coeff,
@@ -1178,7 +1178,7 @@ def _nr_uks_fxc_task(ni, mol, grids, xc_code, fxc, dms, mo1, occ_coeff,
ao_deriv = 1
ngrids_glob = grids.coords.shape[0]
- ngrids_per_device = (ngrids_glob + _num_devices - 1) // _num_devices
+ ngrids_per_device = (ngrids_glob + num_devices - 1) // num_devices
ngrids_per_device = (ngrids_per_device + MIN_BLK_SIZE - 1) // MIN_BLK_SIZE * MIN_BLK_SIZE
grid_start = min(device_id * ngrids_per_device, ngrids_glob)
grid_end = min((device_id + 1) * ngrids_per_device, ngrids_glob)
@@ -1277,8 +1277,8 @@ def nr_uks_fxc(ni, mol, grids, xc_code, dm0=None, dms=None, relativity=0, hermi=
futures = []
cupy.cuda.get_current_stream().synchronize()
- with ThreadPoolExecutor(max_workers=_num_devices) as executor:
- for device_id in range(_num_devices):
+ with ThreadPoolExecutor(max_workers=num_devices) as executor:
+ for device_id in range(num_devices):
future = executor.submit(
_nr_uks_fxc_task,
ni, mol, grids, xc_code, fxc, (dma, dmb), mo1, occ_coeff,
diff --git a/gpu4pyscf/dft/uks.py b/gpu4pyscf/dft/uks.py
index 5e11bb81..4d561e62 100644
--- a/gpu4pyscf/dft/uks.py
+++ b/gpu4pyscf/dft/uks.py
@@ -16,7 +16,7 @@
from pyscf.dft import uks as uks_cpu
from pyscf import lib
from gpu4pyscf.lib import logger
-from gpu4pyscf.dft import numint, gen_grid, rks
+from gpu4pyscf.dft import rks
from gpu4pyscf.scf import hf, uhf
from gpu4pyscf.lib.cupy_helper import tag_array
from gpu4pyscf.lib import utils
diff --git a/gpu4pyscf/drivers/basis_vDZP_NWCHEM.dat b/gpu4pyscf/drivers/basis_vDZP_NWCHEM.dat
new file mode 100644
index 00000000..1fc10e1e
--- /dev/null
+++ b/gpu4pyscf/drivers/basis_vDZP_NWCHEM.dat
@@ -0,0 +1,2310 @@
+BASIS "ao basis" PRINT
+#BASIS SET:
+H S
+ 81.886780875039 0.008423954179
+ 12.231063861388 0.064861285350
+ 2.786815144183 0.311400883616
+ 0.775786677408 0.985308081721
+ 0.223433692783 1.256819962883
+H S
+ 0.331097483644 0.052292300794
+ 0.107455350812 0.104139302794
+ 0.050680508365 0.245115714360
+H P
+ 1.417043684193 0.759765848611
+ 0.290781406697 1.522844626098
+
+#BASIS SET:
+He S
+ 248.304359266256 0.005013791761
+ 39.257359859983 0.034983701525
+ 9.290242872987 0.162973195617
+ 2.650678948299 0.489691016373
+ 0.811596267579 0.932899350713
+He S
+ 0.268607161928 0.222877468588
+ 0.345025805948 0.062111112842
+ 0.102007122911 0.091342429226
+He P
+ 1.310041712606 0.696987984442
+ 0.265008725379 0.178488628760
+
+#BASIS SET:
+Li S
+ 261.504397395816 0.007995837268
+ 39.435060612595 0.058753345768
+ 8.903002628902 0.251012919032
+ 2.312128536738 0.601136930396
+ 0.673560740522 0.478266124634
+Li S
+ 0.637937919385 -0.146472924115
+ 0.063047215665 0.821661055674
+ 0.020134930908 0.034773514803
+Li S
+ 0.029187683675 1.062645883498
+ 0.014762824299 1.095509715610
+Li P
+ 1.607685808951 0.088808339206
+ 0.261313614186 0.424066699835
+ 0.078704441731 0.737998244728
+Li D
+ 0.248573704371 0.694743515137
+ 0.088415849082 0.788715353455
+
+#BASIS SET:
+Be S
+ 510.032398065642 0.005805184625
+ 76.405556797429 0.043342046125
+ 17.255250508893 0.188908143114
+ 4.586010018287 0.469633572220
+ 1.356430512009 0.395896576117
+Be S
+ 1.628923371459 -0.134493067135
+ 0.158484974791 0.944737331855
+ 0.063902892456 0.171032012077
+Be S
+ 0.056984898840 0.893658825536
+ 0.029154653875 1.463000499223
+Be P
+ 2.658894134528 0.025302164759
+ 0.449379800839 0.144909124946
+ 0.113066880133 0.200552018717
+Be D
+ 0.458082080027 0.881395926545
+ 0.118291023217 0.993425036644
+
+#BASIS SET:
+B S
+ 1.548610750968 -0.375453530717
+ 1.230872527218 0.296383194735
+ 0.288936160855 0.400281132411
+ 0.105945534640 0.252339650517
+B S
+ 0.149430266146 0.631907262941
+ 0.046078378647 0.717819753504
+B P
+ 6.824625642150 0.032055054798
+ 1.786841467886 0.168629590401
+ 0.529445685476 0.508222453209
+ 0.171288703178 0.449200455124
+B P
+ 0.137855130395 1.122762891963
+ 0.045638926887 1.688087920718
+B D
+ 0.685952928321 1.202317712080
+ 0.217177410256 1.378136829807
+
+#BASIS SET:
+C S
+ 2.174987790335 -0.276058910823
+ 1.852226946510 0.240154751118
+ 0.459098487623 0.191124862153
+ 0.169128986465 0.087294467451
+C S
+ 0.227455022611 0.987938126698
+ 0.086918590957 1.016650788977
+C P
+ 12.870085333684 0.024741438180
+ 3.295443583046 0.152192079858
+ 0.976645717756 0.448674298135
+ 0.348425197118 0.494756938315
+C P
+ 0.146489481077 1.594415811311
+ 0.063733914289 0.807859142831
+C D
+ 1.025317925428 0.833187865353
+ 0.250249064514 0.858263567336
+
+#BASIS SET:
+N S
+ 2.735378582931 -0.239593796561
+ 2.219800598729 0.224989693361
+ 0.592708671545 0.194517761229
+ 0.227406999886 0.098125575699
+N S
+ 0.280435456826 0.853515477480
+ 0.105373393767 1.130727240507
+N P
+ 19.104528476156 0.011792637499
+ 4.831013846305 0.072900287221
+ 1.454969505940 0.205542762778
+ 0.494779046501 0.258358206618
+N P
+ 0.178530326117 1.242146697946
+ 0.068618652214 0.429051028293
+N D
+ 1.230812954761 1.261894149963
+ 0.370851969193 1.181659566926
+
+#BASIS SET:
+O S
+ 3.543641820586 -0.263554292988
+ 2.712717475497 0.265678831423
+ 0.683184525058 0.279087292612
+ 0.255976739185 0.112868556239
+O S
+ 0.286500025367 1.027594577324
+ 0.123012019465 1.159166516125
+O P
+ 27.003841820377 0.012470154545
+ 6.762654966183 0.074520776234
+ 2.011704598628 0.210079890981
+ 0.634428348949 0.291319728722
+O P
+ 0.199067660399 1.512413659527
+ 0.065643622039 0.312634979070
+O D
+ 1.366109340367 0.858294230859
+ 0.461931641275 0.846034666146
+
+#BASIS SET:
+F S
+ 7.034861151373 -0.446046417537
+ 4.872932295917 0.241120382569
+ 1.330878323026 0.666593307204
+ 0.525469722669 0.781917161284
+F S
+ 0.821266047563 0.135544243991
+ 0.198341185022 0.993554010066
+F P
+ 35.526100039768 0.026818249901
+ 9.093570542569 0.133596681058
+ 2.685413226386 0.376326355144
+ 0.833678395080 0.516681452384
+F P
+ 0.262279770686 0.931221931175
+ 0.082396241151 0.200669872602
+F D
+ 1.454912451083 1.369428727075
+ 0.446498292031 1.466564035452
+
+#BASIS SET:
+Ne S
+ 8.511025142406 -0.346407731370
+ 6.944709652311 0.242428346755
+ 1.561041650912 0.456810607744
+ 0.599496632788 0.413991739819
+Ne S
+ 0.609511035171 0.175668695012
+ 0.245683034081 0.854892929924
+Ne P
+ 56.111190785374 0.018617197509
+ 14.542058717068 0.119964511456
+ 4.468992837375 0.351570092625
+ 1.504786372395 0.552758556945
+Ne P
+ 0.498808346618 0.959617655183
+ 0.156283251576 0.293739815995
+Ne D
+ 1.678877049768 0.562501368964
+ 0.416060290014 1.450686746125
+
+#BASIS SET:
+Na S
+ 17.859469979814 -0.094639395365
+ 1.741589491951 0.454368094759
+ 0.671216662419 0.326110219634
+ 0.356748679884 0.099733348288
+Na S
+ 1.040532238619 1.128928459961
+ 0.363586481174 0.971270254416
+ 0.030788669980 0.725021277866
+Na S
+ 0.063388594723 0.461074706312
+ 0.027886589957 0.865217967753
+Na P
+ 94.408900907741 0.022285520404
+ 22.833035343911 0.141094814336
+ 7.392308589988 0.456452935832
+ 2.700481523814 0.811931701139
+ 0.980041469729 0.865637159210
+Na P
+ 2.667291528209 -0.005824466879
+ 0.336960843152 0.524769342105
+ 0.055297986191 0.011767234747
+Na D
+ 0.317736281462 0.216305744638
+ 0.050399448912 1.084520528356
+
+#BASIS SET:
+Mg S
+ 21.746078000863 -0.107712415417
+ 2.054693963682 0.605039945129
+ 0.775378201881 0.358253218691
+ 0.646755053402 0.124456587448
+Mg S
+ 1.518035874065 0.953325303523
+ 0.626648472409 0.885052846709
+ 0.061342469056 0.663200597406
+Mg S
+ 0.114525781742 1.102821153701
+ 0.049143798668 1.262827197035
+Mg P
+ 98.149921577771 0.020030532297
+ 22.986901774625 0.130645961601
+ 7.073704413314 0.416305903439
+ 2.357463360333 0.664621574936
+ 0.776737081590 0.460953274264
+Mg P
+ 3.771301810954 -0.008032934562
+ 0.189166874642 0.340369688597
+ 0.068419879293 0.260127856330
+Mg D
+ 0.401806625040 0.846622892966
+ 0.102441134001 1.533893458570
+
+#BASIS SET:
+Al S
+ 3.307057950862 0.035887679384
+ 1.064990666611 -0.301434323135
+ 0.217094967607 0.408003674147
+ 0.103399050795 0.504944565236
+Al S
+ 0.091326549223 0.267250123332
+ 0.040384684710 0.424093785538
+Al P
+ 0.966979219663 -0.093593510129
+ 0.418602698432 0.186041911562
+ 0.179016462009 0.364571859325
+ 0.092774532026 0.270083093687
+Al P
+ 0.101275278069 0.562522659034
+ 0.032903691927 0.829515939897
+Al D
+ 0.653520505846 0.545097705481
+ 0.177964130832 1.149228250111
+
+#BASIS SET:
+Si S
+ 4.206061763818 0.036046100379
+ 1.379029753670 -0.278621659842
+ 0.265740193555 0.514032304554
+ 0.112502697537 0.365777551979
+Si S
+ 0.133673616794 0.251860678646
+ 0.048297859440 1.001912217231
+Si P
+ 1.367505459412 -0.045311794170
+ 0.391314943212 0.288581385277
+ 0.158228930721 0.372264544458
+ 0.068937667225 0.100776575609
+Si P
+ 0.117817448223 0.511784304415
+ 0.041067655925 0.790699182240
+Si D
+ 1.229803285857 0.317490526212
+ 0.304062845855 1.160802102368
+
+#BASIS SET:
+P S
+ 7.492690912960 0.036211256938
+ 1.520979692504 -0.410346524622
+ 0.389986469517 0.590431160497
+ 0.185944246896 0.700026469133
+P S
+ 0.426277431342 0.066946209604
+ 0.075641155401 0.662868977638
+P P
+ 1.182449802418 -0.070708334586
+ 0.716815122233 0.179798137437
+ 0.284356002842 0.347185537499
+ 0.119844662929 0.186156103439
+P P
+ 0.104174736899 0.642173678179
+ 0.042996540650 0.592979489332
+P D
+ 1.695162960423 0.257470657436
+ 0.351822528642 1.089094143136
+
+#BASIS SET:
+S S
+ 7.601355269472 0.046444664335
+ 1.927121218723 -0.464421807155
+ 0.521456307465 0.575890670867
+ 0.230532240510 0.826584502527
+S S
+ 0.470005347879 0.136072106019
+ 0.083469210422 1.019121415962
+S P
+ 1.711639257837 -0.172150677616
+ 1.157276160172 0.256510381181
+ 0.449727077619 0.514122297269
+ 0.195290728699 0.449898131647
+S P
+ 0.121042410856 0.700057493662
+ 0.061043915320 0.824465481915
+S D
+ 1.777751118268 0.301983328801
+ 0.404467069554 1.098792945928
+
+#BASIS SET:
+Cl S
+ 14.151131413181 0.017603655830
+ 2.057697743755 -0.379080686343
+ 0.825720874850 0.374023499125
+ 0.302442901804 0.721003299953
+Cl S
+ 0.514888452515 0.170089201402
+ 0.098706322995 0.945113577067
+Cl P
+ 2.844174271253 -0.135762495160
+ 1.377515628579 0.223435753709
+ 0.619123933124 0.728151126508
+ 0.261922095680 0.819017057903
+Cl P
+ 0.126421788724 0.990717918733
+ 0.073988046008 0.696607999630
+Cl D
+ 1.922210814739 0.588629611664
+ 0.428318736672 1.937851927117
+
+#BASIS SET:
+Ar S
+ 14.940101122322 0.011521861228
+ 2.243165636285 -0.558968080854
+ 1.438970926578 0.473342341500
+ 0.406876486290 0.631293265007
+Ar S
+ 0.440259652763 0.069249308310
+ 0.161875873542 1.603023542921
+Ar P
+ 3.165484511165 -0.077261666511
+ 1.111111719803 0.471181733030
+ 0.414649437581 0.751059102315
+ 0.162616494311 0.223201162944
+Ar P
+ 0.154347795709 1.472564774171
+ 0.071270891501 0.581126940969
+Ar D
+ 1.467813037801 0.644461354176
+ 0.399396403391 1.401420583174
+
+#BASIS SET:
+K S
+ 2.712682131864 -0.138120511191
+ 0.959527159237 0.183647787140
+ 0.415861396286 0.054504151428
+ 0.222428317820 0.079993072198
+K S
+ 0.745830108700 -0.013684884521
+ 0.447736522029 0.507569624842
+ 0.027242967193 0.912536205404
+K S
+ 0.032660680703 0.739672700474
+ 0.024351810227 1.104288980008
+K P
+ 10.065499144539 -0.038491537428
+ 0.974019481043 0.674332393862
+ 0.374596372255 0.612538235871
+ 0.156921520903 0.172520943233
+K P
+ 4.042012088018 -0.007059823100
+ 0.087259092608 0.077806374422
+ 0.054714257801 0.688537533778
+K D
+ 0.866638592894 0.502040108967
+ 0.157085997854 1.055271878444
+
+#BASIS SET:
+Ca S
+ 2.719301351535 -0.078978644821
+ 1.883275651368 0.187115863443
+ 0.642163903598 0.589578641721
+ 0.336487867317 0.370804234632
+Ca S
+ 3.026324320338 -0.233621817613
+ 1.242283226509 0.166011018416
+ 0.053488348283 0.035851055675
+Ca S
+ 0.058373367494 1.094426980400
+ 0.029921642326 0.431156605007
+Ca P
+ 10.411513039603 -0.021761105222
+ 1.181201852942 0.307819154635
+ 0.472806545197 0.269256985339
+ 0.206363671314 0.068406701922
+Ca P
+ 1.103963525530 -0.009366778560
+ 0.092210626014 0.588635174998
+ 0.007655059125 0.028655018892
+Ca D
+ 1.466572349546 0.333010877274
+ 0.261547289431 0.740339846733
+
+#BASIS SET:
+Sc S
+ 12.894521140222 0.079236996463
+ 5.071686274181 -0.458078625675
+ 1.167181899577 0.336412876746
+ 0.663942013228 0.332736377800
+ 0.309654579707 0.211456915306
+Sc S
+ 1.040885767796 0.463978291008
+ 0.568097001649 0.435148577227
+Sc S
+ 0.070617692779 1.482515914675
+ 0.027248406219 0.639348111708
+Sc P
+ 6.368578496263 -0.093545873727
+ 1.949004097442 0.341238050163
+ 0.815301694561 0.601180009390
+ 0.313815959827 0.269603495756
+Sc P
+ 0.037710430517 0.936037163888
+ 0.026841704778 1.288316160280
+Sc D
+ 11.687225911271 0.048657783353
+ 3.314758379726 0.202225028823
+ 1.046336738397 0.396787231764
+ 0.313968073144 0.458512682004
+Sc D
+ 0.096369535224 0.705041436498
+ 0.031054056783 0.578360893303
+
+#BASIS SET:
+Ti S
+ 16.461347886752 0.082016774935
+ 5.371386815523 -0.650480440851
+ 1.448985756914 0.538706537935
+ 0.638919559348 0.450440953157
+ 0.241619151339 0.154995866347
+Ti S
+ 1.051736442516 0.627756794746
+ 0.589928288257 0.487164574898
+Ti S
+ 0.075250745810 1.663780527189
+ 0.026602025230 0.399689063950
+Ti P
+ 7.122429655502 -0.037873691723
+ 2.383153836626 0.114975280753
+ 1.028878547766 0.221233308088
+ 0.416069961536 0.117542200112
+Ti P
+ 0.179440719634 1.269893747125
+ 0.076704284265 0.585246578780
+Ti D
+ 16.285523097231 0.057717359677
+ 4.903186429987 0.253388428709
+ 1.644289014934 0.545862699555
+ 0.525281547715 0.674601799858
+Ti D
+ 0.170096639197 1.166232841900
+ 0.064103525267 0.622642131383
+
+#BASIS SET:
+V S
+ 16.255319265212 0.094023708983
+ 6.405063127696 -0.537187056948
+ 1.457163624962 0.520258926643
+ 0.499926424667 0.122520741372
+ 0.301185417021 0.072736674208
+V S
+ 1.012605416044 1.098550324678
+ 0.547886952925 0.607303361815
+V S
+ 0.090933679299 1.595189552198
+ 0.036744292616 0.948508696780
+V P
+ 8.103098572263 -0.081234318416
+ 2.629836559131 0.267004275831
+ 1.112202390392 0.485982458150
+ 0.429812510937 0.238017396511
+V P
+ 0.134170344208 0.865319211565
+ 0.045541731704 1.001512124222
+V D
+ 20.991852147597 0.047940709014
+ 6.482973795152 0.218068368478
+ 2.259607977658 0.483008624822
+ 0.787388461330 0.596005568614
+V D
+ 0.266299635377 0.873322009295
+ 0.085848775242 0.412455368037
+
+#BASIS SET:
+Cr S
+ 19.326807872923 0.115928746519
+ 6.895352080094 -0.789981589302
+ 1.869640267141 0.563707338990
+ 0.843806445546 0.445197267976
+ 0.311586877483 0.070648620143
+Cr S
+ 1.226144167110 1.430412693717
+ 0.536784361672 0.817200561042
+Cr S
+ 0.103687455191 1.278362236512
+ 0.040453943570 0.998838137196
+Cr P
+ 9.274973304564 -0.067626882136
+ 2.834803217107 0.248491248438
+ 1.188671904758 0.418309250990
+ 0.452672452476 0.190643237960
+Cr P
+ 0.099943441911 0.572805817456
+ 0.020690649882 0.829846109272
+Cr D
+ 25.343518443315 0.043409250189
+ 7.837194674080 0.208571751452
+ 2.703730990740 0.481280315878
+ 0.923329332261 0.600167729446
+Cr D
+ 0.301729037231 1.148719306493
+ 0.095659118228 0.610428134382
+
+#BASIS SET:
+Mn S
+ 20.843228079998 0.103402377154
+ 7.760419419145 -0.656141637759
+ 1.904516413545 0.665798788078
+ 1.001320433899 0.396522428503
+ 0.481937472515 0.292024154733
+Mn S
+ 1.268268420404 1.565497038368
+ 0.857241213876 1.201323188808
+Mn S
+ 0.109140028793 1.457741873878
+ 0.045318992446 1.357680548185
+Mn P
+ 10.512436868143 -0.097556999181
+ 3.099073135769 0.375169172686
+ 1.322948831985 0.597227851646
+ 0.527055310067 0.279726392464
+Mn P
+ 0.344507060257 0.623790512345
+ 0.118361361298 1.250852309631
+Mn D
+ 31.818227456997 0.047047726486
+ 10.037421666655 0.238358793520
+ 3.534516417893 0.579576958050
+ 1.249761773374 0.755472666211
+Mn D
+ 0.419663812162 1.107456649217
+ 0.131489658435 0.625907283697
+
+#BASIS SET:
+Fe S
+ 22.358655151183 0.092871540367
+ 8.726722256050 -0.543175687899
+ 1.883511346045 0.788008191373
+ 0.898098829763 0.506118339909
+ 0.499141495354 0.151467937894
+Fe S
+ 1.414672733116 1.048090878533
+ 0.838969961703 0.701944708617
+Fe S
+ 0.126967189900 1.110584663987
+ 0.048297686989 0.960183114424
+Fe P
+ 11.737335796504 -0.087457071524
+ 3.491740558354 0.321194930339
+ 1.519670314056 0.525914230497
+ 0.612230245037 0.263690954761
+Fe P
+ 0.270740512750 0.802890646298
+ 0.110956520505 0.614971889314
+Fe D
+ 37.602436513551 0.035788062793
+ 11.980469021430 0.185256836504
+ 4.315862847956 0.455518034427
+ 1.583817661861 0.610355600220
+Fe D
+ 0.548357256279 1.544966076786
+ 0.171477360605 0.810476849978
+
+#BASIS SET:
+Co S
+ 25.335315694476 0.087766855961
+ 9.488355054778 -0.550692716629
+ 2.105770866101 0.731208569241
+ 1.152741563097 0.435591231225
+ 0.600134651908 0.260972326852
+Co S
+ 1.728593048399 0.743738752336
+ 0.965135408786 0.944149972951
+Co S
+ 0.138217537887 1.152256178711
+ 0.048799619525 1.322788514015
+Co P
+ 13.019846825599 -0.106687768822
+ 3.898569128239 0.379287211199
+ 1.691912041740 0.646070467691
+ 0.663194327000 0.321080022842
+Co P
+ 0.266233736271 0.516713551942
+ 0.087682602286 0.610544822757
+Co D
+ 46.467186502091 0.036738936657
+ 14.844727962964 0.206400986919
+ 5.324781910313 0.545964703962
+ 1.946193019964 0.772313951631
+Co D
+ 0.663697766556 1.764841258136
+ 0.197538703092 0.981959446237
+
+#BASIS SET:
+Ni S
+ 28.117307328944 0.095054204041
+ 10.323114146207 -0.619352378957
+ 2.266501334743 0.903046067553
+ 1.031698338004 0.572669971934
+ 0.429763116477 0.107936112138
+Ni S
+ 1.585096116449 0.536471821182
+ 0.927688425987 0.466470527797
+Ni S
+ 0.142723981700 1.052935132680
+ 0.052288562321 0.852229071228
+Ni P
+ 14.649263427379 -0.123572234826
+ 4.487248555279 0.373049398485
+ 1.926522201916 0.666618694496
+ 0.755571182364 0.334357692295
+Ni P
+ 0.320923338030 1.109796900506
+ 0.124597438762 1.063986125250
+Ni D
+ 55.795596873330 0.032016254993
+ 17.671713442822 0.206558524336
+ 6.458884370599 0.576880736568
+ 2.375449580314 0.853632978049
+Ni D
+ 0.808597906774 1.577004983294
+ 0.239256209431 0.904057266556
+
+#BASIS SET:
+Cu S
+ 30.095654464774 0.088111002450
+ 11.189465415118 -0.566864618289
+ 2.541452118390 0.690601613017
+ 1.290510764029 0.526642987015
+ 0.608961815658 0.132227842362
+Cu S
+ 2.018430478515 0.946194744344
+ 0.849138974882 0.961852815883
+Cu S
+ 0.155609251028 1.205505591180
+ 0.055100263916 1.040275581217
+Cu P
+ 16.197916896934 -0.105945973220
+ 4.487908313373 0.425417913649
+ 1.928366077649 0.642632779569
+ 0.763943236555 0.292317921669
+Cu P
+ 0.299840227632 1.292856821394
+ 0.116773403393 1.061205571547
+Cu D
+ 60.487297938910 0.032026327350
+ 19.233046405660 0.186974917418
+ 6.965090689821 0.519387752985
+ 2.568903372512 0.762425819652
+Cu D
+ 0.875809628134 0.877877663513
+ 0.257690509171 0.512991728701
+
+#BASIS SET:
+Zn S
+ 33.874473714375 0.071277616937
+ 11.972622956367 -0.506758729754
+ 2.961714395977 0.493378998699
+ 1.829981912490 0.307312140366
+ 0.870554735732 0.249108397868
+Zn S
+ 1.739203860675 1.189126584032
+ 0.784958106810 0.340497149689
+Zn S
+ 0.173777482831 1.056166866705
+ 0.058936157612 0.875584102608
+Zn P
+ 18.428852736536 -0.072783254370
+ 4.757828516192 0.320487006186
+ 2.051504911103 0.464955123397
+ 0.811290090160 0.202072427695
+Zn P
+ 0.280329527429 0.290680713435
+ 0.074967629409 0.340832692654
+Zn D
+ 69.345783920227 0.038894475768
+ 22.344643463037 0.225936760009
+ 8.230073746758 0.625199117295
+ 3.129607637648 0.942430315944
+Zn D
+ 1.115445344238 1.300039063947
+ 0.350438433853 0.725428578056
+
+#BASIS SET:
+Ga S
+ 3.461851099712 0.086596634698
+ 1.673374271825 -0.270587362691
+ 0.211318368014 0.351819394013
+ 0.097828153295 0.285416686844
+Ga S
+ 0.151627010733 0.297902650076
+ 0.055604376270 0.811459212326
+Ga P
+ 1.224072196918 -0.142334247355
+ 0.570340687021 0.156300957280
+ 0.176342181917 0.414233473261
+ 0.069338127059 0.187202564348
+Ga P
+ 0.103202148668 0.781680200651
+ 0.032521362068 1.139904102344
+Ga D
+ 0.434136758705 0.630724293037
+ 0.136436950793 1.735376440057
+
+#BASIS SET:
+Ge S
+ 3.217496101688 0.220957013438
+ 1.918166572642 -0.511683753720
+ 0.214903864260 0.831622355013
+ 0.067589930320 0.205018958674
+Ge S
+ 0.465696258761 0.242715316187
+ 0.060174025120 1.570350456114
+Ge P
+ 3.914179557047 0.071448877544
+ 2.346468630555 -0.196014253790
+ 0.273861111944 0.722477976627
+ 0.105990071273 0.319809081571
+Ge P
+ 0.202135065169 0.219317596243
+ 0.061214687117 1.973742824912
+Ge D
+ 0.370009633120 1.071258503403
+ 0.132976201398 1.060360506723
+
+#BASIS SET:
+As S
+ 3.507693642950 0.158031853980
+ 1.888383025988 -0.490805795974
+ 0.288111240456 0.729540893159
+ 0.143180088068 0.326673579759
+As S
+ 0.352438514226 0.168792374132
+ 0.073126498505 1.058779710836
+As P
+ 1.358255409051 -0.165061038563
+ 0.934647702814 0.154119568450
+ 0.292654437651 0.269752770917
+ 0.122927318606 0.174509228238
+As P
+ 0.118767537140 0.878680997698
+ 0.047605103697 1.162342892188
+As D
+ 0.314487938869 0.831547539129
+ 0.218493965228 1.033423867748
+
+#BASIS SET:
+Se S
+ 3.650865730594 0.231457444968
+ 2.210903609188 -0.563533855155
+ 0.335170928150 0.729862073845
+ 0.151638836734 0.315657165245
+Se S
+ 0.599638822173 0.126025157342
+ 0.075582134264 1.069252579041
+Se P
+ 1.507148257439 -0.162605422408
+ 0.913367414636 0.183499918283
+ 0.341221978698 0.372235321531
+ 0.150432587271 0.271959003606
+Se P
+ 0.157082854490 0.530816305712
+ 0.061272426885 1.598391183858
+Se D
+ 0.367597502630 1.060909735181
+ 0.223013187434 0.893014584292
+
+#BASIS SET:
+Br S
+ 4.214014635008 0.165692028470
+ 2.412188698594 -0.476877193218
+ 0.391067027010 0.710299602030
+ 0.164674252704 0.362093036361
+Br S
+ 0.485599664697 0.145825877593
+ 0.074598994605 0.768087358690
+Br P
+ 1.789947253144 -0.207356828201
+ 1.292036187413 0.186107969204
+ 0.449383252160 0.332524333195
+ 0.194590167329 0.303778721054
+Br P
+ 0.122759759967 0.804134734196
+ 0.066738625731 1.319919526700
+Br D
+ 0.442438146643 1.158672768266
+ 0.245849018795 1.021366227819
+
+#BASIS SET:
+Kr S
+ 3.986511440636 0.290559957024
+ 2.943901744792 -0.516111029924
+ 0.429901866727 0.564971586700
+ 0.151821838900 0.272351694911
+Kr S
+ 0.334711812968 0.193054210335
+ 0.111745168911 0.897546361771
+Kr P
+ 2.153957686193 -0.178565423896
+ 1.283748685547 0.172715443611
+ 0.501685561731 0.400700715929
+ 0.237520267977 0.316528198613
+Kr P
+ 0.140113161172 0.940357138145
+ 0.089204908642 0.846922224775
+Kr D
+ 0.526301285310 1.174418328612
+ 0.203100266787 0.975302884833
+
+#BASIS SET:
+Rb S
+ 3.869999599846 0.122319888582
+ 2.130904636104 -0.666846831080
+ 0.738002026783 0.418046906087
+ 0.409528742839 0.516121898223
+Rb S
+ 1.350580726758 0.441228993164
+ 0.672123461407 -0.180183191143
+ 0.215026040193 0.940988897210
+Rb S
+ 0.041727573698 0.194823232373
+ 0.018173085367 0.240784908108
+Rb P
+ 2.767410417511 -0.100991461056
+ 0.690279272515 0.523102112330
+ 0.272134686488 0.509586156257
+ 0.099861314223 0.105281356899
+Rb P
+ 0.483868425918 -0.021967319393
+ 0.064990339093 0.192198563244
+ 0.026344167036 0.254778803087
+Rb D
+ 0.482952908592 0.814918945654
+ 0.118599680126 1.160272481701
+
+#BASIS SET:
+Sr S
+ 1.692706637459 -0.393341748067
+ 1.149795785772 0.208407810148
+ 0.669131477932 0.410560473766
+ 0.248220076725 0.223568940036
+Sr S
+ 1.886266511407 0.403235193588
+ 0.400164390156 -0.077244774899
+ 0.087928617203 0.264020515846
+Sr S
+ 0.059443196810 1.173409533858
+ 0.028584686102 1.214114711178
+Sr P
+ 2.806647699305 -0.059210927424
+ 0.819376524702 0.263531483008
+ 0.348158267797 0.253839198083
+ 0.146003646962 0.054726018823
+Sr P
+ 0.935150498254 -0.009917943921
+ 0.087786356974 0.731454921275
+ 0.045420993133 0.227916283507
+Sr D
+ 0.688406352900 0.490903537712
+ 0.192704647015 0.763662684452
+
+#BASIS SET:
+Y S
+ 7.296549891946 0.195341789784
+ 2.688422185597 -1.838359171848
+ 2.177642572388 1.023950898895
+ 0.659090683583 0.677612516206
+ 0.318000313098 0.286803253760
+Y S
+ 0.674812957467 0.585665285488
+ 0.290353362618 0.292072052074
+Y S
+ 0.061667823748 1.471139784374
+ 0.025673512214 0.434089285930
+Y P
+ 2.413073905893 -0.896109901350
+ 1.991118335445 0.920914883211
+ 0.644885028991 0.747249800994
+ 0.262158149515 0.358967578245
+Y P
+ 0.090251834443 1.587694599781
+ 0.038734540875 1.124989169808
+Y D
+ 2.640667082288 -0.051410329462
+ 1.377231693280 0.259691609738
+ 0.521170303505 0.647670134977
+ 0.191317945677 0.722772032171
+Y D
+ 0.066561171952 0.635575534061
+ 0.021058714013 0.213198852821
+
+#BASIS SET:
+Zr S
+ 7.636755459435 0.225594387816
+ 3.059460604715 -2.021770934596
+ 2.734289469745 1.043987761524
+ 0.974030529831 0.649481513646
+ 0.321237768185 0.271978542754
+Zr S
+ 0.666118313328 0.610098372668
+ 0.382004929999 0.279366801720
+Zr S
+ 0.078638985037 1.573000069276
+ 0.030977707836 0.591169478729
+Zr P
+ 2.635304536716 -0.851203628339
+ 2.139701672965 0.892120469003
+ 0.701270261887 0.710477753317
+ 0.294117575826 0.292528213505
+Zr P
+ 0.153963461038 1.438560009408
+ 0.088262101191 0.995725387494
+Zr D
+ 3.201047577661 -0.038892498436
+ 1.485048727987 0.275692468196
+ 0.559574253517 0.646436509544
+ 0.196618868868 0.666536113423
+Zr D
+ 0.062451815401 0.649948004351
+ 0.020325928756 0.194809845607
+
+#BASIS SET:
+Nb S
+ 8.226907672370 0.173869214563
+ 5.258853938239 0.400716421611
+ 4.024909003000 -1.292442883143
+ 1.195129963229 0.474286872382
+ 0.359665734464 0.272918985707
+Nb S
+ 0.780972111614 0.609456624827
+ 0.468149789914 0.309910528963
+Nb S
+ 0.084330299999 1.270848051253
+ 0.034918666624 0.591189631131
+Nb P
+ 2.871351590001 -0.873217473927
+ 2.280935596682 0.940543482261
+ 0.743551490746 0.760237258354
+ 0.298069250267 0.269556485513
+Nb P
+ 0.152595605504 1.401687661947
+ 0.068327454661 1.010582696646
+Nb D
+ 3.367880306345 -0.051672605872
+ 1.756564781302 0.284516251789
+ 0.658735017693 0.664232933582
+ 0.231059396397 0.658825321794
+Nb D
+ 0.072216226374 0.696966749733
+ 0.024801099920 0.196624147646
+
+#BASIS SET:
+Mo S
+ 9.673752512870 0.293639539791
+ 3.572282146733 -1.857614871108
+ 1.530849754894 1.224864128516
+ 0.631996839839 0.660456106153
+ 0.288839329473 0.296153352239
+Mo S
+ 0.778417996936 0.603632414429
+ 0.534172652813 0.301871306040
+Mo S
+ 0.087626525151 1.317087783672
+ 0.033562681859 0.580743179363
+Mo P
+ 3.093707673366 -0.826631577474
+ 2.403641873352 0.900674860790
+ 0.818240314903 0.711828517694
+ 0.336205879987 0.269489929318
+Mo P
+ 0.141936162003 1.303129589177
+ 0.051142057872 1.262881655764
+Mo D
+ 4.248706637709 -0.056047421983
+ 2.043076701133 0.260482078879
+ 0.819219599195 0.645891778457
+ 0.300623287176 0.680758170713
+Mo D
+ 0.099924939859 0.934167552489
+ 0.033837585352 0.225962855556
+
+#BASIS SET:
+Tc S
+ 8.734488231535 0.191829141563
+ 3.835899095068 -2.105775116524
+ 3.469751744685 1.518466647586
+ 0.967124516165 0.732292685600
+ 0.424182603993 0.274139633354
+Tc S
+ 1.003924527229 0.601613332020
+ 0.543013116325 0.295145009859
+Tc S
+ 0.088094236993 1.301256535828
+ 0.033682774860 0.537823653934
+Tc P
+ 3.405023984944 -0.854982996734
+ 2.564265074003 0.959319009575
+ 0.863581625172 0.828846615206
+ 0.337411029995 0.255144526070
+Tc P
+ 0.091795517667 1.101466987783
+ 0.029879282774 1.937115484620
+Tc D
+ 5.097732110758 -0.054667679941
+ 2.187138333162 0.264182410092
+ 0.976624475913 0.623831001407
+ 0.396051934959 0.663361650807
+Tc D
+ 0.148110040542 1.029802355721
+ 0.059688691595 0.261393560258
+
+#BASIS SET:
+Ru S
+ 9.696500481430 0.176368995944
+ 3.986490207677 -1.981311830052
+ 3.507270554217 1.413770781341
+ 1.044398638037 0.657254090853
+ 0.448751919987 0.252393374138
+Ru S
+ 1.064403045557 0.566761648709
+ 0.627779982819 0.302811587640
+Ru S
+ 0.101793183953 1.334893817716
+ 0.034776411982 0.605967167434
+Ru P
+ 3.763404295092 -0.859444471047
+ 2.770872457961 0.955526231776
+ 0.988572431005 0.893940245276
+ 0.408597854458 0.334564018189
+Ru P
+ 0.187687004290 1.566749185235
+ 0.051504739627 0.913525241281
+Ru D
+ 5.550032090322 -0.057846761038
+ 2.446355819382 0.253638478653
+ 1.073342025516 0.583490878354
+ 0.429389404890 0.589592130440
+Ru D
+ 0.165052645167 1.116296297183
+ 0.068844897289 0.265257106223
+
+#BASIS SET:
+Rh S
+ 10.612277159968 0.168503102460
+ 4.175711357562 -1.847338836606
+ 3.540198264706 1.311067350155
+ 1.054712457574 0.697993868303
+ 0.463476941827 0.185865531158
+Rh S
+ 1.120987885596 0.630953703067
+ 0.505354440197 0.263165934158
+Rh S
+ 0.107799055751 0.895058309015
+ 0.035593708316 0.457714131915
+Rh P
+ 3.915762760107 -1.155445931450
+ 3.214946272686 1.203030672069
+ 1.138390014316 0.792074916425
+ 0.483900206512 0.361955969996
+Rh P
+ 0.173801790703 1.467776545715
+ 0.058955083779 1.888699833975
+Rh D
+ 6.396827458805 -0.048409300225
+ 2.908324768342 0.267035530162
+ 1.367666120621 0.694020583366
+ 0.573074572998 0.792961890374
+Rh D
+ 0.215417984872 1.166151849377
+ 0.074694475620 0.264937531216
+
+#BASIS SET:
+Pd S
+ 11.260042205230 0.164097577056
+ 4.447554407937 -1.866722502527
+ 3.798950429913 1.359736862671
+ 1.125191239801 0.565530046178
+ 0.452556029260 0.188772887643
+Pd S
+ 1.151505961507 0.944652624868
+ 0.758022887254 0.204351049313
+Pd S
+ 0.114341106908 0.954437219940
+ 0.041622071605 0.471208585065
+Pd P
+ 4.280202330909 -1.177377340534
+ 3.258393937931 1.288735738708
+ 1.165454990282 1.060685702915
+ 0.488221099390 0.409953870721
+Pd P
+ 0.201275394801 1.775717120960
+ 0.057053584256 1.173935561093
+Pd D
+ 6.939335039037 -0.045626149466
+ 3.004130610160 0.311839410976
+ 1.365590049615 0.718055557380
+ 0.570518822592 0.709739345643
+Pd D
+ 0.235211306335 0.761544637626
+ 0.098548614743 0.252650801917
+
+#BASIS SET:
+Ag S
+ 11.137380185754 0.170113394090
+ 4.989686293940 -1.887735459567
+ 4.475920194052 1.434356812706
+ 1.201486343175 0.868528612669
+ 0.510392667886 0.192122601443
+Ag S
+ 1.168865294473 0.981239664457
+ 0.703892508518 0.301221440276
+Ag S
+ 0.109374172855 1.053993979072
+ 0.033776584003 0.411521983508
+Ag P
+ 4.556372684610 -1.202933355935
+ 3.513166261338 1.313296588850
+ 1.244794863986 1.036312910226
+ 0.512047738553 0.383294270449
+Ag P
+ 0.176416800129 0.973703560128
+ 0.052372032836 0.912910487489
+Ag D
+ 7.446453391764 -0.045685848836
+ 3.255927976164 0.334321217612
+ 1.458177385755 0.759037145474
+ 0.596305718479 0.729668743368
+Ag D
+ 0.229825513509 1.458004453308
+ 0.090151969871 0.269101010696
+
+#BASIS SET:
+Cd S
+ 12.017244504805 0.169339966251
+ 5.114217825559 -1.898543443118
+ 4.446940115430 1.450853794865
+ 1.216628349228 0.933523984483
+ 0.532052126747 0.193968463984
+Cd S
+ 1.055105987411 1.272808254083
+ 0.604148329306 0.229142637869
+Cd S
+ 0.146383394326 0.696754211201
+ 0.047046479439 0.358960905929
+Cd P
+ 4.757233246634 -1.076910687488
+ 3.867372408050 1.151779729705
+ 1.358823077137 0.721432137820
+ 0.577098779341 0.278790097257
+Cd P
+ 0.182360815830 0.804988419982
+ 0.062419275984 0.598193345980
+Cd D
+ 8.110020059042 -0.025222412807
+ 3.451726768569 0.211885297280
+ 1.581708281795 0.447542350487
+ 0.673407975664 0.418668166040
+Cd D
+ 0.272634468912 1.116448847886
+ 0.113339867322 0.175327136751
+
+#BASIS SET:
+In S
+ 1.424753204835 0.195216369564
+ 0.967776518278 -0.390335918373
+ 0.189316580236 0.332066512393
+ 0.069682986693 0.481912204467
+In S
+ 0.262064023539 0.051928774476
+ 0.064458770986 0.801906317169
+In P
+ 1.810449027147 0.061344922403
+ 1.046643496682 -0.193157129825
+ 0.184987578096 0.458390096608
+ 0.071861619824 0.194158130930
+In P
+ 0.138098170856 0.104631733864
+ 0.041247580889 0.868493244912
+In D
+ 0.138242005138 0.573395637068
+ 0.079777300441 1.016219806155
+
+#BASIS SET:
+Sn S
+ 2.401062078850 0.149527197103
+ 1.153316115528 -0.502781332360
+ 0.226914764388 0.515242159192
+ 0.110037948497 0.383620829459
+Sn S
+ 0.317225487215 0.157353145385
+ 0.061053997178 0.982405762325
+Sn P
+ 2.566898425488 0.055051200864
+ 1.499260435985 -0.169738769858
+ 0.227034747645 0.493183210725
+ 0.092179498934 0.327033786313
+Sn P
+ 0.173407347066 0.225606877021
+ 0.048674549711 2.065300464132
+Sn D
+ 0.241102386017 1.101090309967
+ 0.116551335998 1.474465735400
+
+#BASIS SET:
+Sb S
+ 1.901291496434 0.372390986717
+ 1.449475120294 -0.589821664528
+ 0.255179663388 0.238020525012
+ 0.159557301375 0.282602074546
+Sb S
+ 0.355114126591 0.126701793951
+ 0.067393141859 1.046620800528
+Sb P
+ 2.375689851709 0.051122584238
+ 1.348062774398 -0.161075779289
+ 0.243744636819 0.405765152732
+ 0.107166990009 0.186486532978
+Sb P
+ 0.205419448722 0.140291569705
+ 0.062916291344 1.926934328503
+Sb D
+ 0.205284248872 1.923448831425
+ 0.161194685111 1.084075276779
+
+#BASIS SET:
+Te S
+ 2.083514130685 0.802285578371
+ 1.714507901604 -1.147432002317
+ 0.243721857039 0.709372799498
+ 0.130681370450 0.126965084683
+Te S
+ 0.315794405079 0.190600374708
+ 0.088221331240 0.836704224185
+Te P
+ 2.481012810644 0.073901692051
+ 1.399169205477 -0.238783627662
+ 0.229946308236 0.340455500866
+ 0.127434638298 0.247951299805
+Te P
+ 0.503816419435 0.919676716095
+ 0.056809623374 0.797571264542
+Te D
+ 0.192018182978 1.571397655653
+ 0.064437864854 0.969777509197
+
+#BASIS SET:
+I S
+ 2.467637755918 0.270191039383
+ 1.696928162357 -0.514853581902
+ 0.298836215328 0.446031974526
+ 0.139199227031 0.193993291432
+I S
+ 0.241437612793 0.214060325418
+ 0.078209906425 0.853326113661
+I P
+ 0.954417866049 -0.267890449748
+ 0.663589859519 0.294307194920
+ 0.236750134212 0.246021979653
+ 0.130584739507 0.138773377806
+I P
+ 0.229168242463 0.063105849732
+ 0.067220884629 2.030056520658
+I D
+ 0.327941623565 0.159626880427
+ 0.228514545922 1.064910731508
+
+#BASIS SET:
+Xe S
+ 2.755095924827 0.249896927023
+ 1.738312366995 -0.537631660732
+ 0.358046650467 0.489101738862
+ 0.142124603625 0.268006037727
+Xe S
+ 0.275478850221 0.193322700350
+ 0.100907057598 0.887859271842
+Xe P
+ 1.117839719300 -0.276367079429
+ 0.691267910074 0.323178756010
+ 0.279267713061 0.298358628779
+ 0.155955498860 0.226885229369
+Xe P
+ 0.265352026982 0.067336765896
+ 0.075892744312 1.830243781388
+Xe D
+ 0.402780208511 0.161378596156
+ 0.243632826926 1.043744835741
+
+#BASIS SET:
+Cs S
+ 2.172433323343 0.093944897136
+ 1.213559429867 -0.574876851884
+ 0.913074181038 0.380293350663
+ 0.313880498180 0.275539125878
+Cs S
+ 1.294434390251 -0.246882008315
+ 1.088756263514 0.352493537018
+ 0.141990201906 1.013884420396
+Cs S
+ 0.110486818629 0.142948191575
+ 0.021036226155 0.289955177474
+Cs P
+ 1.296612062813 -0.158205270387
+ 0.511587649506 0.411480813114
+ 0.201550092329 0.414262558281
+ 0.065338478075 0.072302175823
+Cs P
+ 0.299675722332 -0.097355523294
+ 0.063470843305 0.070064297558
+ 0.029363753532 0.660221576187
+Cs D
+ 0.288013152964 1.130417708310
+ 0.094776153263 0.956697547107
+
+#BASIS SET:
+Ba S
+ 2.165224439573 0.172126972669
+ 1.563606663015 -0.412498520678
+ 0.468388660094 0.403433792501
+ 0.182490630329 0.153935914977
+Ba S
+ 0.969811881199 0.332173932718
+ 0.438335807571 -0.093598712810
+ 0.051413927514 0.275829475814
+Ba S
+ 0.038420685468 0.348249206633
+ 0.034827156628 1.049222993274
+Ba P
+ 1.246867677258 -0.090713352176
+ 0.619719104166 0.186065346861
+ 0.314722191970 0.053345917338
+ 0.218214872641 0.124038886456
+Ba P
+ 0.888823238004 -0.009266901736
+ 0.112339424956 0.520029153739
+ 0.048917251542 0.294753654056
+Ba D
+ 0.376362798375 0.740747300068
+ 0.122678391091 0.544085547842
+
+#BASIS SET:
+La S
+ 2.961518815289 0.344436595849
+ 2.116898887325 -0.624622276100
+ 0.521152464968 0.515127237675
+ 0.230069398586 0.246218458006
+La S
+ 0.579546119270 0.937709515416
+ 0.290509752938 1.164367331351
+La S
+ 0.048916762584 1.105995776925
+ 0.018604737481 0.464967111858
+La P
+ 3.245245643206 0.176769094460
+ 2.470890435412 -0.332713646676
+ 0.545303856973 0.526757519263
+ 0.216537966689 0.336610767661
+La P
+ 0.033299369099 1.044022672845
+ 0.006745724156 0.067353901679
+La D
+ 1.509848815606 -0.091622463451
+ 0.743009875436 0.179196344423
+ 0.339879135044 0.485427467590
+ 0.137512286533 0.463167031971
+La D
+ 0.060432252241 0.866616108693
+ 0.021446771088 0.436873072386
+
+#BASIS SET:
+Ce S
+ 2.338708269400 0.130423191930
+ 1.321938059900 -0.945407435390
+ 0.936653430510 0.578660129970
+ 0.433174507830 0.199228432480
+Ce S
+ 0.787161385190 0.613999779320
+ 0.256141028710 0.974944732900
+Ce S
+ 0.042862440378 1.450044603900
+ 0.019521352708 0.347469655730
+Ce P
+ 2.219774933800 0.196030889890
+ 2.018912507100 -0.320257523750
+ 0.592087363040 0.406207592670
+ 0.277158908340 0.243517746880
+Ce P
+ 0.156062580290 1.000000000000
+Ce D
+ 1.084221385500 -0.014352880349
+ 0.448540175050 0.588466222730
+ 0.151960945110 0.147553217860
+Ce D
+ 0.139549259200 1.539344007900
+ 0.040874025783 0.369009644880
+Ce F
+ 34.145399262000 0.032170509402
+ 12.065428998000 0.140071215210
+ 4.450111205600 0.284220463290
+ 1.676403229600 0.342402334160
+ 0.652084636310 0.123817383030
+Ce F
+ 0.545199397300 0.899844805050
+ 0.161907400480 0.966849207400
+
+#BASIS SET:
+Pr S
+ 3.168984651800 0.171631098330
+ 1.667890716800 -0.995838233800
+ 0.812186125420 0.272458450300
+ 0.601529244950 0.152421654920
+Pr S
+ 0.903875352100 0.657504248670
+ 0.301189177150 0.542467435400
+Pr S
+ 0.062615656705 0.959511574480
+ 0.022405546382 0.289850823590
+Pr P
+ 2.482108136900 0.203400612200
+ 2.257381996700 -0.340953226390
+ 0.685565850410 0.453133555060
+ 0.387144669100 0.087858983753
+Pr P
+ 0.220381064000 1.000000000000
+Pr D
+ 1.002533215800 -0.019351122016
+ 0.750223540940 0.598029336850
+ 0.229329053320 0.138544903510
+Pr D
+ 0.205673753170 0.684336589290
+ 0.046531823684 0.191287740830
+Pr F
+ 40.232881930000 0.039727695667
+ 14.439096901000 0.185973568320
+ 5.666824231800 0.364846754290
+ 2.437873255300 0.393202983180
+ 1.049827395300 0.151852361610
+Pr F
+ 1.155904606900 0.649433443810
+ 0.367184865050 0.738335210940
+
+#BASIS SET:
+Nd S
+ 2.399445231100 0.330052519610
+ 1.682884324300 -1.087812006300
+ 0.988141728820 0.228026434820
+ 0.208920206230 0.190568753630
+Nd S
+ 1.127298797900 0.721120134100
+ 0.489284491850 0.803593302000
+Nd S
+ 0.024303840698 0.504862388910
+ 0.013029609227 0.612696241810
+Nd P
+ 2.324735346000 0.170147916960
+ 2.056219145600 -0.355178646810
+ 0.697829062940 0.572197291710
+ 0.509194679390 0.085761284929
+Nd P
+ 0.235469139510 1.000000000000
+Nd D
+ 1.074577298900 -0.017620186217
+ 0.472062103620 0.708203155280
+ 0.200015863370 0.168647666540
+Nd D
+ 0.142617264320 0.520890592900
+ 0.057631923663 0.180610053610
+Nd F
+ 46.847185945000 0.030502424037
+ 17.019304283000 0.153104586980
+ 6.798621254000 0.316876806310
+ 2.887091563100 0.406866529590
+ 1.108982204200 0.166396269320
+Nd F
+ 1.261794730700 0.585966756640
+ 0.388764008300 0.611998230940
+
+#BASIS SET:
+Pm S
+ 2.930428338900 0.357177154220
+ 1.927498493600 -1.170119154300
+ 0.772548914370 0.230068336210
+ 0.169843099460 0.183828159010
+Pm S
+ 1.062021113300 0.229843223870
+ 0.443061656340 0.640385348860
+Pm S
+ 0.047080341941 0.566945207290
+ 0.020131076829 0.951730188260
+Pm P
+ 2.526409558100 0.186350621230
+ 2.182882378400 -0.359383234040
+ 0.678345299410 0.573936192200
+ 0.368422273410 0.203240994710
+Pm P
+ 0.217452026430 1.000000000000
+Pm D
+ 1.175937624900 -0.019615039553
+ 0.437438037980 0.684495543500
+ 0.212867517890 0.138802799410
+Pm D
+ 0.112237432370 0.346138298110
+ 0.029513406922 0.263739259970
+Pm F
+ 49.708022670000 0.030656850946
+ 17.966438036000 0.154675899030
+ 7.102080585100 0.330654822160
+ 2.908142509900 0.444450968650
+ 1.215606314300 0.270568901430
+Pm F
+ 0.879756044810 0.562280426050
+ 0.359805598200 0.722340818030
+
+#BASIS SET:
+Sm S
+ 3.198468986000 0.388700993160
+ 2.054638395700 -1.296672422600
+ 0.824222222660 0.227766807290
+ 0.160168477290 0.172833079050
+Sm S
+ 0.946609884890 0.286948469730
+ 0.428263406830 0.671620639940
+Sm S
+ 0.055236922625 0.599595120810
+ 0.024617411607 0.912179105210
+Sm P
+ 2.513124078800 0.191310169700
+ 2.237188900400 -0.361399138370
+ 0.697827704550 0.575663871770
+ 0.380565701450 0.175498933290
+Sm P
+ 0.225623014280 1.000000000000
+Sm D
+ 0.926535125110 -0.017463655728
+ 0.519362892700 0.585997594070
+ 0.221098036510 0.167648746790
+Sm D
+ 0.154304824910 0.311945638600
+ 0.053900115632 0.235546802760
+Sm F
+ 50.128022078000 0.032967782994
+ 18.164742527000 0.158422860610
+ 7.183502716200 0.325839145800
+ 2.962754106700 0.421637206220
+ 1.232140976700 0.251720808500
+Sm F
+ 0.930577398270 0.531939752500
+ 0.372797890110 0.763575162480
+
+#BASIS SET:
+Eu S
+ 3.345059905700 0.375970166750
+ 2.075574062000 -1.405131776800
+ 0.940426981650 0.232505604610
+ 0.181674650410 0.192608133490
+Eu S
+ 1.052166334100 0.327606031730
+ 0.442926265500 0.643659880550
+Eu S
+ 0.061414092824 0.588230789160
+ 0.031784177184 0.930609513330
+Eu P
+ 2.495634008500 0.194164540170
+ 2.292506715600 -0.356747943100
+ 0.720591307920 0.567428064980
+ 0.386142526820 0.170263118340
+Eu P
+ 0.229374588050 1.000000000000
+Eu D
+ 0.964946609210 -0.016327079928
+ 0.497327762740 0.714355608550
+ 0.212503372040 0.164848026920
+Eu D
+ 0.155452858570 0.317749826660
+ 0.058419430722 0.210656287270
+Eu F
+ 51.095384126000 0.035305949222
+ 18.473596326000 0.162335136700
+ 7.346815898000 0.324543595140
+ 3.059653733000 0.409215096810
+ 1.273688657300 0.267755957520
+Eu F
+ 0.881971831840 0.506986166910
+ 0.381292553470 0.853657650950
+
+#BASIS SET:
+Gd S
+ 4.000019104300 0.333487234160
+ 2.195731389500 -1.484942023100
+ 0.965241154350 0.249401906810
+ 0.162248187410 0.184150027830
+Gd S
+ 1.098367709400 0.332219940840
+ 0.432548270620 0.524530415210
+Gd S
+ 0.064209408826 0.603912156030
+ 0.033870628205 0.929468533350
+Gd P
+ 2.427873110800 0.198067677630
+ 2.311779823100 -0.357983931930
+ 0.739703409910 0.578006472370
+ 0.366725221840 0.189778608390
+Gd P
+ 0.227828614570 1.000000000000
+Gd D
+ 0.917667303880 -0.016219993919
+ 0.529461794640 0.687152060260
+ 0.221266400270 0.194933590720
+Gd D
+ 0.150791182580 0.317109114010
+ 0.055406934975 0.219244642480
+Gd F
+ 52.702891850000 0.037944092036
+ 18.968629359000 0.172950486860
+ 7.483761636800 0.339296635260
+ 3.124551141300 0.409244092080
+ 1.297917258000 0.266432518960
+Gd F
+ 0.882336787390 0.516585464810
+ 0.384484579370 0.873328896390
+
+#BASIS SET:
+Tb S
+ 3.201386505200 0.411894267370
+ 2.126784415700 -1.506935953000
+ 0.999097495030 0.254535056700
+ 0.157417449500 0.117990777940
+Tb S
+ 1.023627053700 0.340110769870
+ 0.430697801590 0.454888900280
+Tb S
+ 0.069974898990 0.698643296210
+ 0.032940990975 0.740386009190
+Tb P
+ 2.073989571400 -0.343595673730
+ 1.828471268600 0.215362185340
+ 0.775726769020 0.490677935480
+ 0.384091371450 0.256087426350
+Tb P
+ 0.217915710920 1.000000000000
+Tb D
+ 0.825171758200 -0.015782273999
+ 0.600154080990 0.515824182390
+ 0.239457542090 0.219930276580
+Tb D
+ 0.170840294520 0.339344511810
+ 0.066393296853 0.179617582290
+Tb F
+ 56.589558964000 0.038014387935
+ 20.155230196000 0.179793848070
+ 7.820360552900 0.362111598160
+ 3.178882888700 0.431909631700
+ 1.289918311100 0.254255670440
+Tb F
+ 0.942818356420 0.524283327370
+ 0.375168165820 0.774641908370
+
+#BASIS SET:
+Dy S
+ 3.077638972300 0.474009059980
+ 2.144516903200 -1.571543383700
+ 1.032524860600 0.251872641160
+ 0.149884855500 0.115997598420
+Dy S
+ 1.187918492500 0.371841314040
+ 0.439964465440 0.439801832140
+Dy S
+ 0.049122676587 0.662889360870
+ 0.020927956353 0.758733207730
+Dy P
+ 2.282049763500 -0.335364965580
+ 2.149986794100 0.212987377950
+ 0.778513739630 0.469808752900
+ 0.375219342670 0.251032791170
+Dy P
+ 0.199939554420 1.000000000000
+Dy D
+ 0.832650334010 -0.016672954384
+ 0.586505648120 0.572585525810
+ 0.249679875370 0.220091894330
+Dy D
+ 0.180105077260 0.344582383530
+ 0.065825970099 0.191502604620
+Dy F
+ 57.861342453000 0.038972875107
+ 20.635393825000 0.181045246390
+ 8.004535365600 0.359424664790
+ 3.226449556100 0.423360346770
+ 1.294175171300 0.256326644590
+Dy F
+ 0.881165570020 0.496894803850
+ 0.339754421590 0.786379098440
+
+#BASIS SET:
+Ho S
+ 3.215855287600 0.435915312520
+ 2.254633264000 -1.409057919500
+ 0.926678464460 0.236812959040
+ 0.197978695910 0.113859531790
+Ho S
+ 1.074584686700 0.349159708930
+ 0.433613079530 0.295799100520
+Ho S
+ 0.053966343732 0.578706693790
+ 0.021991634871 0.803713364560
+Ho P
+ 2.276740546700 -0.309919424620
+ 1.875689434100 0.222433385250
+ 0.769351845080 0.483617009030
+ 0.342467015030 0.282620610930
+Ho P
+ 0.189559521080 1.000000000000
+Ho D
+ 0.723086365990 -0.014022935559
+ 0.636267204790 0.519060618500
+ 0.234931783170 0.244755281630
+Ho D
+ 0.179485956040 0.338303537140
+ 0.073557821351 0.179021787710
+Ho F
+ 60.830053758000 0.038490576289
+ 21.588065934000 0.181910610310
+ 8.250479411400 0.365992385350
+ 3.275619486500 0.415195997170
+ 1.280581763200 0.223432973500
+Ho F
+ 1.061858771100 0.560929808620
+ 0.378053363800 0.866634478600
+
+#BASIS SET:
+Er S
+ 3.300008194300 0.449912443110
+ 2.261456184800 -1.616929874300
+ 0.901142008760 0.243660305590
+ 0.185129017200 0.116060607530
+Er S
+ 1.234831222100 0.310759560470
+ 0.435101114100 0.233295092470
+Er S
+ 0.039648759574 0.783823145590
+ 0.036670951258 0.635200383500
+Er P
+ 2.327643667900 -0.301384780980
+ 1.853492883500 0.228041624770
+ 0.774338801330 0.459307886770
+ 0.361332277440 0.285507707070
+Er P
+ 0.186448517290 1.000000000000
+Er D
+ 0.900253313010 -0.017255084799
+ 0.645098906620 0.519588132000
+ 0.171165586210 0.251556771310
+Er D
+ 0.177202737280 0.375443551790
+ 0.063204717631 0.194404298070
+Er F
+ 66.650276003000 0.038515010289
+ 23.862873532000 0.185215593670
+ 9.413269867800 0.371237926920
+ 3.903703455900 0.442139749250
+ 1.628218800500 0.297041225020
+Er F
+ 0.964181588410 0.515260179640
+ 0.417374040540 0.679879748240
+
+#BASIS SET:
+Tm S
+ 3.529951967700 0.440030003130
+ 2.373264752500 -1.676147150600
+ 0.859732693780 0.244015605370
+ 0.188994272710 0.126181490180
+Tm S
+ 1.237714700000 0.353524952200
+ 0.437883330560 0.257138376250
+Tm S
+ 0.056601467736 0.751591102950
+ 0.034452658435 0.665435412990
+Tm P
+ 2.389415038600 -0.298798971110
+ 1.855600057200 0.231839187010
+ 0.806657888830 0.465772792620
+ 0.364317357270 0.303562724750
+Tm P
+ 0.189679696220 1.000000000000
+Tm D
+ 0.842104081560 -0.016821641435
+ 0.642693889420 0.525275761120
+ 0.174205588420 0.254940589090
+Tm D
+ 0.185376205340 0.386133952670
+ 0.066999787462 0.185960001190
+Tm F
+ 69.098639820000 0.039780930603
+ 24.622398191000 0.189150061040
+ 9.670584964100 0.377486221370
+ 3.967675958400 0.447132729830
+ 1.602691573900 0.305383090360
+Tm F
+ 0.862982955250 0.539209910530
+ 0.387135399100 0.660330253160
+
+#BASIS SET:
+Yb S
+ 4.129766509200 0.314141037340
+ 2.367490669700 -1.896766067000
+ 0.806650220260 0.225771432160
+ 0.202273850080 0.109885398660
+Yb S
+ 1.438428189100 0.334354680940
+ 0.458551811280 0.223260919060
+Yb S
+ 0.060105436712 0.719579093490
+ 0.035482632702 0.643413955490
+Yb P
+ 2.471659314300 -0.295125193320
+ 1.860993668600 0.237399570240
+ 0.823485411640 0.457292600590
+ 0.367510901360 0.351242665330
+Yb P
+ 0.171373315040 1.000000000000
+Yb D
+ 0.805143471510 -0.017580695087
+ 0.498184965450 0.507346270890
+ 0.183187725480 0.270574390960
+Yb D
+ 0.048692800777 0.230634878920
+ 0.002065313870 0.297864857370
+Yb F
+ 69.918468598000 0.041342514961
+ 24.947950088000 0.192551350850
+ 9.765463162000 0.376774380600
+ 4.011151681600 0.429599734010
+ 1.635982295700 0.293066793210
+Yb F
+ 0.885391198850 0.546626208600
+ 0.427393308340 0.733864066460
+
+#BASIS SET:
+Lu S
+ 3.997702916200 0.457441680180
+ 2.709261988900 -1.540133835600
+ 0.902625980060 0.245518229230
+ 0.197099834360 0.103486958220
+Lu S
+ 1.249815976000 0.312543278220
+ 0.465480048010 0.234491973770
+Lu S
+ 0.080960274419 0.624707137520
+ 0.047485302330 0.624526188080
+Lu P
+ 2.581464009800 -0.297208403370
+ 1.976311904200 0.250055874640
+ 0.835207096090 0.450659846040
+ 0.356413889420 0.305459141120
+Lu P
+ 0.156421764500 1.000000000000
+Lu D
+ 0.917062629490 -0.017339228386
+ 0.572775203150 0.503610273730
+ 0.168191033340 0.248443090760
+Lu D
+ 0.086838737228 0.311494627600
+ 0.038900529418 0.217449926630
+Lu F
+ 73.095465952000 0.042169444125
+ 26.023715946000 0.195911479400
+ 10.230613237000 0.380324283710
+ 4.182255200200 0.437222388230
+ 1.656374469500 0.297996648530
+Lu F
+ 0.826840213730 0.493037406260
+ 0.497649365380 0.842886162920
+
+#BASIS SET:
+Hf S
+ 5.745263913265 0.276398505380
+ 3.894142553735 -0.839898489463
+ 3.121027335851 0.474240430954
+ 0.635496702086 0.158777918809
+Hf S
+ 0.764844351020 1.707468977910
+ 0.290339685115 0.519137602207
+Hf S
+ 0.082091325177 1.958910571765
+ 0.025505481057 0.408538141346
+Hf P
+ 8.047349756152 0.134374727878
+ 5.353154905330 -0.304612877776
+ 0.989731302776 0.607862566692
+ 0.373084955619 0.433934935433
+Hf P
+ 0.042930780688 1.371306677174
+ 0.018204544818 0.809563007668
+Hf D
+ 3.430686255910 -0.067789549648
+ 1.284638756596 0.283064494075
+ 0.433718517572 0.619508023969
+ 0.150295851218 0.390710591201
+Hf D
+ 0.072503797491 0.794719269255
+ 0.027061821278 0.342285689082
+
+#BASIS SET:
+Ta S
+ 5.963803707657 0.279364073058
+ 3.961116677799 -0.850260809206
+ 3.085766632618 0.479568840294
+ 0.665808701395 0.193394187367
+Ta S
+ 0.805972319842 1.688246848358
+ 0.268930532034 0.379133660479
+Ta S
+ 0.086553429743 2.595263529220
+ 0.027791157335 0.533383099437
+Ta P
+ 8.182904208394 0.136625206869
+ 5.419010727730 -0.317091050150
+ 1.080897305362 0.587463210567
+ 0.427101858969 0.435988981255
+Ta P
+ 0.123665166522 1.348117857613
+ 0.074631391466 0.779829286766
+Ta D
+ 3.938780857951 -0.057900768529
+ 1.367499138219 0.225894545813
+ 0.629935945405 0.379219030762
+ 0.273092407860 0.508381938982
+Ta D
+ 0.099925683041 0.812911312187
+ 0.037943852635 0.332399580288
+
+#BASIS SET:
+W S
+ 6.106125947092 0.277025340819
+ 4.127702570212 -0.844730321999
+ 3.274970817330 0.483142244883
+ 0.733004472901 0.158006598997
+W S
+ 0.841444067228 1.644909883363
+ 0.271780212684 0.409568828017
+W S
+ 0.100331812736 1.869589815121
+ 0.029632732761 0.528929932062
+W P
+ 8.285553529800 0.157354211889
+ 5.615678187579 -0.352854952607
+ 1.158573046813 0.612420934890
+ 0.467617380312 0.446722978433
+W P
+ 0.137270627660 0.964875039887
+ 0.082293678701 0.246586455905
+W D
+ 4.192126847007 -0.054899215601
+ 1.448413562415 0.216997892231
+ 0.626627825114 0.432982414411
+ 0.249124670225 0.423123825902
+W D
+ 0.097421390343 1.090266290688
+ 0.036447054814 0.478789058281
+
+#BASIS SET:
+Re S
+ 6.488782003396 0.257736215445
+ 4.217568459632 -0.822875620070
+ 3.262879428220 0.479067032833
+ 0.754784222831 0.203147240894
+Re S
+ 0.941081734619 1.881764103129
+ 0.272712355739 0.429341141770
+Re S
+ 0.111126860141 2.255322226400
+ 0.040009355137 1.259531768378
+Re P
+ 7.520497681685 0.496927359844
+ 6.498258506185 -0.676809195328
+ 1.216427394186 0.572364689927
+ 0.495158438289 0.396340498872
+Re P
+ 0.136186441470 4.165341545653
+ 0.082441357688 0.908639271543
+Re D
+ 4.424276090621 -0.075416964612
+ 1.537528643872 0.289740692768
+ 0.720850710433 0.511674499486
+ 0.312377703413 0.543026746807
+Re D
+ 0.122723095719 0.419130234729
+ 0.050344645461 0.124831357094
+
+#BASIS SET:
+Os S
+ 6.589301521031 0.262324014895
+ 4.483410826406 -0.824151542759
+ 3.598784747600 0.488236883338
+ 0.830482873892 0.148959289850
+Os S
+ 0.974401985901 1.704276404135
+ 0.337099880208 0.419939365808
+Os S
+ 0.119371477901 1.190416170712
+ 0.040334652774 0.415098407891
+Os P
+ 8.018868508095 0.366991980684
+ 6.516333088213 -0.572512459986
+ 1.298661604992 0.634399744500
+ 0.537047008060 0.437296123703
+Os P
+ 0.163425595329 2.846863530709
+ 0.046752457019 0.349464207023
+Os D
+ 4.562022886893 -0.075845399099
+ 1.700498170420 0.262012418274
+ 0.783963505481 0.489253051009
+ 0.333034419339 0.480607051003
+Os D
+ 0.128320311421 1.946527751776
+ 0.051494690010 0.291341732890
+
+#BASIS SET:
+Ir S
+ 6.990696480869 0.238565941294
+ 4.657628826621 -0.824425741900
+ 3.822133240323 0.507613356927
+ 0.888825622345 0.119157387695
+Ir S
+ 1.023793770162 1.457427286127
+ 0.309199315635 0.281543512886
+Ir S
+ 0.127967479426 1.171391362467
+ 0.040088560933 0.521901566386
+Ir P
+ 8.689088185082 0.243119322841
+ 6.577366558565 -0.444544360607
+ 1.356413354364 0.645655248595
+ 0.544928093624 0.402704630432
+Ir P
+ 0.140794381741 0.983202269789
+ 0.048851626916 0.535708402160
+Ir D
+ 4.665178664686 -0.081060285159
+ 1.806050424121 0.274531805989
+ 0.787394625251 0.509312962011
+ 0.321373987554 0.421907340162
+Ir D
+ 0.121625299200 1.542032145949
+ 0.049105698957 0.215355952279
+
+#BASIS SET:
+Pt S
+ 7.679412615881 0.230877908692
+ 4.709087759587 -0.861654782814
+ 3.645313517688 0.541904557353
+ 0.953420950841 0.472553125277
+Pt S
+ 1.027769770519 1.543816333192
+ 0.895827169520 0.276175320568
+Pt S
+ 0.139555297410 1.211933971063
+ 0.049340522870 0.547971738045
+Pt P
+ 8.289144538801 0.712848039607
+ 7.316311140271 -0.940414367671
+ 1.449109980349 0.710309776673
+ 0.609243388052 0.456919979943
+Pt P
+ 0.193683119741 2.292500144336
+ 0.044166144409 0.224210089383
+Pt D
+ 4.910332949872 -0.079625149422
+ 1.938691426506 0.267813136394
+ 0.864747561557 0.492535757006
+ 0.357332283630 0.415036501570
+Pt D
+ 0.137946336644 2.006060038467
+ 0.050242829695 0.262075608301
+
+#BASIS SET:
+Au S
+ 8.482247329754 0.229037007166
+ 5.008887371391 -0.907405209082
+ 3.953463080432 0.532729463041
+ 1.016674280753 0.288719412142
+Au S
+ 1.244532317707 1.102769284388
+ 0.309885825329 0.343912022063
+Au S
+ 0.147596397487 1.099367153402
+ 0.047719646781 0.522674919172
+Au P
+ 8.831554690513 0.509475138022
+ 7.566282794345 -0.721074776360
+ 1.506700409832 0.698388067412
+ 0.612822693070 0.409074097606
+Au P
+ 0.176221965082 2.730273272472
+ 0.061994012169 1.122895118562
+Au D
+ 5.149888373979 -0.076940684362
+ 2.038287817758 0.266148488840
+ 0.898017206860 0.479912521191
+ 0.362635858492 0.377471368469
+Au D
+ 0.138458740562 2.462111180979
+ 0.046715591266 0.250738248974
+
+#BASIS SET:
+Hg S
+ 9.562300229367 0.181250771129
+ 5.079395532020 -0.869350443981
+ 3.982850471475 0.486376181267
+ 0.961127400893 0.258983305153
+Hg S
+ 1.377803059990 1.722106394360
+ 0.381440671172 0.250455726703
+Hg S
+ 0.172351434399 1.100334941673
+ 0.063943840693 0.677823980347
+Hg P
+ 9.893101932495 0.218067320959
+ 7.423812439662 -0.420280763099
+ 1.600738605984 0.669305658396
+ 0.652677096183 0.386781925502
+Hg P
+ 0.183498087066 2.466433663901
+ 0.064592858638 1.561759630522
+Hg D
+ 5.315692442902 -0.078574152910
+ 2.245554598474 0.243559945187
+ 1.001947266394 0.451151800485
+ 0.413902886479 0.345923827411
+Hg D
+ 0.167213642564 2.300779412806
+ 0.067098161686 0.210685863765
+
+#BASIS SET:
+Tl S
+ 1.505313049428 0.313662634995
+ 0.926406638906 -0.722746145099
+ 0.195472441877 0.634249265129
+ 0.079197571540 0.616447338532
+Tl S
+ 0.179423367042 0.144781868221
+ 0.072061846216 0.890059337687
+Tl P
+ 1.340445390895 0.090978395301
+ 0.866604785093 -0.259543806887
+ 0.217123440967 0.362159556277
+ 0.091408672799 0.365315709408
+Tl P
+ 0.591817274684 0.018848049794
+ 0.043682496425 1.416103944518
+Tl D
+ 0.117314837048 0.604178245689
+ 0.054459036250 0.886013825833
+
+#BASIS SET:
+Pb S
+ 1.341753004241 0.775578610898
+ 1.105497689546 -1.202765308935
+ 0.211699001507 0.723705670622
+ 0.129547855691 0.256761268066
+Pb S
+ 0.549706034009 0.131886387067
+ 0.063284849518 1.510485987615
+Pb P
+ 1.414803989187 0.152750104080
+ 1.024069215145 -0.288877125439
+ 0.186029005782 0.437219972839
+ 0.082796300988 0.118522273753
+Pb P
+ 0.331088269580 0.201768239863
+ 0.046482916169 3.750488820230
+Pb D
+ 0.200493671996 0.835023372291
+ 0.088262123032 1.095405604994
+
+#BASIS SET:
+Bi S
+ 1.598452354960 0.090505383135
+ 1.024031934201 -0.505556064927
+ 0.211694361874 0.733716856593
+ 0.188315325223 0.112026081947
+Bi S
+ 0.302425752233 0.209510573559
+ 0.084994052070 0.597248589405
+Bi P
+ 1.494708346058 0.395953810643
+ 1.254367409879 -0.565931848387
+ 0.207175316033 0.638559022440
+ 0.075039529988 0.141797873604
+Bi P
+ 0.199985131181 0.116806624587
+ 0.062373215520 3.086007117955
+Bi D
+ 0.125642967394 1.009438999376
+ 0.092809453349 0.739934271101
+
+#BASIS SET:
+Po S
+ 1.899315412209 0.398802099258
+ 1.330205684309 -0.819810438961
+ 0.295449029905 0.530266670623
+ 0.199451922422 0.183432964269
+Po S
+ 0.315025299523 0.224516411804
+ 0.104988411226 0.818764958809
+Po P
+ 1.611909638782 0.155499395899
+ 1.099867763114 -0.399122211489
+ 0.206617899935 0.547783019859
+ 0.120990132583 0.207180326062
+Po P
+ 0.471197704174 0.778854060703
+ 0.062603532297 1.066397923226
+Po D
+ 0.165433941928 0.960300261731
+ 0.062674049116 0.786318229414
+
+#BASIS SET:
+At S
+ 1.924148580749 0.525856661156
+ 1.346688705121 -1.067720621131
+ 0.359439113226 0.638723774762
+ 0.188782116680 0.502953817840
+At S
+ 0.436733046143 0.086926389554
+ 0.086660202651 0.943503262571
+At P
+ 2.082544353701 0.062574676624
+ 1.160295712623 -0.258455276224
+ 0.350563463518 0.458257809303
+ 0.160220695112 0.515683272949
+At P
+ 0.191314694381 0.104704037336
+ 0.069240934858 0.929284118241
+At D
+ 0.286195490906 0.174109793764
+ 0.168758025833 1.048978368888
+
+#BASIS SET:
+Rn S
+ 1.974019782555 0.658758503133
+ 1.478356500621 -1.204012936389
+ 0.338292719593 0.976772343385
+ 0.138050661999 0.215247233371
+Rn S
+ 0.239238522409 0.078556974413
+ 0.115156734440 1.152238798180
+Rn P
+ 1.866046112655 0.255378767651
+ 1.572334840818 -0.377681250068
+ 0.317716247096 0.531475806749
+ 0.141239873985 0.285072318557
+Rn P
+ 0.200353951276 0.101799470336
+ 0.077190057936 0.881907208554
+Rn D
+ 0.333065558020 0.245718887909
+ 0.214108626479 1.219897201244
+
+END
diff --git a/gpu4pyscf/drivers/dft_3c_driver.py b/gpu4pyscf/drivers/dft_3c_driver.py
new file mode 100644
index 00000000..adf9fe8f
--- /dev/null
+++ b/gpu4pyscf/drivers/dft_3c_driver.py
@@ -0,0 +1,409 @@
+# Copyright 2021-2024 The PySCF Developers. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+###################################################################
+# This is a customized driver for three composite ("3c") methods only.
+# It only works for b97-3c, r2scan-3c, and wb97x-3c.
+###################################################################
+
+import os
+import time
+import json
+import pyscf
+import argparse
+import tempfile
+import shutil
+import cupy
+import traceback
+import h5py
+import numpy as np
+from types import MethodType
+from pyscf import lib
+from pyscf import dft
+from pyscf.hessian import thermo
+from pyscf.lib import logger
+from pyscf.dispersion import dftd3, dftd4, gcp
+
+CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
+
+import importlib.metadata
+required_version = (1, 3, 0)
+installed_version = importlib.metadata.version('pyscf-dispersion')
+# Compare numeric version components; a plain string comparison would
+# incorrectly order e.g. '1.10.0' before '1.3.0'.
+assert tuple(int(p) for p in installed_version.split('.')[:3]) >= required_version, \
+    f'pyscf-dispersion >= 1.3.0 is required, found {installed_version}'
+
+def parse_3c(xc_name):
+ """
+ return xc, nlc, basis, ecp, (xc_disp, disp), xc_gcp
+ """
+ if xc_name == 'b973c':
+ return 'GGA_XC_B97_3C', 0, 'def2-mtzvp', None, ('b97-3c', 'D3BJ'), 'b973c'
+ elif xc_name == 'r2scan3c':
+ return 'r2scan', 0, 'def2-mtzvpp', None, ('r2scan-3c', 'D4'), 'r2scan3c'
+ elif xc_name == 'wb97x3c':
+        # 'Grimme vDZP' is available in BSE, but pyscf 2.8 is not able to parse its ECP properly
+ # basis = 'Grimme vDZP'
+ # ecp = 'Grimme vDZP'
+ basis = os.path.join(CURRENT_DIR, 'basis_vDZP_NWCHEM.dat')
+ ecp = os.path.join(CURRENT_DIR, 'ecp_vDZP_NWCHEM.dat')
+ return 'wb97x-v', 0, basis, ecp, ('wb97x-3c', 'D4'), None
+ else:
+        raise RuntimeError(f'Unknown xc functional {xc_name} for parsing 3c')
+
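+# A minimal sketch of what the parser returns (values taken from the table
+# above; nothing here is new behavior):
+#
+#   pyscf_xc, nlc, basis, ecp, (xc_disp, disp), xc_gcp = parse_3c('r2scan3c')
+#   # -> ('r2scan', 0, 'def2-mtzvpp', None, ('r2scan-3c', 'D4'), 'r2scan3c')
+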
+def get_dispersion(mol, xc, grad=True):
+ if xc == 'b97-3c':
+ d3_model = dftd3.DFTD3Dispersion(mol, xc=xc, atm=True)
+ res = d3_model.get_dispersion(grad=grad)
+ elif xc == 'r2scan-3c':
+        # r2scan-3c uses customized parameters
+ # https://github.com/psi4/psi4/blob/0e54962d629494f4ed142d0499d7faeaf36effdd/psi4/driver/procrouting/dft/mgga_functionals.py#L250
+ d4_model = dftd4.DFTD4Dispersion(mol, xc=xc, atm=True, ga=2.0, gc=1.0)
+ d4_model.set_param(0.0, 0.42, 5.65, s9=2.0)
+ res = d4_model.get_dispersion(grad=grad)
+ elif xc == 'wb97x-3c':
+ d4_model = dftd4.DFTD4Dispersion(mol, xc=xc, atm=True)
+ res = d4_model.get_dispersion(grad=grad)
+ else:
+ raise NotImplementedError
+ return res
+
+def gen_disp_fun(xc_disp, xc_gcp):
+ """
+ Generate a function to calculate the sum of dispersion and gcp contributions
+ """
+ def get_disp(mf, disp=None, with_3body=None, verbose=None):
+ mol = mf.mol
+ energy = 0.0
+ if xc_disp is not None:
+ res = get_dispersion(mol, xc_disp, grad=False)
+ energy += res.get('energy')
+ mf.scf_summary['dispersion'] = energy
+ if xc_gcp is not None:
+ gcp_model = gcp.GCP(mol, method=xc_gcp)
+ res = gcp_model.get_counterpoise()
+ energy += res['energy']
+ return energy
+ return get_disp
+
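+# The closure returned by gen_disp_fun is bound onto the mean-field object in
+# run_dft below, replacing the default dispersion hook. A minimal sketch,
+# using the r2scan-3c settings from parse_3c:
+#
+#   mf.get_dispersion = MethodType(gen_disp_fun('r2scan-3c', 'r2scan3c'), mf)
+#   mf.do_disp = lambda: True
+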
+def gen_disp_grad_fun(xc_disp, xc_gcp):
+ """
+    Generate a function to calculate the gradient of dispersion + gcp
+ """
+ def get_disp_grad(mf_grad, disp=None, with_3body=None, verbose=None):
+ mf = mf_grad.base
+ mol = mf.mol
+ gradient = 0.0
+ if xc_disp is not None:
+ res = get_dispersion(mol, xc_disp, grad=True)
+ gradient += res.get('gradient')
+
+ if xc_gcp is not None:
+ gcp_model = gcp.GCP(mol, method=xc_gcp)
+ res = gcp_model.get_counterpoise(grad=True)
+ gradient += res['gradient']
+ return gradient
+ return get_disp_grad
+
+def gen_disp_hess_fun(xc_disp, xc_gcp):
+ """
+    Generate a function to calculate the Hessian of dispersion + gcp via finite differences
+ """
+ def get_disp_hess(mf_hess, disp=None, with_3body=None):
+ mf = mf_hess.base
+ mol = mf.mol
+ natm = mol.natm
+ h_disp = np.empty([natm,natm,3,3])
+
+ coords = mf_hess.mol.atom_coords()
+ mol = mol.copy()
+ eps = 1e-5
+ for i in range(natm):
+ for j in range(3):
+ coords[i,j] += eps
+ mol.set_geom_(coords, unit='Bohr')
+ g1 = 0.0
+ if xc_disp is not None:
+ res = get_dispersion(mol, xc_disp, grad=True)
+ g1 += res.get('gradient')
+ if xc_gcp is not None:
+ gcp_model = gcp.GCP(mol, method=xc_gcp)
+ res = gcp_model.get_counterpoise(grad=True)
+ g1 += res['gradient']
+
+ coords[i,j] -= 2.0*eps
+ mol.set_geom_(coords, unit='Bohr')
+ g2 = 0.0
+ if xc_disp is not None:
+ res = get_dispersion(mol, xc_disp, grad=True)
+ g2 += res.get('gradient')
+ if xc_gcp is not None:
+ gcp_model = gcp.GCP(mol, method=xc_gcp)
+ res = gcp_model.get_counterpoise(grad=True)
+ g2 += res['gradient']
+
+ coords[i,j] += eps
+ h_disp[i,:,j,:] = (g1 - g2)/(2.0*eps)
+ return h_disp
+ return get_disp_hess
+
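+# gen_disp_hess_fun evaluates the dispersion + gCP Hessian by central finite
+# differences of the analytical gradient: for atom i and Cartesian component j,
+#
+#   h_disp[i, :, j, :] ~ (g(x_ij + eps) - g(x_ij - eps)) / (2 * eps),  eps = 1e-5 Bohr
+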
+def run_dft(mol_name, config, charge=None, spin=0):
+    ''' Perform DFT calculations based on the configuration file,
+    saving the results and timings to an HDF5 file and copying the
+    log file to the output directory.
+ '''
+ xc = config.get('xc', 'b3lyp')
+ grids = config.get('grids', {'atom_grid': (99,590)})
+ nlcgrids = config.get('nlcgrids', {'atom_grid': (50,194)})
+ verbose = config.get('verbose', 4)
+ scf_conv_tol = config.get('scf_conv_tol', 1e-10)
+ direct_scf_tol = config.get('direct_scf_tol', 1e-14)
+ with_df = config.get('with_df', True)
+ auxbasis = config.get('auxbasis', 'def2-universal-jkfit')
+ with_gpu = config.get('with_gpu', True)
+
+ with_grad = config.get('with_grad', True)
+ with_hess = config.get('with_hess', True)
+ with_thermo = config.get('with_thermo', False)
+ save_density = config.get('save_density', False)
+ input_dir = config.get('input_dir', './')
+
+ default_solvent = {'method': 'iefpcm', 'eps': 78.3553, 'solvent': 'water'}
+ with_solvent = config.get('with_solvent', False)
+ solvent = config.get('solvent', default_solvent)
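+    # The config dict mirrors the *_sample.json files shipped with this driver,
+    # e.g. (a minimal sketch):
+    #   {"xc": "r2scan3c", "molecules": ["h2o.xyz"], "input_dir": "./",
+    #    "output_dir": "./", "with_gpu": true, "with_df": true}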
+
+ pyscf_xc, nlc, basis, ecp, (xc_disp, disp), xc_gcp = parse_3c(xc)
+
+ # I/O
+ fp = tempfile.TemporaryDirectory()
+ local_dir = f'{fp.name}/'
+ logfile = f'{mol_name[:-4]}_pyscf.log'
+ shutil.copyfile(f'{input_dir}/{mol_name}', local_dir+mol_name)
+ cupy.get_default_memory_pool().free_all_blocks()
+ lib.num_threads(8)
+ start_time = time.time()
+ mol = pyscf.M(
+ atom=local_dir+mol_name,
+ basis=basis,
+ ecp=ecp,
+ max_memory=32000,
+ verbose=verbose,
+ charge=charge,
+ spin=spin,
+ output=f'{local_dir}/{logfile}')
+ mol.build()
+
+ mf = dft.KS(mol, xc=pyscf_xc)
+ if 'atom_grid' in grids: mf.grids.atom_grid = grids['atom_grid']
+ if 'level' in grids: mf.grids.level = grids['level']
+ if mf._numint.libxc.is_nlc(mf.xc):
+ if 'atom_grid' in nlcgrids: mf.nlcgrids.atom_grid = nlcgrids['atom_grid']
+ if 'level' in nlcgrids: mf.nlcgrids.level = nlcgrids['level']
+
+ if with_df:
+ mf = mf.density_fit(auxbasis=auxbasis)
+ if with_gpu:
+ mf = mf.to_gpu()
+
+ #### Changes for 3C methods #####
+ # Setup dispersion correction and GCP
+ mf.nlc = nlc
+ mf.get_dispersion = MethodType(gen_disp_fun(xc_disp, xc_gcp), mf)
+ mf.do_disp = lambda: True
+ #################################
+
+ mf.chkfile = None
+ if with_solvent:
+ if solvent['method'].endswith(('PCM', 'pcm')):
+ mf = mf.PCM()
+ mf.with_solvent.lebedev_order = 29
+ mf.with_solvent.method = solvent['method'].replace('PCM','-PCM')
+ mf.with_solvent.eps = solvent['eps']
+ elif solvent['method'].endswith(('smd', 'SMD')):
+ mf = mf.SMD()
+ mf.with_solvent.lebedev_order = 29
+ mf.with_solvent.method = 'SMD'
+ mf.with_solvent.solvent = solvent['solvent']
+ else:
+ raise NotImplementedError
+
+ mf.direct_scf_tol = direct_scf_tol
+ mf.chkfile = None
+ mf.conv_tol = scf_conv_tol
+ e_tot = mf.kernel()
+
+ if not mf.converged:
+ logger.warn(mf, 'SCF failed to converge')
+
+ scf_time = time.time() - start_time
+ print(f'compute time for energy: {scf_time:.3f} s')
+
+ e1 = mf.scf_summary.get('e1', 0.0)
+ e_coul = mf.scf_summary.get('coul', 0.0)
+ e_xc = mf.scf_summary.get('exc', 0.0)
+ e_disp = mf.scf_summary.get('dispersion', 0.0)
+ e_solvent = mf.scf_summary.get('e_solvent', 0.0)
+
+ data_file = mol_name[:-4] + '_pyscf.h5'
+
+ with h5py.File(f'{local_dir}/{data_file}', 'w') as h5f:
+ h5f.create_dataset('e_tot', data=e_tot)
+ h5f.create_dataset('e1', data=e1)
+ h5f.create_dataset('e_coul', data=e_coul)
+ h5f.create_dataset('e_xc', data=e_xc)
+ h5f.create_dataset('e_disp', data=e_disp)
+ h5f.create_dataset('e_solvent', data=e_solvent)
+ h5f.create_dataset('scf_time', data=scf_time)
+
+ dm = mf.make_rdm1()
+ if isinstance(dm, cupy.ndarray): dm = dm.get()
+ h5f.create_dataset('dm', data=dm)
+
+ if save_density and xc.lower() != 'hf':
+ weights = mf.grids.weights
+ coords = mf.grids.coords
+ dm0 = dm[0] + dm[1] if dm.ndim == 3 else dm
+ rho = mf._numint.get_rho(mf.mol, dm0, mf.grids)
+
+ if isinstance(weights, cupy.ndarray): weights = weights.get()
+ if isinstance(coords, cupy.ndarray): coords = coords.get()
+ if isinstance(rho, cupy.ndarray): rho = rho.get()
+
+ h5f.create_dataset('grids_weights', data=weights)
+ h5f.create_dataset('grids_coords', data=coords)
+ h5f.create_dataset('grids_rho', data=rho)
+
+ if dm.ndim == 3:
+ # open-shell case
+ mo_energy = mf.mo_energy
+ if isinstance(mo_energy, cupy.ndarray): mo_energy = mo_energy.get()
+ mo_energy[0].sort()
+ mo_energy[1].sort()
+ na, nb = mf.nelec
+ h5f.create_dataset('e_lumo_alpha', data=mo_energy[0][na])
+ h5f.create_dataset('e_lumo_beta', data=mo_energy[1][nb])
+ h5f.create_dataset('e_homo_alpha', data=mo_energy[0][na-1])
+ h5f.create_dataset('e_homo_beta', data=mo_energy[1][nb-1])
+ else:
+ # closed-shell case
+ mo_energy = mf.mo_energy
+ if isinstance(mo_energy, cupy.ndarray): mo_energy = mo_energy.get()
+ mo_energy.sort()
+ nocc = mf.mol.nelectron // 2
+ h5f.create_dataset('e_lumo', data=mo_energy[nocc])
+ h5f.create_dataset('e_homo', data=mo_energy[nocc-1])
+
+ ##################### Gradient Calculation ###############################
+ g = None
+ if with_grad:
+ try:
+ start_time = time.time()
+ g = mf.nuc_grad_method()
+            # Overwrite the dispersion gradient hook for the 3c methods
+ g.get_dispersion = MethodType(gen_disp_grad_fun(xc_disp, xc_gcp), g)
+ if with_df:
+ g.auxbasis_response = True
+ f = g.kernel()
+ g = None
+ grad_time = time.time() - start_time
+ print(f'compute time for gradient: {grad_time:.3f} s')
+ except Exception as exc:
+ print(traceback.format_exc())
+ print(exc)
+ f = -1
+ grad_time = -1
+
+ with h5py.File(f'{local_dir}/{data_file}', 'a') as h5f:
+ h5f.create_dataset('grad', data=f)
+ h5f.create_dataset('grad_time', data=grad_time)
+
+ #################### Hessian Calculation ###############################
+ h = None
+ if with_hess:
+ try:
+ natm = mol.natm
+ start_time = time.time()
+ h = mf.Hessian()
+            # Overwrite the dispersion Hessian hook for the 3c methods
+ h.get_dispersion = MethodType(gen_disp_hess_fun(xc_disp, xc_gcp), h)
+ h.auxbasis_response = 2
+ _h_dft = h.kernel()
+ h_dft = _h_dft.transpose([0,2,1,3]).reshape([3*natm, 3*natm])
+ hess_time = time.time() - start_time
+ print(f'compute time for hessian: {hess_time:.3f} s')
+
+ if with_thermo:
+ # harmonic analysis
+ start_time = time.time()
+ normal_mode = thermo.harmonic_analysis(mol, _h_dft)
+
+ thermo_dat = thermo.thermo(
+ mf, # GPU4PySCF object
+ normal_mode['freq_au'],
+ 298.15, # room temperature
+ 101325) # standard atmosphere
+ thermo_time = time.time() - start_time
+ print(f'compute time for harmonic analysis: {thermo_time:.3f} s')
+
+ except Exception as exc:
+ print(traceback.format_exc())
+ print(exc)
+ h_dft = -1
+ hess_time = -1
+
+ with h5py.File(f'{local_dir}/{data_file}', 'a') as h5f:
+ h5f.create_dataset('hess', data=h_dft)
+ h5f.create_dataset('hess_time', data=hess_time)
+
+ if with_thermo:
+ h5f.create_dataset('freq_au', data=normal_mode['freq_au'])
+ h5f.create_dataset('freq_wavenumber', data=normal_mode['freq_wavenumber'])
+ h5f.create_dataset('E_tot', data=thermo_dat['E_tot'][0])
+ h5f.create_dataset('H_tot', data=thermo_dat['H_tot'][0])
+ h5f.create_dataset('G_tot', data=thermo_dat['G_tot'][0])
+ h5f.create_dataset('E_elec', data=thermo_dat['E_elec'][0])
+ h5f.create_dataset('E_trans', data=thermo_dat['E_trans'][0])
+ h5f.create_dataset('E_rot', data=thermo_dat['E_rot'][0])
+ h5f.create_dataset('E_vib', data=thermo_dat['E_vib'][0])
+ h5f.create_dataset('E_0K', data=thermo_dat['E_0K'][0])
+ h5f.create_dataset('H_elec', data=thermo_dat['H_elec'][0])
+ h5f.create_dataset('H_trans', data=thermo_dat['H_trans'][0])
+ h5f.create_dataset('H_rot', data=thermo_dat['H_rot'][0])
+ h5f.create_dataset('H_vib', data=thermo_dat['H_vib'][0])
+ h5f.create_dataset('G_elec', data=thermo_dat['G_elec'][0])
+ h5f.create_dataset('G_trans', data=thermo_dat['G_trans'][0])
+ h5f.create_dataset('G_rot', data=thermo_dat['G_rot'][0])
+ h5f.create_dataset('G_vib', data=thermo_dat['G_vib'][0])
+
+ # copy the files to destination folder
+ output_dir = config['output_dir']
+    os.makedirs(output_dir, exist_ok=True)
+
+ shutil.copyfile(f'{local_dir}/{data_file}', f'{output_dir}/{data_file}')
+ shutil.copyfile(f'{local_dir}/{logfile}', f'{output_dir}/{logfile}')
+
+ return mf
+
+if __name__ == '__main__':
+ parser = argparse.ArgumentParser(description='Run DFT with GPU4PySCF for molecules')
+ parser.add_argument("--config", type=str, default='example.json')
+ parser.add_argument("--charge", type=int, default=None)
+ parser.add_argument("--spin", type=int, default=0)
+ args = parser.parse_args()
+
+ with open(args.config) as f:
+ config = json.load(f)
+ if isinstance(config, list):
+ config = config[0]
+ for mol_name in config['molecules']:
+ run_dft(mol_name, config, charge=args.charge, spin=args.spin)
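+
+# Example invocations, assuming the sample configs shipped alongside this
+# driver and the listed .xyz files in input_dir:
+#
+#   python dft_3c_driver.py --config dft_r2scan3c_sample.json
+#   python dft_3c_driver.py --config dft_wb97x3c_sample.json --charge 0 --spin 0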
diff --git a/gpu4pyscf/drivers/dft_b973c_sample.json b/gpu4pyscf/drivers/dft_b973c_sample.json
new file mode 100644
index 00000000..9085744c
--- /dev/null
+++ b/gpu4pyscf/drivers/dft_b973c_sample.json
@@ -0,0 +1,26 @@
+[{
+ "input_dir": "./",
+ "output_dir": "./",
+ "molecules": [
+ "h2o.xyz",
+ "h2o.xyz",
+ "h2o.xyz",
+ "h2o.xyz",
+ "h2o.xyz"
+ ],
+ "xc": "b973c",
+ "auxbasis": "def2-universal-JFIT",
+ "verbose": 6,
+ "with_solvent": false,
+ "with_thermo": false,
+ "solvent": {
+ "eps": 78.3553,
+ "solvent": "water",
+ "method": "SMD"
+ },
+ "with_gpu": true,
+ "with_df": true,
+ "with_grad": true,
+ "with_hess": true,
+ "save_density": true
+}]
diff --git a/gpu4pyscf/drivers/dft_r2scan3c_sample.json b/gpu4pyscf/drivers/dft_r2scan3c_sample.json
new file mode 100644
index 00000000..3f0eb6aa
--- /dev/null
+++ b/gpu4pyscf/drivers/dft_r2scan3c_sample.json
@@ -0,0 +1,26 @@
+[{
+ "input_dir": "./",
+ "output_dir": "./",
+ "molecules": [
+ "h2o.xyz",
+ "h2o.xyz",
+ "h2o.xyz",
+ "h2o.xyz",
+ "h2o.xyz"
+ ],
+ "xc": "r2scan3c",
+ "auxbasis": "def2-universal-JFIT",
+ "verbose": 4,
+ "with_solvent": false,
+ "with_thermo": false,
+ "solvent": {
+ "eps": 78.3553,
+ "solvent": "water",
+ "method": "SMD"
+ },
+ "with_gpu": true,
+ "with_df": true,
+ "with_grad": true,
+ "with_hess": true,
+ "save_density": false
+}]
diff --git a/gpu4pyscf/drivers/dft_wb97x3c_sample.json b/gpu4pyscf/drivers/dft_wb97x3c_sample.json
new file mode 100644
index 00000000..c32e0cc0
--- /dev/null
+++ b/gpu4pyscf/drivers/dft_wb97x3c_sample.json
@@ -0,0 +1,25 @@
+[{
+ "input_dir": "./",
+ "output_dir": "./",
+ "molecules": [
+ "h2o.xyz",
+ "h2o.xyz",
+ "h2o.xyz",
+ "h2o.xyz",
+ "h2o.xyz"
+ ],
+ "xc": "wb97x3c",
+ "verbose": 4,
+ "with_solvent": false,
+ "with_thermo": false,
+ "solvent": {
+ "eps": 78.3553,
+ "solvent": "water",
+ "method": "SMD"
+ },
+ "with_gpu": true,
+ "with_df": true,
+ "with_grad": true,
+ "with_hess": false,
+ "save_density": false
+}]
diff --git a/gpu4pyscf/drivers/ecp_vDZP_NWCHEM.dat b/gpu4pyscf/drivers/ecp_vDZP_NWCHEM.dat
new file mode 100644
index 00000000..228ad57d
--- /dev/null
+++ b/gpu4pyscf/drivers/ecp_vDZP_NWCHEM.dat
@@ -0,0 +1,1214 @@
+
+ECP
+B nelec 2
+B S
+2 4.50610000 23.99296000
+B P
+2 5.60000000 -1.30000000
+B D
+2 0.08000000 -0.00300000
+B ul
+2 1.00000000 0.00000000
+C nelec 2
+C S
+2 6.40105200 33.12163800
+C P
+2 7.30774700 -1.98625700
+C D
+2 5.96179600 -9.45431800
+C ul
+2 1.00000000 0.00000000
+N nelec 2
+N S
+2 7.97723200 38.53383100
+N P
+2 10.18385400 -2.55081000
+N D
+2 11.55994700 -2.99554500
+N ul
+2 1.00000000 0.00000000
+O nelec 2
+O S
+2 10.44567000 50.77106900
+O P
+2 18.04517400 -4.90355100
+O D
+2 8.16479800 -3.31212400
+O ul
+2 1.00000000 0.00000000
+F nelec 2
+F S
+2 22.35040000 102.59795200
+2 11.17520000 19.04966300
+F P
+2 26.47680000 -15.14396000
+2 13.23840000 2.80292100
+F D
+F ul
+2 1.00000000 0.00000000
+Ne nelec 2
+Ne S
+2 31.86016200 112.52543566
+2 12.36221900 28.30083454
+Ne P
+2 21.50803400 -11.12658543
+2 12.91044700 3.38754919
+Ne D
+2 0.85038500 -0.18408921
+Ne ul
+2 1.00000000 0.00000000
+Na nelec 2
+Na S
+2 2.47830001 -14.53866100
+2 3.09900001 31.91120791
+2 3.94710001 -32.32224607
+1 8.21659994 3.14094701
+0 1.50080000 1.87765500
+Na ul
+2 1.60680000 -0.00010200
+2 22.52309990 -1.71544300
+1 76.23649979 -1.36191100
+Mg nelec 2
+Mg S
+2 2.95159999 -15.62671006
+2 3.73249999 33.53119421
+2 4.81180000 -36.07307196
+1 9.91949999 3.12963399
+0 1.81850000 1.93843301
+Mg ul
+2 1.13200000 -0.00005300
+2 27.30570006 -1.93249400
+1 93.30560017 -1.39384700
+Al nelec 10
+Al S
+2 2.19822500 20.40981300
+Al P
+2 1.60139500 8.98049500
+Al D
+2 1.49902600 -1.97041100
+Al ul
+2 1.00000000 0.00000000
+Si nelec 10
+Si S
+2 2.71362197 26.62331865
+Si P
+2 1.96687987 10.92995391
+Si D
+2 2.71001600 -4.66941200
+Si ul
+2 1.00000000 0.00000000
+P nelec 10
+P S
+2 2.94055986 26.53226131
+P P
+2 2.22771255 11.49721021
+P D
+2 5.66170600 -16.77278000
+P ul
+2 1.00000000 0.00000000
+S nelec 10
+S S
+2 3.74389164 37.97481900
+S P
+2 3.08608744 18.79052931
+S D
+2 4.86241400 -7.83796400
+S ul
+2 1.00000000 0.00000000
+Cl nelec 10
+Cl S
+2 6.39430000 33.13663196
+2 3.19710000 16.27072783
+Cl P
+2 5.62070000 24.41699269
+2 2.81030000 7.68304978
+Cl D
+2 5.33810000 -8.58764865
+Cl ul
+2 1.00000000 0.00000000
+Ar nelec 10
+Ar S
+2 10.26172100 68.66778801
+2 3.95272500 24.04276629
+Ar P
+2 5.39271400 27.73076331
+2 2.69996700 4.04545904
+Ar D
+2 8.08623500 -8.13747696
+2 4.01663200 -1.66452808
+Ar F
+2 5.20845900 -3.40009845
+Ar ul
+2 1.00000000 0.00000000
+K nelec 10
+K S
+2 2.41030002 33.92499542
+2 2.77449989 -117.52191162
+2 3.33690000 108.25654602
+1 2.22169995 5.80375385
+0 12.19449997 3.21023202
+K P
+2 3.15459990 -69.97570801
+2 4.03189993 178.15260315
+2 5.16720009 -136.11946106
+1 11.33360004 4.43228197
+0 1.74329996 4.74835920
+K ul
+2 4.99889994 -1.96356797
+2 14.58220005 -15.53031635
+2 44.89559937 -38.57669830
+1 141.48350525 -7.30101395
+Ca nelec 10
+Ca S
+2 2.68330002 29.85718918
+2 3.08299994 -117.43736267
+2 3.70659995 102.65641785
+1 2.47650003 10.56573391
+0 13.28890038 3.14651394
+Ca P
+2 3.47429991 -77.17020416
+2 4.38199997 180.64172363
+2 5.66209984 -125.80883789
+1 2.20009995 5.88094378
+0 6.12169981 4.76948595
+Ca ul
+2 5.92140007 -2.26729298
+2 17.20369911 -17.17546654
+2 53.37039948 -42.60916519
+1 169.98489380 -7.46766806
+Sc nelec 10
+Sc S
+2 11.50000000 138.53815200
+2 5.18400000 14.83404210
+Sc P
+2 10.93000000 82.45861400
+2 4.58100000 8.56520569
+Sc D
+2 13.47000000 -16.12986210
+2 4.37500000 -0.53469012
+Sc ul
+2 1.00000000 0.00000000
+Ti nelec 10
+Ti S
+2 13.01000000 158.24159300
+2 5.86200000 17.51182390
+Ti P
+2 12.46000000 95.23512680
+2 5.21700000 10.04785600
+Ti D
+2 15.35000000 -17.56886120
+2 4.98000000 -0.58725612
+Ti ul
+2 1.00000000 0.00000000
+V nelec 10
+V S
+2 14.49000000 178.44797100
+2 6.52400000 19.83137520
+V P
+2 14.30000000 109.52976300
+2 6.02100000 12.57030950
+V D
+2 17.48000000 -19.21965700
+2 5.70900000 -0.64277474
+V ul
+2 1.00000000 0.00000000
+Cr nelec 10
+Cr S
+2 16.39000000 201.57888700
+2 7.40200000 24.20574090
+Cr P
+2 16.45000000 125.02277400
+2 6.96200000 16.47906550
+Cr D
+2 19.93000000 -20.82742110
+2 6.59800000 -0.83436781
+Cr ul
+2 1.00000000 0.00000000
+Mn nelec 10
+Mn S
+2 18.52000000 226.43090200
+2 8.37300000 30.35907230
+Mn P
+2 18.92000000 142.15470500
+2 8.01700000 21.53650930
+Mn D
+2 22.72000000 -22.56811870
+2 7.64000000 -1.20581020
+Mn ul
+2 1.00000000 0.00000000
+Fe nelec 10
+Fe S
+2 20.93000000 253.74958800
+2 9.44500000 37.92284500
+Fe P
+2 21.76000000 161.03681200
+2 9.17800000 27.65129800
+Fe D
+2 25.90000000 -24.43127600
+2 8.83500000 -1.43425100
+Fe ul
+2 1.00000000 0.00000000
+Co nelec 10
+Co S
+2 23.66000000 283.96056600
+2 10.61000000 47.15684590
+Co P
+2 25.04000000 182.21223600
+2 10.44000000 35.23335150
+Co D
+2 29.54000000 -26.47533270
+2 10.18000000 -1.82578723
+Co ul
+2 1.00000000 0.00000000
+Ni nelec 10
+Ni S
+2 26.74000000 317.68227200
+2 11.86000000 58.25539100
+Ni P
+2 28.80000000 252.47436600
+2 11.79000000 36.08150310
+Ni D
+2 33.70000000 -18.52295510
+2 11.66000000 -4.55766810
+Ni ul
+2 1.00000000 0.00000000
+Cu nelec 10
+Cu S
+2 30.11054300 355.75051200
+2 13.07631000 70.93090600
+Cu P
+2 32.69261400 77.96993100
+2 32.77033900 155.92744800
+2 13.75106700 18.02113200
+2 13.32216600 36.09437200
+Cu D
+2 38.99651100 -12.34341000
+2 39.53978800 -18.27336200
+2 12.28751100 -0.98470500
+2 11.45930000 -1.31874700
+Cu F
+2 6.19010200 -0.22726400
+2 8.11878000 -0.46877300
+Cu ul
+2 1.00000000 0.00000000
+Zn nelec 10
+Zn S
+2 34.17400100 399.98639900
+2 14.45637100 85.48975000
+Zn P
+2 39.88868300 92.38107700
+2 39.65501700 184.77117600
+2 15.29054600 23.00254100
+2 14.90352400 46.05742700
+Zn D
+2 43.70829600 -13.69073400
+2 43.69853600 -20.54398000
+2 15.15071800 -1.31615400
+2 15.28244100 -1.83871500
+Zn F
+2 8.16001400 -0.37036000
+2 12.22842200 -1.06294300
+Zn ul
+2 1.00000000 0.00000000
+Ga nelec 28
+Ga S
+2 5.21596000 203.85397200
+Ga P
+2 4.30890400 156.10339000
+Ga D
+2 0.49635700 1.03164700
+Ga F
+2 1.71517000 -10.67373500
+Ga ul
+2 1.00000000 0.00000000
+Ge nelec 28
+Ge S
+2 4.81540900 149.24657900
+Ge P
+2 4.16951500 132.84433500
+Ge D
+2 0.59195800 1.34615400
+Ge F
+2 1.79177000 -7.04422300
+Ge ul
+2 1.00000000 0.00000000
+As nelec 28
+As S
+2 3.61262500 53.96562000
+As P
+2 3.90792600 88.94908800
+As D
+2 1.92646700 22.42028800
+As F
+2 1.77343400 -4.70481500
+As ul
+2 1.00000000 0.00000000
+Se nelec 28
+Se S
+2 4.23705700 79.66334500
+Se P
+2 2.91033400 31.56099300
+Se D
+2 2.33570100 30.80461000
+Se F
+2 2.25463900 -6.54687500
+Se ul
+2 1.00000000 0.00000000
+Br nelec 28
+Br S
+2 5.02180000 61.51372100
+2 2.51090000 9.02149300
+Br P
+2 4.28140000 53.87586400
+2 2.14070000 4.62940200
+Br D
+2 2.88000000 20.84967700
+2 1.44000000 2.96544400
+Br F
+2 2.72070000 -8.16149300
+Br ul
+2 1.00000000 0.00000000
+Kr nelec 28
+Kr S
+2 5.87771800 73.91569390
+2 3.08462200 16.16825080
+Kr P
+2 5.16411000 58.51769101
+2 2.35830200 8.25910073
+Kr D
+2 3.21536200 33.45822776
+2 1.28500800 0.67725331
+Kr F
+2 4.08286900 -15.15869859
+2 1.19396000 -0.17408825
+Kr G
+2 3.18077500 -6.83315877
+Kr ul
+2 1.00000000 0.00000000
+Rb nelec 28
+Rb S
+2 2.29809999 50.81394196
+2 2.66269994 -162.04731750
+2 3.50929999 313.81082153
+2 4.96980000 -309.75451660
+2 6.94840002 216.07606506
+1 17.70389938 20.86063194
+0 25.66029930 3.36120105
+Rb P
+2 2.02160001 45.41232300
+2 2.33979988 -145.47238159
+2 3.07839990 283.18420410
+2 4.37570000 -305.10214233
+2 6.15859985 207.65396118
+1 16.77890015 12.15985012
+0 16.61680031 5.39989424
+Rb D
+2 1.23380005 31.68275070
+2 1.41939998 -100.62529755
+2 1.83389997 186.52160645
+2 2.54550004 -239.76072693
+2 3.47009993 170.19052124
+1 10.62069988 9.91743755
+0 9.28610039 7.41062880
+Rb ul
+2 1.96459997 -1.04400003
+2 5.02349997 -12.26854706
+2 12.31190014 -40.49360657
+2 39.43920136 -92.10794830
+1 116.43070221 -20.25083160
+Sr nelec 28
+Sr S
+2 2.44670010 53.58986664
+2 2.86780000 -172.08218384
+2 3.86610007 345.58593750
+2 5.66069984 -351.22171021
+2 8.30790043 257.34286499
+1 23.49519920 12.91709232
+0 21.03380013 6.33449411
+Sr P
+2 2.20950007 49.15122604
+2 2.58439994 -158.82582092
+2 3.46840000 320.08462524
+2 5.06430006 -349.10769653
+2 7.37960005 239.32991028
+1 22.22710037 11.86419868
+0 19.91810036 5.33859777
+Sr D
+2 1.40730000 32.67572403
+2 1.62419999 -103.22133636
+2 2.11750007 193.27650452
+2 2.97239995 -248.63302612
+2 4.11800003 183.39566040
+1 12.52390003 10.14631939
+0 10.88269997 7.38135815
+Sr ul
+2 2.23399997 -1.16187501
+2 5.67100000 -13.37399960
+2 13.88179970 -43.23659134
+2 44.81060028 -100.09903717
+1 132.90260315 -20.51813126
+Y nelec 28
+Y S
+2 7.48804900 135.15384400
+2 3.74402500 15.55244100
+Y P
+2 6.44537700 87.78499200
+2 3.22268900 11.56406600
+Y D
+2 4.65844700 29.70100100
+2 2.32922400 5.53996800
+Y F
+2 6.58421200 -19.12219800
+2 3.29210600 -2.43637500
+Y ul
+2 1.00000000 0.00000000
+Zr nelec 28
+Zr S
+2 8.20000000 150.26759100
+2 4.08972800 18.97621600
+Zr P
+2 7.11000000 99.62212400
+2 3.59679800 14.16873300
+Zr D
+2 5.35000000 35.04512400
+2 2.49182100 6.11125900
+Zr F
+2 7.54000000 -21.09377600
+2 3.77000000 -3.08069400
+Zr ul
+2 1.00000000 0.00000000
+Nb nelec 28
+Nb S
+2 8.90000000 165.17914300
+2 4.43000000 21.99297400
+Nb P
+2 7.77000000 111.79441400
+2 3.96000000 16.63348300
+Nb D
+2 6.05000000 38.11224900
+2 2.84000000 8.03916700
+Nb F
+2 8.49000000 -22.92955000
+2 4.25000000 -3.66631000
+Nb ul
+2 1.00000000 0.00000000
+Mo nelec 28
+Mo S
+2 9.71459400 180.10310800
+2 4.68050000 24.99722800
+Mo P
+2 8.14213700 123.77275200
+2 4.62598600 19.53022800
+Mo D
+2 6.61841500 48.37502200
+2 3.24875200 8.89205300
+Mo F
+2 9.45000000 -24.80517700
+2 4.72000000 -4.15378200
+Mo ul
+2 1.00000000 0.00000000
+Tc nelec 28
+Tc S
+2 10.42234600 195.15916600
+2 5.03651600 28.09260300
+Tc P
+2 8.95044900 135.28456600
+2 4.85443900 21.80650400
+Tc D
+2 6.94569700 54.32972900
+2 3.97058500 11.15506800
+Tc F
+2 10.40000000 -26.56244700
+2 5.20000000 -4.58568100
+Tc ul
+2 1.00000000 0.00000000
+Ru nelec 28
+Ru S
+2 11.10526900 209.82297100
+2 5.41474500 30.65472600
+Ru P
+2 9.77127100 146.33618200
+2 5.07399100 24.12787700
+Ru D
+2 7.67142300 67.51589700
+2 4.13656500 9.87010400
+Ru F
+2 11.36000000 -28.34061600
+2 5.68000000 -4.94462900
+Ru ul
+2 1.00000000 0.00000000
+Rh nelec 28
+Rh S
+2 11.72000000 225.34775400
+2 5.82000000 32.82318900
+Rh P
+2 10.42000000 158.70941200
+2 5.45000000 26.44410000
+Rh D
+2 8.82000000 62.75862600
+2 3.87000000 10.97871900
+Rh F
+2 12.31000000 -30.09345600
+2 6.16000000 -5.21848200
+Rh ul
+2 1.00000000 0.00000000
+Pd nelec 28
+Pd S
+2 12.43000000 240.22904000
+2 6.17075900 35.17194300
+Pd P
+2 11.08000000 170.41727600
+2 5.82955400 28.47213300
+Pd D
+2 9.51000000 69.01384500
+2 4.13978100 11.75086200
+Pd F
+2 13.27000000 -31.92955400
+2 6.63000000 -5.39821700
+Pd ul
+2 1.00000000 0.00000000
+Ag nelec 28
+Ag S
+2 13.13000000 255.13936500
+2 6.51000000 36.86612200
+Ag P
+2 11.74000000 182.18186900
+2 6.20000000 30.35775100
+Ag D
+2 10.21000000 73.71926100
+2 4.38000000 12.50211700
+Ag F
+2 14.22000000 -33.68992000
+2 7.11000000 -5.53112000
+Ag ul
+2 1.00000000 0.00000000
+Cd nelec 28
+Cd S
+2 13.83586900 270.00948300
+2 6.85727000 38.76730800
+Cd P
+2 12.40497100 193.82962900
+2 6.56779900 31.89652500
+Cd D
+2 10.89692500 79.19364700
+2 4.64116500 13.23082700
+Cd F
+2 15.18479600 -35.47662600
+2 7.59239800 -5.61767700
+Cd ul
+2 1.00000000 0.00000000
+In nelec 46
+In S
+2 1.43509100 29.16521900
+2 0.69580500 -4.19080600
+In P
+2 1.44083200 36.99054200
+2 0.70139200 -3.36582000
+In D
+2 0.96123600 20.00053100
+In F
+2 0.88436900 -6.01909200
+In ul
+2 1.00000000 0.00000000
+Sn nelec 46
+Sn S
+2 1.96972500 67.92534700
+2 0.97237500 -7.47854600
+Sn P
+2 1.99921000 56.60288000
+2 0.99904200 -2.16177600
+Sn D
+2 0.50036100 2.57633600
+Sn F
+2 1.23088000 -10.10925300
+Sn ul
+2 1.00000000 0.00000000
+Sb nelec 46
+Sb S
+2 2.49109100 68.42793800
+2 1.34157500 -4.39863100
+Sb P
+2 2.14386400 63.96546900
+2 0.58550300 -0.57872600
+Sb D
+2 0.79540100 7.80366100
+Sb F
+2 1.60925100 -14.51768700
+Sb ul
+2 1.00000000 0.00000000
+Te nelec 46
+Te S
+2 2.92379400 50.08380500
+2 1.15275400 1.96814000
+Te P
+2 2.60308600 119.82070200
+2 0.98544800 -2.03904800
+Te D
+2 1.43501900 37.75721400
+Te F
+2 1.93927000 -17.86464100
+Te ul
+2 1.00000000 0.00000000
+I nelec 46
+I S
+2 3.51120000 83.11386300
+2 1.75560000 5.20187600
+I P
+2 2.96880000 82.81110900
+2 1.48440000 3.37968200
+I D
+2 1.90660000 10.30427700
+2 0.95330000 7.58803200
+I F
+2 2.30750000 -21.47793600
+I ul
+2 1.00000000 0.00000000
+Xe nelec 46
+Xe S
+2 3.94026300 122.76382934
+2 2.27726400 8.30885115
+Xe P
+2 3.02837300 68.82300437
+2 1.39431900 3.64674223
+Xe D
+2 2.12260500 23.65207854
+2 0.79866900 3.25844113
+Xe F
+2 6.16436000 -47.70319876
+2 1.54237400 -6.54113991
+Xe G
+2 1.84789200 -7.10585060
+Xe ul
+2 1.00000000 0.00000000
+Cs nelec 46
+Cs S
+2 1.38530004 42.85466766
+2 1.63240004 -138.00901794
+2 2.20580006 275.99960327
+2 3.22149992 -280.45663452
+2 4.64960003 199.82038879
+1 15.15250015 27.73096657
+0 19.00049973 3.76870608
+Cs P
+2 1.25950003 48.66250992
+2 1.44169998 -145.70526123
+2 1.87639999 264.46368408
+2 2.65750003 -279.85159302
+2 3.63870001 184.35585022
+1 10.65320015 23.30001831
+0 14.68060017 5.76792908
+Cs D
+2 0.76810002 34.86072540
+2 0.87459999 -106.79302979
+2 1.10769999 188.23532104
+2 1.48230004 -217.63992310
+2 1.92019999 137.74559021
+1 6.21829987 34.42418671
+0 17.38809967 7.19875193
+Cs ul
+2 0.93849999 -0.78916699
+2 2.31629992 -8.42115784
+2 6.00729990 -30.98544312
+2 20.37969971 -95.03477478
+1 59.32889938 -30.07960320
+Ba nelec 46
+Ba S
+2 1.51549995 52.18550110
+2 1.80079997 -166.64633179
+2 2.46210003 336.98910522
+2 3.64910007 -346.60510254
+2 5.34859991 229.66429138
+1 17.15789986 20.49417496
+0 16.02389908 6.64949989
+Ba P
+2 1.35119998 61.51347351
+2 1.55149996 -171.17402649
+2 2.06870008 303.61636353
+2 3.00740004 -324.12673950
+2 4.23050022 210.71342468
+1 14.21850014 19.11876488
+0 13.10439968 5.84502220
+Ba D
+2 0.89740002 34.92659378
+2 1.02090001 -109.23178864
+2 1.28859997 196.23254395
+2 1.73850000 -224.63766479
+2 2.28509998 146.97143555
+1 7.33769989 37.01747894
+0 20.39410019 7.09744883
+Ba ul
+2 0.97610003 -0.88013703
+2 2.66910005 -10.01861763
+2 7.10550022 -35.70346451
+2 24.84989929 -114.57715607
+1 75.09850311 -30.99500656
+La nelec 46
+La S
+2 3.30990000 91.93217700
+2 1.65500000 -3.78876400
+La P
+2 2.83680000 63.75948600
+2 1.41840000 -0.64795800
+La D
+2 2.02130000 36.11617300
+2 1.01070000 0.21911400
+La F
+2 4.02860000 -36.01001600
+La ul
+2 1.00000000 0.00000000
+Ce nelec 46
+Ce S
+2 1.89370130 -255.56238300
+2 1.97914860 307.31392800
+0 10.74296970 10.66990170
+Ce P
+0 7.75592980 12.22921090
+2 1.81564130 124.94246600
+2 1.67164720 -84.59998680
+Ce D
+2 1.70642050 24.94467550
+0 6.48933740 10.28614400
+Ce ul
+1 9.20747930 -15.34875610
+1 1.86730120 -5.84323950
+Pr nelec 46
+Pr S
+2 2.12955580 -223.64398200
+2 2.22746550 278.13451500
+0 7.28371960 12.62107180
+Pr P
+0 7.80928040 12.52563900
+2 1.92977860 121.95278200
+2 1.76478920 -79.40475120
+Pr D
+2 1.79797430 26.19266520
+0 6.54805150 9.87391210
+Pr ul
+1 9.82972860 -15.44352190
+1 1.98070080 -5.89611280
+Nd nelec 46
+Nd S
+2 2.22478560 -219.47084000
+2 2.34459590 280.52892800
+0 11.85798300 11.76232500
+Nd P
+0 9.44306450 11.43612170
+2 1.98949080 120.33535600
+2 1.79805450 -75.78196430
+Nd D
+2 1.88855050 27.38307130
+0 6.64366550 9.61741050
+Nd ul
+1 10.42978840 -15.48300440
+1 2.09229130 -5.94833370
+Pm nelec 46
+Pm S
+2 2.27365310 -215.58178700
+2 2.40502950 277.53404200
+0 10.76473100 11.55980120
+Pm P
+0 9.85397700 10.33666460
+2 2.07626890 121.46769300
+2 1.86493490 -74.87664400
+Pm D
+2 1.97798220 28.48611040
+0 6.67329420 9.28090580
+Pm ul
+1 11.10670840 -15.58675570
+1 2.21044790 -6.01203460
+Sm nelec 46
+Sm S
+2 2.37776610 -206.06726600
+2 2.52471590 270.34259800
+0 10.26438980 11.44490440
+Sm P
+0 10.00542360 10.69510070
+2 2.20124120 121.87722800
+2 1.96809280 -72.78839030
+Sm D
+2 2.06720440 29.52394000
+0 6.71288900 9.03717700
+Sm ul
+1 11.75812390 -15.65862010
+1 2.32808300 -6.05376500
+Eu nelec 46
+Eu S
+2 2.50382840 -196.63773800
+2 2.66926410 264.10344900
+0 10.36738240 11.80698020
+Eu P
+0 10.13418310 10.93806010
+2 2.32439940 128.16026900
+2 2.08337490 -76.51532760
+Eu D
+2 2.15438940 30.46732640
+0 6.69379730 8.74122690
+Eu ul
+1 12.41835390 -15.72447990
+1 2.44755850 -6.09107370
+Gd nelec 46
+Gd S
+2 2.70226520 -137.90212500
+2 2.93347740 208.96454000
+0 9.85080960 16.85869360
+Gd P
+0 10.25620120 11.13083430
+2 2.45273210 131.65132100
+2 2.19693430 -77.44430080
+Gd D
+2 2.24716920 31.46641180
+0 6.43064370 8.34507290
+Gd ul
+1 13.09509080 -15.78672600
+1 2.56973860 -6.12777810
+Tb nelec 46
+Tb S
+2 2.61817350 -140.10322400
+2 2.89041800 215.28993500
+0 17.04897130 11.68123370
+Tb P
+0 10.12533140 11.12162130
+2 2.56570380 139.65713100
+2 2.30834840 -83.30982210
+Tb D
+2 2.32603330 32.12176790
+0 6.59311800 8.22336260
+Tb ul
+1 13.78498800 -15.84080810
+1 2.69386690 -6.16678470
+Dy nelec 46
+Dy S
+2 2.73908100 -130.00434800
+2 3.04934710 208.63602400
+0 16.57473300 11.48546660
+Dy P
+0 10.69436010 11.62238130
+2 2.74055830 125.41578200
+2 2.41123460 -65.82038280
+Dy D
+2 2.40961370 32.79772690
+0 6.49588250 7.98023140
+Dy ul
+1 14.49058170 -15.89723700
+1 2.82105130 -6.19939400
+Ho nelec 46
+Ho S
+2 2.84745680 -110.58301100
+2 3.22421600 192.70173300
+0 16.15618810 11.35468650
+Ho P
+0 9.92608330 10.84274330
+2 2.86230720 116.52006000
+2 2.47244920 -55.66109800
+Ho D
+2 2.49115510 33.34249060
+0 6.35100910 7.73004920
+Ho ul
+1 15.21414080 -15.95165880
+1 2.95124920 -6.23162570
+Er nelec 46
+Er S
+2 3.03618530 -150.36700200
+2 3.35253480 236.19775600
+0 16.03924000 11.41646110
+Er P
+0 11.04634630 11.89253580
+2 3.01799110 130.54350100
+2 2.64727180 -65.69229880
+Er D
+2 2.57008760 33.73180760
+0 6.16564070 7.48868910
+Er ul
+1 15.95965310 -16.00435820
+1 3.08505820 -6.26509470
+Tm nelec 46
+Tm S
+2 3.18338300 -143.24595300
+2 3.53531470 233.19553800
+0 16.16013920 11.61094440
+Tm P
+0 2.46378240 7.39619030
+2 2.72194360 119.57242900
+2 2.49552880 -84.01744530
+Tm D
+2 2.64361640 33.84116700
+0 5.89426350 7.23350090
+Tm ul
+1 16.72435540 -16.05536170
+1 3.22216200 -6.29832940
+Yb nelec 46
+Yb S
+2 3.29988440 -106.50639900
+2 3.77177090 201.14955200
+0 16.63449170 12.01001670
+Yb P
+0 2.65544200 7.59659140
+2 2.83854580 119.64934700
+2 2.59880230 -82.70469640
+Yb D
+2 2.71026500 34.17132690
+0 6.26424190 7.27486230
+Yb ul
+1 17.51231900 -16.10497510
+1 3.36313060 -6.33277960
+Lu nelec 46
+Lu S
+2 3.34224800 -71.31717050
+2 4.00509410 169.40139300
+0 16.18095210 11.88611540
+Lu P
+0 2.72110670 7.71934060
+2 2.98200450 116.10736300
+2 2.71480080 -78.17862840
+Lu D
+2 2.77571170 33.91192520
+0 5.89380130 7.01871010
+Lu ul
+1 18.32043650 -16.15200380
+1 3.50797850 -6.36766820
+Hf nelec 60
+Hf S
+2 14.76995900 1499.28471100
+2 7.38497900 40.28210100
+Hf P
+2 9.84949000 397.73300500
+2 4.92474500 19.31640600
+Hf D
+2 6.09675600 101.32980500
+2 3.04837800 5.87343800
+Hf F
+2 1.78577000 10.04672300
+Hf G
+2 2.63240000 -9.55824400
+Hf ul
+2 1.00000000 0.00000000
+Ta nelec 60
+Ta S
+2 14.54640800 1345.88064700
+2 7.27320400 36.76680600
+Ta P
+2 9.93556500 378.42530100
+2 4.96778200 22.29309100
+Ta D
+2 6.34737700 104.88395600
+2 3.17368800 8.75584800
+Ta F
+2 2.01788100 12.01796100
+Ta G
+2 3.04033000 -11.72893300
+Ta ul
+2 1.00000000 0.00000000
+W nelec 60
+W S
+2 14.32285600 1192.39588200
+2 7.16142800 32.52293300
+W P
+2 10.02164100 359.03196700
+2 5.01082000 24.03038000
+W D
+2 6.59799700 108.30134900
+2 3.29899900 10.98252800
+W F
+2 2.25888800 14.15257900
+W G
+2 3.46411000 -14.05643500
+W ul
+2 1.00000000 0.00000000
+Re nelec 60
+Re S
+2 14.09930500 1038.95157200
+2 7.04965300 29.56173800
+Re P
+2 10.10771700 339.54351000
+2 5.05385800 24.91369600
+Re D
+2 6.84861800 111.69965300
+2 3.42430900 12.62432900
+Re F
+2 2.50865100 16.44985200
+Re G
+2 3.90124500 -16.50112000
+Re ul
+2 1.00000000 0.00000000
+Os nelec 60
+Os S
+2 13.87575400 885.40571900
+2 6.93787700 25.96704000
+Os P
+2 10.19379300 320.08390200
+2 5.09689600 26.14876500
+Os D
+2 7.09923800 115.04484300
+2 3.54961900 13.62257500
+Os F
+2 2.76707500 18.90945700
+Os G
+2 4.34990500 -19.02759500
+Os ul
+2 1.00000000 0.00000000
+Ir nelec 60
+Ir S
+2 13.65220300 732.26920000
+2 6.82610100 26.48472100
+Ir P
+2 10.27986800 299.48947400
+2 5.13993400 26.46623400
+Ir D
+2 7.34985900 124.45759500
+2 3.67492900 14.03599500
+Ir F
+2 3.03407200 21.53103100
+Ir G
+2 4.80885700 -21.60759700
+Ir ul
+2 1.00000000 0.00000000
+Pt nelec 60
+Pt S
+2 13.42865100 579.22386100
+2 6.71432600 29.66949100
+Pt P
+2 10.36594400 280.86077400
+2 5.18297200 26.74538200
+Pt D
+2 7.60047900 120.39644400
+2 3.80024000 15.81092100
+Pt F
+2 3.30956900 24.31437600
+Pt G
+2 5.27728900 -24.21867500
+Pt ul
+2 1.00000000 0.00000000
+Au nelec 60
+Au S
+2 13.20510000 426.84667900
+2 6.60255000 37.00708300
+Au P
+2 10.45202000 261.19958000
+2 5.22601000 26.96249600
+Au D
+2 7.85110000 124.79066600
+2 3.92555000 16.30072600
+Au F
+2 4.78982000 30.49008900
+2 2.39491000 5.17107400
+Au ul
+2 1.00000000 0.00000000
+Hg nelec 60
+Hg S
+2 12.98154900 275.73721200
+2 6.49077400 49.08921200
+Hg P
+2 10.53809600 241.54007400
+2 5.26904800 27.39659100
+Hg D
+2 8.10172100 127.86700800
+2 4.05086000 16.60831200
+Hg F
+2 3.88579100 30.36499600
+Hg G
+2 6.24095500 -29.47311800
+Hg ul
+2 1.00000000 0.00000000
+Tl nelec 78
+Tl S
+2 0.32623800 -1.01649800
+2 1.97754100 51.70795900
+2 10.00000000 73.18668300
+Tl P
+2 0.54306300 -2.96267300
+2 1.03214000 19.73043100
+Tl D
+2 0.35481700 2.77269000
+2 0.70963300 -3.97943900
+Tl F
+2 0.68915600 -4.42678600
+Tl G
+2 0.82061700 -12.27054000
+Tl ul
+2 1.00000000 0.00000000
+Pb nelec 78
+Pb S
+2 0.52916100 -1.87334200
+2 1.45672700 20.86079700
+2 9.99991100 97.58795500
+Pb P
+2 0.67811900 -7.76820900
+2 1.24901300 51.71925400
+Pb D
+2 0.30744600 1.30076000
+2 0.74493000 2.64082200
+Pb F
+2 0.84869900 -5.70605600
+Pb G
+2 0.99994100 -7.48418400
+Pb ul
+2 1.00000000 0.00000000
+Bi nelec 78
+Bi S
+2 0.16115200 -0.16198800
+2 1.50983500 14.03169000
+2 10.00000000 122.04740100
+Bi P
+2 0.76049000 -6.18852600
+2 1.42641500 51.04586800
+Bi D
+2 0.78022600 20.53580400
+2 0.26007500 -0.13619600
+Bi F
+2 0.97360800 -6.41422600
+Bi G
+2 1.08819500 -6.65606400
+Bi ul
+2 1.00000000 0.00000000
+Po nelec 78
+Po S
+2 0.92238600 -4.15930400
+2 1.78191500 33.83035400
+2 10.00000000 146.33910100
+Po P
+2 0.72429100 -4.12531100
+2 1.36386000 35.00707800
+Po D
+2 0.47697900 1.20651800
+2 0.95395700 13.35612500
+Po F
+2 1.07545400 -6.77517400
+Po G
+2 1.12209600 -5.51544100
+Po ul
+2 1.00000000 0.00000000
+At nelec 78
+At S
+2 0.92238600 -5.52846100
+2 1.78191500 39.56886900
+2 10.00000000 170.71138600
+At P
+2 0.72429100 -2.29538700
+2 1.36386000 25.49292000
+At D
+2 0.63597200 4.86510700
+2 1.27194300 14.57941300
+At F
+2 1.15410800 -6.85786700
+At G
+2 1.34511500 -7.61303900
+At ul
+2 1.00000000 0.00000000
+Rn nelec 78
+Rn S
+2 0.92238600 -5.01900500
+2 1.78191500 37.03679000
+2 10.80460100 195.10330800
+Rn P
+2 0.72429100 -1.96648100
+2 1.36386000 23.46405900
+Rn D
+2 0.76940000 7.48345700
+2 1.53880000 9.36190000
+Rn F
+2 1.21389700 -6.76315000
+Rn G
+2 1.57646900 -9.91566200
+Rn ul
+2 1.00000000 0.00000000
+END
diff --git a/gpu4pyscf/drivers/h2o.xyz b/gpu4pyscf/drivers/h2o.xyz
index 8c50538d..6072e217 100644
--- a/gpu4pyscf/drivers/h2o.xyz
+++ b/gpu4pyscf/drivers/h2o.xyz
@@ -1,5 +1,5 @@
3
-O 99.814000000 100.835000000 101.232000000
-H 99.329200000 99.976800000 101.063000000
-H 99.151600000 101.561000000 101.414000000
+O 0.0000000000 -0.0000000000 0.1174000000
+H -0.7570000000 -0.0000000000 -0.4696000000
+H 0.7570000000 0.0000000000 -0.4696000000
diff --git a/gpu4pyscf/drivers/opt_3c_driver.py b/gpu4pyscf/drivers/opt_3c_driver.py
new file mode 100644
index 00000000..20a97aa7
--- /dev/null
+++ b/gpu4pyscf/drivers/opt_3c_driver.py
@@ -0,0 +1,183 @@
+# Copyright 2021-2024 The PySCF Developers. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+###################################################################
+# This is a customized driver for three composite methods only
+# It only works for b97-3c, r2scan-3c, and wb97x-3c
+###################################################################
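+# A minimal invocation sketch (assuming a config file such as
+# opt_b973c_sample.json and the .xyz files it references exist in input_dir):
+#     python3 opt_3c_driver.py --config opt_b973c_sample.json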
+
+import os
+import json
+import pyscf
+import argparse
+import tempfile
+import shutil
+import h5py
+from types import MethodType
+from pyscf import lib, gto
+from pyscf import dft, scf
+from pyscf.geomopt.geometric_solver import kernel
+
+from gpu4pyscf.drivers.dft_3c_driver import (
+ parse_3c, gen_disp_fun, gen_disp_grad_fun)
+
+def opt_mol(mol_name, config, constraints, charge=None, spin=0):
+ xc = config.get('xc', 'b3lyp')
+ verbose = config.get('verbose', 4)
+ scf_conv_tol = config.get('scf_conv_tol', 1e-10)
+ with_df = config.get('with_df', True)
+ auxbasis = config.get('auxbasis', None)
+ with_gpu = config.get('with_gpu', True)
+ with_solvent = config.get('with_solvent', False)
+ maxsteps = config.get('maxsteps', 50)
+ convergence_set = config.get('convergence_set', 'GAU')
+
+ default_solvent = {'method': 'iefpcm', 'eps': 78.3553, 'solvent': 'water'}
+ solvent = config.get('solvent', default_solvent)
+
+ # I/O
+ fp = tempfile.TemporaryDirectory()
+ local_dir = f'{fp.name}/'
+ logfile = f'{mol_name[:-4]}_pyscf.log'
+
+ shutil.copyfile(config['input_dir']+mol_name, local_dir+mol_name)
+ if constraints is not None:
+ shutil.copyfile(config['input_dir']+constraints, local_dir+constraints)
+
+ pyscf_xc, nlc, basis, ecp, (xc_disp, disp), xc_gcp = parse_3c(xc)
+
+ lib.num_threads(8)
+ mol = pyscf.M(
+ atom=local_dir+mol_name,
+ basis=basis,
+ ecp=ecp,
+ max_memory=32000,
+ verbose=verbose,
+ charge=charge,
+ spin=spin,
+ output=f'{local_dir}/{logfile}')
+ mol.build()
+
+ mf = dft.KS(mol, xc=pyscf_xc)
+ mf.grids.atom_grid = (99,590)
+ if mf._numint.libxc.is_nlc(mf.xc):
+ mf.nlcgrids.atom_grid = (50,194)
+ mf.disp = disp
+ if with_df:
+ pyscf_auxbasis = auxbasis
+ if auxbasis == "RIJK-def2-tzvp":
+ pyscf_auxbasis = 'def2-tzvp-jkfit'
+ mf = mf.density_fit(auxbasis=pyscf_auxbasis)
+ if with_gpu:
+ mf = mf.to_gpu()
+
+ #### Changes for 3C methods #####
+ # Setup dispersion correction and GCP
+ mf.nlc = nlc
+ mf.get_dispersion = MethodType(gen_disp_fun(xc_disp, xc_gcp), mf)
+ mf.do_disp = lambda: True
+ #################################
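+ # MethodType binds the generated dispersion/gCP functions to this particular
+ # mf instance, so mf.kernel() picks up the 3c corrections without subclassing.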
+
+ mf.chkfile = None
+
+ if with_solvent:
+ if solvent['method'].endswith(('PCM', 'pcm')):
+ mf = mf.PCM()
+ mf.with_solvent.lebedev_order = 29
+ mf.with_solvent.method = solvent['method'].replace('PCM','-PCM')
+ mf.with_solvent.eps = solvent['eps']
+ elif solvent['method'].endswith(('smd', 'SMD')):
+ mf = mf.SMD()
+ mf.with_solvent.lebedev_order = 29
+ mf.with_solvent.method = 'SMD'
+ mf.with_solvent.solvent = solvent['solvent']
+ else:
+ raise NotImplementedError
+
+ mf.direct_scf_tol = 1e-14
+ mf.chkfile = None
+ mf.conv_tol = scf_conv_tol
+
+ history = []
+ def callback(envs):
+ result = {
+ 'energy': envs['energy'],
+ 'gradients': envs['gradients'],
+ 'coords': envs['coords'].tolist(),
+ 'e1': mf.scf_summary.get('e1', 0.0),
+ 'e_coul': mf.scf_summary.get('coul', 0.0),
+ 'e_xc': mf.scf_summary.get('exc', 0.0),
+ 'e_disp': mf.scf_summary.get('dispersion', 0.0)
+ }
+ history.append(result)
+
+ grad_scanner = mf.nuc_grad_method().as_scanner()
+ get_disp = gen_disp_grad_fun(xc_disp, xc_gcp)
+ grad_scanner.get_dispersion = MethodType(get_disp, grad_scanner)
+
+ geometric_log = f'{mol_name[:-4]}_geometric.log'
+ import sys
+ # geomeTRIC writes its optimization log to sys.stderr; redirect it to a file
+ with open(f'{local_dir}/{geometric_log}', 'w') as log_file:
+ sys.stderr = log_file
+ conv, mol_eq = kernel(
+ grad_scanner,
+ maxsteps=maxsteps,
+ callback=callback,
+ convergence_set=convergence_set,
+ constraints=constraints)
+ sys.stderr = sys.__stderr__
+
+ # copy the files to destination folder
+ output_dir = config['output_dir']
+ os.makedirs(output_dir, exist_ok=True)
+ optimized_xyz = f'{mol_name[:-4]}_opt.xyz'
+ hist_file = f'{mol_name[:-4]}_hist.h5'
+ mol_eq.tofile(f'{local_dir}/{optimized_xyz}', format='xyz')
+
+ with h5py.File(f'{local_dir}/{hist_file}', 'w') as h5f:
+ for step, info in enumerate(history):
+ group = h5f.create_group(f'step_{step}')
+ for key, array in info.items():
+ group.create_dataset(key, data=array)
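+ # The saved history can be read back with h5py, e.g. (illustrative):
+ #     with h5py.File(hist_file) as f:
+ #         energies = [f[step]['energy'][()] for step in f]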
+
+ shutil.copyfile(f'{local_dir}/{optimized_xyz}', f'{output_dir}/{optimized_xyz}')
+ shutil.copyfile(f'{local_dir}/{hist_file}', f'{output_dir}/{hist_file}')
+ shutil.copyfile(f'{local_dir}/{logfile}', f'{output_dir}/{logfile}')
+ shutil.copyfile(f'{local_dir}/{geometric_log}', f'{output_dir}/{geometric_log}')
+ if conv:
+ with open(f'{output_dir}/{mol_name[:-4]}_success.txt', 'w') as file:
+ file.write("Geometry optimization converged\n")
+
+if __name__ == '__main__':
+ parser = argparse.ArgumentParser(description='Run DFT with GPU4PySCF for molecules')
+ parser.add_argument("--config", type=str, default='example.json')
+ parser.add_argument("--charge", type=int, default=None)
+ parser.add_argument("--spin", type=int, default=0)
+ args = parser.parse_args()
+
+ with open(args.config) as f:
+ config = json.load(f)
+ if isinstance(config, list):
+ config = config[0]
+ for i, mol_name in enumerate(config['molecules']):
+ constraints = None
+ if 'constraints' in config and config['constraints']:
+ assert len(config['constraints']) == len(config['molecules'])
+ constraints = config['constraints'][i]
+ opt_mol(mol_name, config, constraints, charge=args.charge, spin=args.spin)
diff --git a/gpu4pyscf/drivers/opt_b973c_sample.json b/gpu4pyscf/drivers/opt_b973c_sample.json
new file mode 100644
index 00000000..c1e12ef3
--- /dev/null
+++ b/gpu4pyscf/drivers/opt_b973c_sample.json
@@ -0,0 +1,22 @@
+[{
+ "input_dir": "./",
+ "output_dir": "./",
+ "molecules": [
+ "ethane.xyz",
+ "ethane.xyz"
+ ],
+ "constraints": [
+ "constraints.txt",
+ "constraints.txt"
+ ],
+ "xc": "b973c",
+ "auxbasis": "def2-universal-JFIT",
+ "verbose": 4,
+ "with_solvent": false,
+ "solvent": {
+ "eps": 78.3553,
+ "method": "CPCM"
+ },
+ "with_gpu": true,
+ "with_df": true
+}]
diff --git a/gpu4pyscf/drivers/opt_r2scan3c_sample.json b/gpu4pyscf/drivers/opt_r2scan3c_sample.json
new file mode 100644
index 00000000..d793f65a
--- /dev/null
+++ b/gpu4pyscf/drivers/opt_r2scan3c_sample.json
@@ -0,0 +1,22 @@
+[{
+ "input_dir": "./",
+ "output_dir": "./",
+ "molecules": [
+ "ethane.xyz",
+ "ethane.xyz"
+ ],
+ "constraints": [
+ "constraints.txt",
+ "constraints.txt"
+ ],
+ "xc": "r2scan3c",
+ "auxbasis": "def2-universal-JFIT",
+ "verbose": 4,
+ "with_solvent": false,
+ "solvent": {
+ "eps": 78.3553,
+ "method": "CPCM"
+ },
+ "with_gpu": true,
+ "with_df": true
+}]
diff --git a/gpu4pyscf/drivers/opt_wb97x3c_sample.json b/gpu4pyscf/drivers/opt_wb97x3c_sample.json
new file mode 100644
index 00000000..4f4ecb5b
--- /dev/null
+++ b/gpu4pyscf/drivers/opt_wb97x3c_sample.json
@@ -0,0 +1,21 @@
+[{
+ "input_dir": "./",
+ "output_dir": "./",
+ "molecules": [
+ "ethane.xyz",
+ "ethane.xyz"
+ ],
+ "constraints": [
+ "constraints.txt",
+ "constraints.txt"
+ ],
+ "xc": "wb97x3c",
+ "verbose": 4,
+ "with_solvent": false,
+ "solvent": {
+ "eps": 78.3553,
+ "method": "CPCM"
+ },
+ "with_gpu": true,
+ "with_df": true
+}]
diff --git a/gpu4pyscf/grad/rhf.py b/gpu4pyscf/grad/rhf.py
index dd374cc3..94399043 100644
--- a/gpu4pyscf/grad/rhf.py
+++ b/gpu4pyscf/grad/rhf.py
@@ -27,7 +27,7 @@
from gpu4pyscf.scf.hf import KohnShamDFT
from gpu4pyscf.lib.cupy_helper import tag_array, contract, condense, sandwich_dot, reduce_to_device
from gpu4pyscf.__config__ import props as gpu_specs
-from gpu4pyscf.__config__ import _streams, _num_devices
+from gpu4pyscf.__config__ import _streams, num_devices
from gpu4pyscf.df import int3c2e #TODO: move int3c2e to out of df
from gpu4pyscf.lib import logger
from gpu4pyscf.scf import jk
@@ -127,7 +127,7 @@ def _jk_energy_per_atom(mol, dm, vhfopt=None,
if vhfopt is None:
# Small group size for load balance
group_size = None
- if _num_devices > 1:
+ if num_devices > 1:
group_size = jk.GROUP_SIZE
vhfopt = _VHFOpt(mol).build(group_size=group_size)
@@ -156,13 +156,13 @@ def _jk_energy_per_atom(mol, dm, vhfopt=None,
tasks.append((i,j,k,l))
tasks = np.array(tasks)
task_list = []
- for device_id in range(_num_devices):
- task_list.append(tasks[device_id::_num_devices])
+ for device_id in range(num_devices):
+ task_list.append(tasks[device_id::num_devices])
cp.cuda.get_current_stream().synchronize()
futures = []
- with ThreadPoolExecutor(max_workers=_num_devices) as executor:
- for device_id in range(_num_devices):
+ with ThreadPoolExecutor(max_workers=num_devices) as executor:
+ for device_id in range(num_devices):
future = executor.submit(
_ejk_ip1_task,
mol, dms, vhfopt, task_list[device_id],
diff --git a/gpu4pyscf/grad/rks.py b/gpu4pyscf/grad/rks.py
index e0d535a4..25f43ef1 100644
--- a/gpu4pyscf/grad/rks.py
+++ b/gpu4pyscf/grad/rks.py
@@ -28,7 +28,7 @@
from gpu4pyscf.lib.cupy_helper import (
contract, get_avail_mem, add_sparse, tag_array, sandwich_dot, reduce_to_device)
from gpu4pyscf.lib import logger
-from gpu4pyscf.__config__ import _streams, _num_devices
+from gpu4pyscf.__config__ import _streams, num_devices
from pyscf import __config__
MIN_BLK_SIZE = getattr(__config__, 'min_grid_blksize', 128*128)
@@ -223,8 +223,8 @@ def get_vxc(ni, mol, grids, xc_code, dms, relativity=0, hermi=1,
futures = []
cupy.cuda.get_current_stream().synchronize()
- with ThreadPoolExecutor(max_workers=_num_devices) as executor:
- for device_id in range(_num_devices):
+ with ThreadPoolExecutor(max_workers=num_devices) as executor:
+ for device_id in range(num_devices):
future = executor.submit(
_get_vxc_task,
ni, mol, grids, xc_code, dms, mo_coeff, mo_occ,
diff --git a/gpu4pyscf/grad/uks.py b/gpu4pyscf/grad/uks.py
index 90582d73..50e8fd05 100644
--- a/gpu4pyscf/grad/uks.py
+++ b/gpu4pyscf/grad/uks.py
@@ -29,7 +29,7 @@
from gpu4pyscf.lib.cupy_helper import (
contract, get_avail_mem, add_sparse, tag_array, reduce_to_device)
from gpu4pyscf.lib import logger
-from gpu4pyscf.__config__ import _streams, _num_devices
+from gpu4pyscf.__config__ import _streams, num_devices
from gpu4pyscf import __config__
MIN_BLK_SIZE = getattr(__config__, 'min_grid_blksize', 128*128)
@@ -230,8 +230,8 @@ def get_vxc(ni, mol, grids, xc_code, dms, relativity=0, hermi=1,
futures = []
cupy.cuda.get_current_stream().synchronize()
- with ThreadPoolExecutor(max_workers=_num_devices) as executor:
- for device_id in range(_num_devices):
+ with ThreadPoolExecutor(max_workers=num_devices) as executor:
+ for device_id in range(num_devices):
future = executor.submit(
_get_vxc_task,
ni, mol, grids, xc_code, dms, mo_coeff, mo_occ,
diff --git a/gpu4pyscf/gto/int3c1e.py b/gpu4pyscf/gto/int3c1e.py
index 8e6ce88c..e445c458 100644
--- a/gpu4pyscf/gto/int3c1e.py
+++ b/gpu4pyscf/gto/int3c1e.py
@@ -24,7 +24,7 @@
from gpu4pyscf.scf.int4c2e import BasisProdCache
from gpu4pyscf.df.int3c2e import sort_mol, _split_l_ctr_groups, get_pairing
from gpu4pyscf.gto.mole import basis_seg_contraction
-from gpu4pyscf.__config__ import _num_devices, _streams
+from gpu4pyscf.__config__ import num_devices, _streams
BLKSIZE = 128
@@ -132,7 +132,7 @@ def get_n_hermite_density_of_angular_pair(l):
self.density_offset = np.append(0, np.cumsum(n_density_per_angular_pair)).astype(np.int32)
self._bpcache = {}
- for n in range(_num_devices):
+ for n in range(num_devices):
with cp.cuda.Device(n), _streams[n]:
bpcache = ctypes.POINTER(BasisProdCache)()
scale_shellpair_diag = 1.0
diff --git a/gpu4pyscf/gto/int3c1e_ipip.py b/gpu4pyscf/gto/int3c1e_ipip.py
new file mode 100644
index 00000000..b86abf46
--- /dev/null
+++ b/gpu4pyscf/gto/int3c1e_ipip.py
@@ -0,0 +1,410 @@
+# Copyright 2021-2024 The PySCF Developers. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import ctypes
+import cupy as cp
+import numpy as np
+from pyscf import lib
+from pyscf.gto import ATOM_OF
+from pyscf.lib import c_null_ptr
+from gpu4pyscf.lib.cupy_helper import load_library, cart2sph, get_avail_mem
+from gpu4pyscf.gto.int3c1e import VHFOpt
+
+libgint = load_library('libgint')
+
+def get_int3c1e_ipip1_charge_contracted(mol, grids, charge_exponents, charges, intopt):
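+ '''Second derivative w.r.t. the bra AO center A (dAdA) of the point-charge
+ integrals <i|1/|r-C||j>, contracted with the grid charges.
+ Returns a (3, 3, nao, nao) cupy array; the sibling functions below follow
+ the same structure for the other derivative patterns.'''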
+ omega = mol.omega
+ assert omega >= 0.0, "Short-range one-electron integrals with GPU acceleration are not implemented."
+
+ grids = cp.asarray(grids, order='C')
+ if charge_exponents is not None:
+ charge_exponents = cp.asarray(charge_exponents, order='C')
+
+ assert charges.ndim == 1 and charges.shape[0] == grids.shape[0]
+ charges = cp.asarray(charges).astype(np.float64)
+
+ charges = charges.reshape([-1, 1], order='C')
+ grids = cp.concatenate([grids, charges], axis=1)
+
+ int1e_charge_contracted = cp.empty([3, 3, mol.nao, mol.nao], order='C')
+ for cp_ij_id, _ in enumerate(intopt.log_qs):
+ cpi = intopt.cp_idx[cp_ij_id]
+ cpj = intopt.cp_jdx[cp_ij_id]
+ li = intopt.angular[cpi]
+ lj = intopt.angular[cpj]
+
+ stream = cp.cuda.get_current_stream()
+
+ log_q_ij = intopt.log_qs[cp_ij_id]
+
+ nbins = 1
+ bins_locs_ij = np.array([0, len(log_q_ij)], dtype=np.int32)
+
+ i0, i1 = intopt.cart_ao_loc[cpi], intopt.cart_ao_loc[cpi+1]
+ j0, j1 = intopt.cart_ao_loc[cpj], intopt.cart_ao_loc[cpj+1]
+ ni = i1 - i0
+ nj = j1 - j0
+
+ ao_offsets = np.array([i0, j0], dtype=np.int32)
+ strides = np.array([ni, ni*nj], dtype=np.int32)
+
+ charge_exponents_pointer = c_null_ptr()
+ if charge_exponents is not None:
+ charge_exponents_pointer = charge_exponents.data.ptr
+
+ ngrids = grids.shape[0]
+ # n_charge_sum_per_thread = 1 # means every thread processes one pair and one grid point
+ # n_charge_sum_per_thread = ngrids # this (or any larger number) guarantees one thread processes one pair and all grid points
+ n_charge_sum_per_thread = 100 # This number roughly optimizes kernel performance on a large system
+
+ int1e_angular_slice = cp.zeros([3, 3, j1-j0, i1-i0], order='C')
+
+ err = libgint.GINTfill_int3c1e_ipip1_charge_contracted(
+ ctypes.cast(stream.ptr, ctypes.c_void_p),
+ intopt.bpcache,
+ ctypes.cast(grids.data.ptr, ctypes.c_void_p),
+ ctypes.cast(charge_exponents_pointer, ctypes.c_void_p),
+ ctypes.c_int(ngrids),
+ ctypes.cast(int1e_angular_slice.data.ptr, ctypes.c_void_p),
+ strides.ctypes.data_as(ctypes.c_void_p),
+ ao_offsets.ctypes.data_as(ctypes.c_void_p),
+ bins_locs_ij.ctypes.data_as(ctypes.c_void_p),
+ ctypes.c_int(nbins),
+ ctypes.c_int(cp_ij_id),
+ ctypes.c_double(omega),
+ ctypes.c_int(n_charge_sum_per_thread))
+
+ if err != 0:
+ raise RuntimeError('GINTfill_int3c1e_ipip1_charge_contracted failed')
+
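+ # Mixed second derivatives commute, so the kernel fills only the upper
+ # triangle of the 3x3 derivative components; mirror it to the lower triangle.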
+ int1e_angular_slice[1,0] = int1e_angular_slice[0,1]
+ int1e_angular_slice[2,0] = int1e_angular_slice[0,2]
+ int1e_angular_slice[2,1] = int1e_angular_slice[1,2]
+
+ i0, i1 = intopt.ao_loc[cpi], intopt.ao_loc[cpi+1]
+ j0, j1 = intopt.ao_loc[cpj], intopt.ao_loc[cpj+1]
+ if not mol.cart:
+ int1e_angular_slice = cart2sph(int1e_angular_slice, axis=2, ang=lj)
+ int1e_angular_slice = cart2sph(int1e_angular_slice, axis=3, ang=li)
+
+ int1e_charge_contracted[:, :, i0:i1, j0:j1] = int1e_angular_slice.transpose(0,1,3,2)
+
+ return intopt.unsort_orbitals(int1e_charge_contracted, axis=[2,3])
+
+def get_int3c1e_ipvip1_charge_contracted(mol, grids, charge_exponents, charges, intopt):
+ omega = mol.omega
+ assert omega >= 0.0, "Short-range one-electron integrals with GPU acceleration are not implemented."
+
+ grids = cp.asarray(grids, order='C')
+ if charge_exponents is not None:
+ charge_exponents = cp.asarray(charge_exponents, order='C')
+
+ assert charges.ndim == 1 and charges.shape[0] == grids.shape[0]
+ charges = cp.asarray(charges).astype(np.float64)
+
+ charges = charges.reshape([-1, 1], order='C')
+ grids = cp.concatenate([grids, charges], axis=1)
+
+ int1e_charge_contracted = cp.empty([3, 3, mol.nao, mol.nao], order='C')
+ for cp_ij_id, _ in enumerate(intopt.log_qs):
+ cpi = intopt.cp_idx[cp_ij_id]
+ cpj = intopt.cp_jdx[cp_ij_id]
+ li = intopt.angular[cpi]
+ lj = intopt.angular[cpj]
+
+ stream = cp.cuda.get_current_stream()
+
+ log_q_ij = intopt.log_qs[cp_ij_id]
+
+ nbins = 1
+ bins_locs_ij = np.array([0, len(log_q_ij)], dtype=np.int32)
+
+ i0, i1 = intopt.cart_ao_loc[cpi], intopt.cart_ao_loc[cpi+1]
+ j0, j1 = intopt.cart_ao_loc[cpj], intopt.cart_ao_loc[cpj+1]
+ ni = i1 - i0
+ nj = j1 - j0
+
+ ao_offsets = np.array([i0, j0], dtype=np.int32)
+ strides = np.array([ni, ni*nj], dtype=np.int32)
+
+ charge_exponents_pointer = c_null_ptr()
+ if charge_exponents is not None:
+ charge_exponents_pointer = charge_exponents.data.ptr
+
+ ngrids = grids.shape[0]
+ # n_charge_sum_per_thread = 1 # means every thread processes one pair and one grid point
+ # n_charge_sum_per_thread = ngrids # this (or any larger number) guarantees one thread processes one pair and all grid points
+ n_charge_sum_per_thread = 100 # This number roughly optimizes kernel performance on a large system
+
+ int1e_angular_slice = cp.zeros([3, 3, j1-j0, i1-i0], order='C')
+
+ err = libgint.GINTfill_int3c1e_ipvip1_charge_contracted(
+ ctypes.cast(stream.ptr, ctypes.c_void_p),
+ intopt.bpcache,
+ ctypes.cast(grids.data.ptr, ctypes.c_void_p),
+ ctypes.cast(charge_exponents_pointer, ctypes.c_void_p),
+ ctypes.c_int(ngrids),
+ ctypes.cast(int1e_angular_slice.data.ptr, ctypes.c_void_p),
+ strides.ctypes.data_as(ctypes.c_void_p),
+ ao_offsets.ctypes.data_as(ctypes.c_void_p),
+ bins_locs_ij.ctypes.data_as(ctypes.c_void_p),
+ ctypes.c_int(nbins),
+ ctypes.c_int(cp_ij_id),
+ ctypes.c_double(omega),
+ ctypes.c_int(n_charge_sum_per_thread))
+
+ if err != 0:
+ raise RuntimeError('GINTfill_int3c1e_ipvip1_charge_contracted failed')
+
+ i0, i1 = intopt.ao_loc[cpi], intopt.ao_loc[cpi+1]
+ j0, j1 = intopt.ao_loc[cpj], intopt.ao_loc[cpj+1]
+ if not mol.cart:
+ int1e_angular_slice = cart2sph(int1e_angular_slice, axis=2, ang=lj)
+ int1e_angular_slice = cart2sph(int1e_angular_slice, axis=3, ang=li)
+
+ int1e_charge_contracted[:, :, i0:i1, j0:j1] = int1e_angular_slice.transpose(0,1,3,2)
+
+ return intopt.unsort_orbitals(int1e_charge_contracted, axis=[2,3])
+
+def get_int3c1e_ip1ip2_charge_contracted(mol, grids, charge_exponents, charges, intopt):
+ omega = mol.omega
+ assert omega >= 0.0, "Short-range one-electron integrals with GPU acceleration are not implemented."
+
+ grids = cp.asarray(grids, order='C')
+ if charge_exponents is not None:
+ charge_exponents = cp.asarray(charge_exponents, order='C')
+
+ assert charges.ndim == 1 and charges.shape[0] == grids.shape[0]
+ charges = cp.asarray(charges).astype(np.float64)
+
+ charges = charges.reshape([-1, 1], order='C')
+ grids = cp.concatenate([grids, charges], axis=1)
+
+ int1e_charge_contracted = cp.empty([3, 3, mol.nao, mol.nao], order='C')
+ for cp_ij_id, _ in enumerate(intopt.log_qs):
+ cpi = intopt.cp_idx[cp_ij_id]
+ cpj = intopt.cp_jdx[cp_ij_id]
+ li = intopt.angular[cpi]
+ lj = intopt.angular[cpj]
+
+ stream = cp.cuda.get_current_stream()
+
+ log_q_ij = intopt.log_qs[cp_ij_id]
+
+ nbins = 1
+ bins_locs_ij = np.array([0, len(log_q_ij)], dtype=np.int32)
+
+ i0, i1 = intopt.cart_ao_loc[cpi], intopt.cart_ao_loc[cpi+1]
+ j0, j1 = intopt.cart_ao_loc[cpj], intopt.cart_ao_loc[cpj+1]
+ ni = i1 - i0
+ nj = j1 - j0
+
+ ao_offsets = np.array([i0, j0], dtype=np.int32)
+ strides = np.array([ni, ni*nj], dtype=np.int32)
+
+ charge_exponents_pointer = c_null_ptr()
+ if charge_exponents is not None:
+ charge_exponents_pointer = charge_exponents.data.ptr
+
+ ngrids = grids.shape[0]
+ # n_charge_sum_per_thread = 1 # means every thread processes one pair and one grid point
+ # n_charge_sum_per_thread = ngrids # this (or any larger number) guarantees one thread processes one pair and all grid points
+ n_charge_sum_per_thread = 100 # This number roughly optimizes kernel performance on a large system
+
+ int1e_angular_slice = cp.zeros([3, 3, j1-j0, i1-i0], order='C')
+
+ err = libgint.GINTfill_int3c1e_ip1ip2_charge_contracted(
+ ctypes.cast(stream.ptr, ctypes.c_void_p),
+ intopt.bpcache,
+ ctypes.cast(grids.data.ptr, ctypes.c_void_p),
+ ctypes.cast(charge_exponents_pointer, ctypes.c_void_p),
+ ctypes.c_int(ngrids),
+ ctypes.cast(int1e_angular_slice.data.ptr, ctypes.c_void_p),
+ strides.ctypes.data_as(ctypes.c_void_p),
+ ao_offsets.ctypes.data_as(ctypes.c_void_p),
+ bins_locs_ij.ctypes.data_as(ctypes.c_void_p),
+ ctypes.c_int(nbins),
+ ctypes.c_int(cp_ij_id),
+ ctypes.c_double(omega),
+ ctypes.c_int(n_charge_sum_per_thread))
+
+ if err != 0:
+ raise RuntimeError('GINTfill_int3c1e_ip1ip2_charge_contracted failed')
+
+ i0, i1 = intopt.ao_loc[cpi], intopt.ao_loc[cpi+1]
+ j0, j1 = intopt.ao_loc[cpj], intopt.ao_loc[cpj+1]
+ if not mol.cart:
+ int1e_angular_slice = cart2sph(int1e_angular_slice, axis=2, ang=lj)
+ int1e_angular_slice = cart2sph(int1e_angular_slice, axis=3, ang=li)
+
+ int1e_charge_contracted[:, :, i0:i1, j0:j1] = int1e_angular_slice.transpose(0,1,3,2)
+
+ return intopt.unsort_orbitals(int1e_charge_contracted, axis=[2,3])
+
+def get_int3c1e_ipip2_density_contracted(mol, grids, charge_exponents, dm, intopt):
+ omega = mol.omega
+ assert omega >= 0.0, "Short-range one-electron integrals with GPU acceleration are not implemented."
+
+ nao_cart = intopt._sorted_mol.nao
+ ngrids = grids.shape[0]
+
+ grids = cp.asarray(grids, order='C')
+ if charge_exponents is not None:
+ charge_exponents = cp.asarray(charge_exponents, order='C')
+
+ dm = cp.asarray(dm)
+ assert dm.ndim == 2
+ assert dm.shape[0] == dm.shape[1] and dm.shape[0] == mol.nao
+
+ dm = intopt.sort_orbitals(dm, [0,1])
+ if not mol.cart:
+ cart2sph_transformation_matrix = intopt.cart2sph
+ # TODO: This part is inefficient (O(N^3)); it should be replaced with an O(N^2) algorithm
+ dm = cart2sph_transformation_matrix @ dm @ cart2sph_transformation_matrix.T
+ dm = dm.flatten(order='F') # Column major order matches (i + j * n_ao) access pattern in the C function
+
+ dm = cp.asnumpy(dm)
+
+ ao_loc_sorted_order = intopt._sorted_mol.ao_loc_nr(cart = True)
+ l_ij = intopt.l_ij.T.flatten()
+ bas_coords = intopt._sorted_mol.atom_coords()[intopt._sorted_mol._bas[:, ATOM_OF]].flatten()
+
+ n_total_hermite_density = intopt.density_offset[-1]
+ dm_pair_ordered = np.empty(n_total_hermite_density)
+ libgint.GINTinit_J_density_rys_preprocess(dm.ctypes.data_as(ctypes.c_void_p),
+ dm_pair_ordered.ctypes.data_as(ctypes.c_void_p),
+ ctypes.c_int(1), ctypes.c_int(nao_cart),
+ ctypes.c_int(len(intopt.bas_pairs_locs) - 1),
+ intopt.bas_pair2shls.ctypes.data_as(ctypes.c_void_p),
+ intopt.bas_pairs_locs.ctypes.data_as(ctypes.c_void_p),
+ l_ij.ctypes.data_as(ctypes.c_void_p),
+ intopt.density_offset.ctypes.data_as(ctypes.c_void_p),
+ ao_loc_sorted_order.ctypes.data_as(ctypes.c_void_p),
+ bas_coords.ctypes.data_as(ctypes.c_void_p),
+ ctypes.c_bool(False))
+
+ dm_pair_ordered = cp.asarray(dm_pair_ordered)
+
+ n_threads_per_block_1d = 16
+ n_max_blocks_per_grid_1d = 65535
+ n_max_threads_1d = n_threads_per_block_1d * n_max_blocks_per_grid_1d
+ n_grid_split = int(np.ceil(ngrids / n_max_threads_1d))
+ if n_grid_split > 100:
+ print(f"Grid dimension = {ngrids} is too large; more than 100 kernel launches are required for the one-electron integrals.")
+ ngrids_per_split = (ngrids + n_grid_split - 1) // n_grid_split
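+ # The grid is processed in chunks so that each kernel launch stays within the
+ # n_max_blocks_per_grid_1d = 65535 block limit assumed above for a 1D launch.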
+
+ int3c_density_contracted = cp.zeros([3, 3, ngrids], order='C')
+
+ for p0, p1 in lib.prange(0, ngrids, ngrids_per_split):
+ for cp_ij_id, _ in enumerate(intopt.log_qs):
+ stream = cp.cuda.get_current_stream()
+
+ log_q_ij = intopt.log_qs[cp_ij_id]
+
+ nbins = 1
+ bins_locs_ij = np.array([0, len(log_q_ij)], dtype=np.int32)
+
+ charge_exponents_pointer = c_null_ptr()
+ if charge_exponents is not None:
+ exponents_slice = charge_exponents[p0:p1]
+ charge_exponents_pointer = exponents_slice.data.ptr
+ grids_slice = grids[p0:p1]
+
+ # n_pair_sum_per_thread = 1 # means every thread processes one pair and one grid point
+ # n_pair_sum_per_thread = nao_cart # this (or any larger number) guarantees one thread processes one grid point and all pairs of the same type
+ n_pair_sum_per_thread = nao_cart
+
+ err = libgint.GINTfill_int3c1e_ipip2_density_contracted(
+ ctypes.cast(stream.ptr, ctypes.c_void_p),
+ intopt.bpcache,
+ ctypes.cast(grids_slice.data.ptr, ctypes.c_void_p),
+ ctypes.cast(charge_exponents_pointer, ctypes.c_void_p),
+ ctypes.c_int(p1-p0),
+ ctypes.cast(dm_pair_ordered.data.ptr, ctypes.c_void_p),
+ intopt.density_offset.ctypes.data_as(ctypes.c_void_p),
+ ctypes.cast(int3c_density_contracted[:, p0:p1].data.ptr, ctypes.c_void_p),
+ bins_locs_ij.ctypes.data_as(ctypes.c_void_p),
+ ctypes.c_int(nbins),
+ ctypes.c_int(cp_ij_id),
+ ctypes.c_double(omega),
+ ctypes.c_int(n_pair_sum_per_thread))
+
+ if err != 0:
+ raise RuntimeError('GINTfill_int3c1e_ipip2_density_contracted failed')
+
+ int3c_density_contracted[1,0] = int3c_density_contracted[0,1]
+ int3c_density_contracted[2,0] = int3c_density_contracted[0,2]
+ int3c_density_contracted[2,1] = int3c_density_contracted[1,2]
+
+ return int3c_density_contracted
+
+def int1e_grids_ipip1(mol, grids, charge_exponents=None, charges=None, direct_scf_tol=1e-13, intopt=None):
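+ '''Second derivative w.r.t. the bra AO center A (dAdA) of the point-charge
+ integrals, contracted with charges. Returns a (3, 3, nao, nao) cupy array.'''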
+ assert grids is not None
+ assert charges is not None
+
+ if intopt is None:
+ intopt = VHFOpt(mol)
+ intopt.build(direct_scf_tol, aosym=False)
+ else:
+ assert isinstance(intopt, VHFOpt), \
+ f"Please make sure intopt is a {VHFOpt.__module__}.{VHFOpt.__name__} object."
+ assert hasattr(intopt, "density_offset"), "Please call the build() method of the VHFOpt object first."
+ assert not intopt.aosym
+
+ return get_int3c1e_ipip1_charge_contracted(mol, grids, charge_exponents, charges, intopt)
+
+def int1e_grids_ipvip1(mol, grids, charge_exponents=None, charges=None, direct_scf_tol=1e-13, intopt=None):
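+ '''Mixed derivative w.r.t. the two AO centers (dAdB), contracted with
+ charges. Returns a (3, 3, nao, nao) cupy array.'''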
+ assert grids is not None
+ assert charges is not None
+
+ if intopt is None:
+ intopt = VHFOpt(mol)
+ intopt.build(direct_scf_tol, aosym=False)
+ else:
+ assert isinstance(intopt, VHFOpt), \
+ f"Please make sure intopt is a {VHFOpt.__module__}.{VHFOpt.__name__} object."
+ assert hasattr(intopt, "density_offset"), "Please call the build() method of the VHFOpt object first."
+ assert not intopt.aosym
+
+ return get_int3c1e_ipvip1_charge_contracted(mol, grids, charge_exponents, charges, intopt)
+
+def int1e_grids_ip1ip2(mol, grids, charge_exponents=None, charges=None, direct_scf_tol=1e-13, intopt=None):
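+ '''Mixed derivative w.r.t. an AO center and the grid point (dAdC), contracted
+ with charges. Returns a (3, 3, nao, nao) cupy array.'''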
+ assert grids is not None
+ assert charges is not None
+
+ if intopt is None:
+ intopt = VHFOpt(mol)
+ intopt.build(direct_scf_tol, aosym=False)
+ else:
+ assert isinstance(intopt, VHFOpt), \
+ f"Please make sure intopt is a {VHFOpt.__module__}.{VHFOpt.__name__} object."
+ assert hasattr(intopt, "density_offset"), "Please call the build() method of the VHFOpt object first."
+ assert not intopt.aosym
+
+ return get_int3c1e_ip1ip2_charge_contracted(mol, grids, charge_exponents, charges, intopt)
+
+def int1e_grids_ipip2(mol, grids, charge_exponents=None, dm=None, direct_scf_tol=1e-13, intopt=None):
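+ '''Second derivative w.r.t. the grid point (dCdC), contracted with a density
+ matrix. Returns a (3, 3, ngrids) cupy array.'''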
+ assert grids is not None
+ assert dm is not None
+
+ if intopt is None:
+ intopt = VHFOpt(mol)
+ intopt.build(direct_scf_tol, aosym=False)
+ else:
+ assert isinstance(intopt, VHFOpt), \
+ f"Please make sure intopt is a {VHFOpt.__module__}.{VHFOpt.__name__} object."
+ assert hasattr(intopt, "density_offset"), "Please call the build() method of the VHFOpt object first."
+ assert not intopt.aosym
+
+ return get_int3c1e_ipip2_density_contracted(mol, grids, charge_exponents, dm, intopt)
diff --git a/gpu4pyscf/gto/tests/test_int1e_grids_ip.py b/gpu4pyscf/gto/tests/test_int1e_grids_ip.py
index 56f87e4b..de68266b 100644
--- a/gpu4pyscf/gto/tests/test_int1e_grids_ip.py
+++ b/gpu4pyscf/gto/tests/test_int1e_grids_ip.py
@@ -364,5 +364,5 @@ def test_int1e_grids_ip1_density_contracted(self):
cp.testing.assert_allclose(ref_int1e_dA, test_int1e_dA, atol = integral_threshold)
if __name__ == "__main__":
- print("Full Tests for One Electron Coulomb Integrals")
+ print("Full Tests for One Electron Coulomb Integrals 1st Derivative")
unittest.main()
diff --git a/gpu4pyscf/gto/tests/test_int1e_grids_ipip.py b/gpu4pyscf/gto/tests/test_int1e_grids_ipip.py
new file mode 100644
index 00000000..18f9fed9
--- /dev/null
+++ b/gpu4pyscf/gto/tests/test_int1e_grids_ipip.py
@@ -0,0 +1,480 @@
+# Copyright 2024 The GPU4PySCF Authors. All Rights Reserved.
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+import unittest
+import numpy as np
+import cupy as cp
+import pyscf
+from pyscf import lib, gto, df
+from gpu4pyscf.gto.int3c1e_ipip import int1e_grids_ipip1, int1e_grids_ipvip1, int1e_grids_ip1ip2, int1e_grids_ipip2
+
+def setUpModule():
+ global mol_sph, mol_cart, grid_points, integral_threshold, density_contraction_threshold, charge_contraction_threshold
+ atom = '''
+O 0.0000 0.7375 -0.0528
+O 0.0000 -0.7375 -0.1528
+H 0.8190 0.8170 0.4220
+H -0.8190 -0.8170 0.4220
+'''
+ bas='def2-qzvpp'
+
+ mol_sph = pyscf.M(atom=atom, basis=bas, max_memory=32000)
+ mol_sph.output = '/dev/null'
+ mol_sph.verbose = 0
+ mol_sph.build()
+
+ mol_cart = pyscf.M(atom=atom, basis=bas, max_memory=32000, cart=True)
+ mol_cart.output = '/dev/null'
+ mol_cart.verbose = 0
+ mol_cart.build()
+
+ xs = np.arange(-2.01, 2.0, 0.5)
+ ys = np.arange(-2.02, 2.0, 0.5)
+ zs = np.arange(-2.03, 2.0, 0.5)
+ grid_points = lib.cartesian_prod([xs, ys, zs])
+
+ # All of the following thresholds bound the max value of the corresponding matrix / tensor.
+ integral_threshold = 1e-12
+ density_contraction_threshold = 1e-10
+ charge_contraction_threshold = 1e-12
+
+def tearDownModule():
+ global mol_sph, mol_cart, grid_points
+ mol_sph.stdout.close()
+ mol_cart.stdout.close()
+ del mol_sph, mol_cart, grid_points
+
+class KnownValues(unittest.TestCase):
+ '''
+ Values are compared against the PySCF CPU intor() reference implementation
+ '''
+ def test_int1e_grids_ipip1_charge_contracted_cart(self):
+ np.random.seed(12345)
+ charges = np.random.uniform(-2.0, 2.0, grid_points.shape[0])
+
+ mol = mol_cart
+ fakemol = gto.fakemol_for_charges(grid_points)
+
+ int3c2e_ipip1 = mol._add_suffix('int3c2e_ipip1')
+ cintopt = gto.moleintor.make_cintopt(mol._atm, mol._bas, mol._env, int3c2e_ipip1)
+ v_nj = df.incore.aux_e2(mol, fakemol, intor=int3c2e_ipip1, aosym='s1', cintopt=cintopt)
+ ref_int1e_dAdA = np.einsum('dijq,q->dij', v_nj, charges)
+ ref_int1e_dAdA = ref_int1e_dAdA.reshape(3, 3, mol.nao, mol.nao)
+
+ test_int1e_dAdA = int1e_grids_ipip1(mol, grid_points, charges = charges)
+
+ assert isinstance(test_int1e_dAdA, cp.ndarray)
+ cp.testing.assert_allclose(ref_int1e_dAdA, test_int1e_dAdA, atol = integral_threshold)
+
+ def test_int1e_grids_ipip1_charge_contracted_sph(self):
+ np.random.seed(12345)
+ charges = np.random.uniform(-2.0, 2.0, grid_points.shape[0])
+
+ mol = mol_sph
+ fakemol = gto.fakemol_for_charges(grid_points)
+
+ int3c2e_ipip1 = mol._add_suffix('int3c2e_ipip1')
+ cintopt = gto.moleintor.make_cintopt(mol._atm, mol._bas, mol._env, int3c2e_ipip1)
+ v_nj = df.incore.aux_e2(mol, fakemol, intor=int3c2e_ipip1, aosym='s1', cintopt=cintopt)
+ ref_int1e_dAdA = np.einsum('dijq,q->dij', v_nj, charges)
+ ref_int1e_dAdA = ref_int1e_dAdA.reshape(3, 3, mol.nao, mol.nao)
+
+ test_int1e_dAdA = int1e_grids_ipip1(mol, grid_points, charges = charges)
+
+ assert isinstance(test_int1e_dAdA, cp.ndarray)
+ cp.testing.assert_allclose(ref_int1e_dAdA, test_int1e_dAdA, atol = integral_threshold)
+
+ def test_int1e_grids_ipip1_charge_contracted_gaussian_charge(self):
+ np.random.seed(12345)
+ charges = np.random.uniform(-2.0, 2.0, grid_points.shape[0])
+ charge_exponents = np.random.uniform(0.5, 1.0, grid_points.shape[0])
+
+ mol = mol_sph
+ fakemol = gto.fakemol_for_charges(grid_points, expnt=charge_exponents)
+
+ int3c2e_ipip1 = mol._add_suffix('int3c2e_ipip1')
+ cintopt = gto.moleintor.make_cintopt(mol._atm, mol._bas, mol._env, int3c2e_ipip1)
+ v_nj = df.incore.aux_e2(mol, fakemol, intor=int3c2e_ipip1, aosym='s1', cintopt=cintopt)
+ ref_int1e_dAdA = np.einsum('dijq,q->dij', v_nj, charges)
+ ref_int1e_dAdA = ref_int1e_dAdA.reshape(3, 3, mol.nao, mol.nao)
+
+ test_int1e_dAdA = int1e_grids_ipip1(mol, grid_points, charges = charges, charge_exponents = charge_exponents)
+
+ assert isinstance(test_int1e_dAdA, cp.ndarray)
+ cp.testing.assert_allclose(ref_int1e_dAdA, test_int1e_dAdA, atol = integral_threshold)
+
+ def test_int1e_grids_ipip1_charge_contracted_omega(self):
+ np.random.seed(12345)
+ charges = np.random.uniform(-2.0, 2.0, grid_points.shape[0])
+
+ omega = 0.8
+ mol_sph_omega = mol_sph.copy()
+ mol_sph_omega.set_range_coulomb(omega)
+
+ mol = mol_sph_omega
+ fakemol = gto.fakemol_for_charges(grid_points)
+
+ int3c2e_ipip1 = mol._add_suffix('int3c2e_ipip1')
+ cintopt = gto.moleintor.make_cintopt(mol._atm, mol._bas, mol._env, int3c2e_ipip1)
+ v_nj = df.incore.aux_e2(mol, fakemol, intor=int3c2e_ipip1, aosym='s1', cintopt=cintopt)
+ ref_int1e_dAdA = np.einsum('dijq,q->dij', v_nj, charges)
+ ref_int1e_dAdA = ref_int1e_dAdA.reshape(3, 3, mol.nao, mol.nao)
+
+ test_int1e_dAdA = int1e_grids_ipip1(mol, grid_points, charges = charges)
+
+ assert isinstance(test_int1e_dAdA, cp.ndarray)
+ cp.testing.assert_allclose(ref_int1e_dAdA, test_int1e_dAdA, atol = integral_threshold)
+
+ def test_int1e_grids_ipip1_charge_contracted_gaussian_charge_omega(self):
+ np.random.seed(12345)
+ charges = np.random.uniform(-2.0, 2.0, grid_points.shape[0])
+ charge_exponents = np.random.uniform(0.5, 1.0, grid_points.shape[0])
+
+ omega = 0.8
+ mol_sph_omega = mol_sph.copy()
+ mol_sph_omega.set_range_coulomb(omega)
+
+ mol = mol_sph_omega
+ fakemol = gto.fakemol_for_charges(grid_points, expnt=charge_exponents)
+
+ int3c2e_ipip1 = mol._add_suffix('int3c2e_ipip1')
+ cintopt = gto.moleintor.make_cintopt(mol._atm, mol._bas, mol._env, int3c2e_ipip1)
+ v_nj = df.incore.aux_e2(mol, fakemol, intor=int3c2e_ipip1, aosym='s1', cintopt=cintopt)
+ ref_int1e_dAdA = np.einsum('dijq,q->dij', v_nj, charges)
+ ref_int1e_dAdA = ref_int1e_dAdA.reshape(3, 3, mol.nao, mol.nao)
+
+ test_int1e_dAdA = int1e_grids_ipip1(mol, grid_points, charges = charges, charge_exponents = charge_exponents)
+
+ assert isinstance(test_int1e_dAdA, cp.ndarray)
+ cp.testing.assert_allclose(ref_int1e_dAdA, test_int1e_dAdA, atol = integral_threshold)
+
+ # ^ ipip1 v ipvip1
+
+ def test_int1e_grids_ipvip1_charge_contracted_cart(self):
+ np.random.seed(12345)
+ charges = np.random.uniform(-2.0, 2.0, grid_points.shape[0])
+
+ mol = mol_cart
+ fakemol = gto.fakemol_for_charges(grid_points)
+
+ int3c2e_ipvip1 = mol._add_suffix('int3c2e_ipvip1')
+ cintopt = gto.moleintor.make_cintopt(mol._atm, mol._bas, mol._env, int3c2e_ipvip1)
+ v_nj = df.incore.aux_e2(mol, fakemol, intor=int3c2e_ipvip1, aosym='s1', cintopt=cintopt)
+ ref_int1e_dAdB = np.einsum('dijq,q->dij', v_nj, charges)
+ ref_int1e_dAdB = ref_int1e_dAdB.reshape(3, 3, mol.nao, mol.nao)
+
+ test_int1e_dAdB = int1e_grids_ipvip1(mol, grid_points, charges = charges)
+
+ assert isinstance(test_int1e_dAdB, cp.ndarray)
+ cp.testing.assert_allclose(ref_int1e_dAdB, test_int1e_dAdB, atol = integral_threshold)
+
+ def test_int1e_grids_ipvip1_charge_contracted_sph(self):
+ np.random.seed(12345)
+ charges = np.random.uniform(-2.0, 2.0, grid_points.shape[0])
+
+ mol = mol_sph
+ fakemol = gto.fakemol_for_charges(grid_points)
+
+ int3c2e_ipvip1 = mol._add_suffix('int3c2e_ipvip1')
+ cintopt = gto.moleintor.make_cintopt(mol._atm, mol._bas, mol._env, int3c2e_ipvip1)
+ v_nj = df.incore.aux_e2(mol, fakemol, intor=int3c2e_ipvip1, aosym='s1', cintopt=cintopt)
+ ref_int1e_dAdB = np.einsum('dijq,q->dij', v_nj, charges)
+ ref_int1e_dAdB = ref_int1e_dAdB.reshape(3, 3, mol.nao, mol.nao)
+
+ test_int1e_dAdB = int1e_grids_ipvip1(mol, grid_points, charges = charges)
+
+ assert isinstance(test_int1e_dAdB, cp.ndarray)
+ cp.testing.assert_allclose(ref_int1e_dAdB, test_int1e_dAdB, atol = integral_threshold)
+
+ def test_int1e_grids_ipvip1_charge_contracted_gaussian_charge(self):
+ np.random.seed(12345)
+ charges = np.random.uniform(-2.0, 2.0, grid_points.shape[0])
+ charge_exponents = np.random.uniform(0.5, 1.0, grid_points.shape[0])
+
+ mol = mol_sph
+ fakemol = gto.fakemol_for_charges(grid_points, expnt=charge_exponents)
+
+ int3c2e_ipvip1 = mol._add_suffix('int3c2e_ipvip1')
+ cintopt = gto.moleintor.make_cintopt(mol._atm, mol._bas, mol._env, int3c2e_ipvip1)
+ v_nj = df.incore.aux_e2(mol, fakemol, intor=int3c2e_ipvip1, aosym='s1', cintopt=cintopt)
+ ref_int1e_dAdB = np.einsum('dijq,q->dij', v_nj, charges)
+ ref_int1e_dAdB = ref_int1e_dAdB.reshape(3, 3, mol.nao, mol.nao)
+
+ test_int1e_dAdB = int1e_grids_ipvip1(mol, grid_points, charges = charges, charge_exponents = charge_exponents)
+
+ assert isinstance(test_int1e_dAdB, cp.ndarray)
+ cp.testing.assert_allclose(ref_int1e_dAdB, test_int1e_dAdB, atol = integral_threshold)
+
+ def test_int1e_grids_ipvip1_charge_contracted_omega(self):
+ np.random.seed(12345)
+ charges = np.random.uniform(-2.0, 2.0, grid_points.shape[0])
+
+ omega = 0.8
+ mol_sph_omega = mol_sph.copy()
+ mol_sph_omega.set_range_coulomb(omega)
+
+ mol = mol_sph_omega
+ fakemol = gto.fakemol_for_charges(grid_points)
+
+ int3c2e_ipvip1 = mol._add_suffix('int3c2e_ipvip1')
+ cintopt = gto.moleintor.make_cintopt(mol._atm, mol._bas, mol._env, int3c2e_ipvip1)
+ v_nj = df.incore.aux_e2(mol, fakemol, intor=int3c2e_ipvip1, aosym='s1', cintopt=cintopt)
+ ref_int1e_dAdB = np.einsum('dijq,q->dij', v_nj, charges)
+ ref_int1e_dAdB = ref_int1e_dAdB.reshape(3, 3, mol.nao, mol.nao)
+
+ test_int1e_dAdB = int1e_grids_ipvip1(mol, grid_points, charges = charges)
+
+ assert isinstance(test_int1e_dAdB, cp.ndarray)
+ cp.testing.assert_allclose(ref_int1e_dAdB, test_int1e_dAdB, atol = integral_threshold)
+
+ def test_int1e_grids_ipvip1_charge_contracted_gaussian_charge_omega(self):
+ np.random.seed(12345)
+ charges = np.random.uniform(-2.0, 2.0, grid_points.shape[0])
+ charge_exponents = np.random.uniform(0.5, 1.0, grid_points.shape[0])
+
+ omega = 0.8
+ mol_sph_omega = mol_sph.copy()
+ mol_sph_omega.set_range_coulomb(omega)
+
+ mol = mol_sph_omega
+ fakemol = gto.fakemol_for_charges(grid_points, expnt=charge_exponents)
+
+ int3c2e_ipvip1 = mol._add_suffix('int3c2e_ipvip1')
+ cintopt = gto.moleintor.make_cintopt(mol._atm, mol._bas, mol._env, int3c2e_ipvip1)
+ v_nj = df.incore.aux_e2(mol, fakemol, intor=int3c2e_ipvip1, aosym='s1', cintopt=cintopt)
+ ref_int1e_dAdB = np.einsum('dijq,q->dij', v_nj, charges)
+ ref_int1e_dAdB = ref_int1e_dAdB.reshape(3, 3, mol.nao, mol.nao)
+
+ test_int1e_dAdB = int1e_grids_ipvip1(mol, grid_points, charges = charges, charge_exponents = charge_exponents)
+
+ assert isinstance(test_int1e_dAdB, cp.ndarray)
+ cp.testing.assert_allclose(ref_int1e_dAdB, test_int1e_dAdB, atol = integral_threshold)
+
+ # ^ ipvip1 v ip1ip2
+
+ def test_int1e_grids_ip1ip2_charge_contracted_cart(self):
+ np.random.seed(12345)
+ charges = np.random.uniform(-2.0, 2.0, grid_points.shape[0])
+
+ mol = mol_cart
+ fakemol = gto.fakemol_for_charges(grid_points)
+
+ int3c2e_ip1ip2 = mol._add_suffix('int3c2e_ip1ip2')
+ cintopt = gto.moleintor.make_cintopt(mol._atm, mol._bas, mol._env, int3c2e_ip1ip2)
+ v_nj = df.incore.aux_e2(mol, fakemol, intor=int3c2e_ip1ip2, aosym='s1', cintopt=cintopt)
+ ref_int1e_dAdC = np.einsum('dijq,q->dij', v_nj, charges)
+ ref_int1e_dAdC = ref_int1e_dAdC.reshape(3, 3, mol.nao, mol.nao)
+
+ test_int1e_dAdC = int1e_grids_ip1ip2(mol, grid_points, charges = charges)
+
+ assert isinstance(test_int1e_dAdC, cp.ndarray)
+ cp.testing.assert_allclose(ref_int1e_dAdC, test_int1e_dAdC, atol = integral_threshold)
+
+ def test_int1e_grids_ip1ip2_charge_contracted_sph(self):
+ np.random.seed(12345)
+ charges = np.random.uniform(-2.0, 2.0, grid_points.shape[0])
+
+ mol = mol_sph
+ fakemol = gto.fakemol_for_charges(grid_points)
+
+ int3c2e_ip1ip2 = mol._add_suffix('int3c2e_ip1ip2')
+ cintopt = gto.moleintor.make_cintopt(mol._atm, mol._bas, mol._env, int3c2e_ip1ip2)
+ v_nj = df.incore.aux_e2(mol, fakemol, intor=int3c2e_ip1ip2, aosym='s1', cintopt=cintopt)
+ ref_int1e_dAdC = np.einsum('dijq,q->dij', v_nj, charges)
+ ref_int1e_dAdC = ref_int1e_dAdC.reshape(3, 3, mol.nao, mol.nao)
+
+ test_int1e_dAdC = int1e_grids_ip1ip2(mol, grid_points, charges = charges)
+
+ assert isinstance(test_int1e_dAdC, cp.ndarray)
+ cp.testing.assert_allclose(ref_int1e_dAdC, test_int1e_dAdC, atol = integral_threshold)
+
+ def test_int1e_grids_ip1ip2_charge_contracted_gaussian_charge(self):
+ np.random.seed(12345)
+ charges = np.random.uniform(-2.0, 2.0, grid_points.shape[0])
+ charge_exponents = np.random.uniform(0.5, 1.0, grid_points.shape[0])
+
+ mol = mol_sph
+ fakemol = gto.fakemol_for_charges(grid_points, expnt=charge_exponents)
+
+ int3c2e_ip1ip2 = mol._add_suffix('int3c2e_ip1ip2')
+ cintopt = gto.moleintor.make_cintopt(mol._atm, mol._bas, mol._env, int3c2e_ip1ip2)
+ v_nj = df.incore.aux_e2(mol, fakemol, intor=int3c2e_ip1ip2, aosym='s1', cintopt=cintopt)
+ ref_int1e_dAdC = np.einsum('dijq,q->dij', v_nj, charges)
+ ref_int1e_dAdC = ref_int1e_dAdC.reshape(3, 3, mol.nao, mol.nao)
+
+ test_int1e_dAdC = int1e_grids_ip1ip2(mol, grid_points, charges = charges, charge_exponents = charge_exponents)
+
+ assert isinstance(test_int1e_dAdC, cp.ndarray)
+ cp.testing.assert_allclose(ref_int1e_dAdC, test_int1e_dAdC, atol = integral_threshold)
+
+ def test_int1e_grids_ip1ip2_charge_contracted_omega(self):
+ np.random.seed(12345)
+ charges = np.random.uniform(-2.0, 2.0, grid_points.shape[0])
+
+ omega = 0.8
+ mol_sph_omega = mol_sph.copy()
+ mol_sph_omega.set_range_coulomb(omega)
+
+ mol = mol_sph_omega
+ fakemol = gto.fakemol_for_charges(grid_points)
+
+ int3c2e_ip1ip2 = mol._add_suffix('int3c2e_ip1ip2')
+ cintopt = gto.moleintor.make_cintopt(mol._atm, mol._bas, mol._env, int3c2e_ip1ip2)
+ v_nj = df.incore.aux_e2(mol, fakemol, intor=int3c2e_ip1ip2, aosym='s1', cintopt=cintopt)
+ ref_int1e_dAdC = np.einsum('dijq,q->dij', v_nj, charges)
+ ref_int1e_dAdC = ref_int1e_dAdC.reshape(3, 3, mol.nao, mol.nao)
+
+ test_int1e_dAdC = int1e_grids_ip1ip2(mol, grid_points, charges = charges)
+
+ assert isinstance(test_int1e_dAdC, cp.ndarray)
+ cp.testing.assert_allclose(ref_int1e_dAdC, test_int1e_dAdC, atol = integral_threshold)
+
+ def test_int1e_grids_ip1ip2_charge_contracted_gaussian_charge_omega(self):
+ np.random.seed(12345)
+ charges = np.random.uniform(-2.0, 2.0, grid_points.shape[0])
+ charge_exponents = np.random.uniform(0.5, 1.0, grid_points.shape[0])
+
+ omega = 0.8
+ mol_sph_omega = mol_sph.copy()
+ mol_sph_omega.set_range_coulomb(omega)
+
+ mol = mol_sph_omega
+ fakemol = gto.fakemol_for_charges(grid_points, expnt=charge_exponents)
+
+ int3c2e_ip1ip2 = mol._add_suffix('int3c2e_ip1ip2')
+ cintopt = gto.moleintor.make_cintopt(mol._atm, mol._bas, mol._env, int3c2e_ip1ip2)
+ v_nj = df.incore.aux_e2(mol, fakemol, intor=int3c2e_ip1ip2, aosym='s1', cintopt=cintopt)
+ ref_int1e_dAdC = np.einsum('dijq,q->dij', v_nj, charges)
+ ref_int1e_dAdC = ref_int1e_dAdC.reshape(3, 3, mol.nao, mol.nao)
+
+ test_int1e_dAdC = int1e_grids_ip1ip2(mol, grid_points, charges = charges, charge_exponents = charge_exponents)
+
+ assert isinstance(test_int1e_dAdC, cp.ndarray)
+ cp.testing.assert_allclose(ref_int1e_dAdC, test_int1e_dAdC, atol = integral_threshold)
+
+ # ^ ip1ip2 v ipip2
+
+ def test_int1e_grids_ipip2_charge_contracted_cart(self):
+ np.random.seed(12345)
+ dm = np.random.uniform(-2.0, 2.0, (mol_cart.nao, mol_cart.nao))
+
+ mol = mol_cart
+ fakemol = gto.fakemol_for_charges(grid_points)
+
+ # Note: the ipip2 (dCdC) reference cannot be computed directly due to numerical problems:
+ # PySCF treats a point charge as a very sharp Gaussian, whose second derivative cannot be evaluated reliably.
+ int3c2e_ip1ip2 = mol._add_suffix('int3c2e_ip1ip2')
+ cintopt = gto.moleintor.make_cintopt(mol._atm, mol._bas, mol._env, int3c2e_ip1ip2)
+ v_nj = df.incore.aux_e2(mol, fakemol, intor=int3c2e_ip1ip2, aosym='s1', cintopt=cintopt)
+ v_nj = -v_nj - v_nj.transpose(0, 2, 1, 3) # dCdC = -dAdC - dBdC
+ ref_int1e_dCdC = np.einsum('dijq,ij->dq', v_nj, dm)
+ ref_int1e_dCdC = ref_int1e_dCdC.reshape(3, 3, grid_points.shape[0])
+
+ test_int1e_dCdC = int1e_grids_ipip2(mol, grid_points, dm = dm)
+
+ assert isinstance(test_int1e_dCdC, cp.ndarray)
+ cp.testing.assert_allclose(ref_int1e_dCdC, test_int1e_dCdC, atol = integral_threshold)
+
+ def test_int1e_grids_ipip2_charge_contracted_sph(self):
+ np.random.seed(12345)
+ dm = np.random.uniform(-2.0, 2.0, (mol_sph.nao, mol_sph.nao))
+
+ mol = mol_sph
+ fakemol = gto.fakemol_for_charges(grid_points)
+
+ # Note: the ipip2 (dCdC) reference cannot be computed directly due to numerical problems:
+ # PySCF treats a point charge as a very sharp Gaussian, whose second derivative cannot be evaluated reliably.
+ int3c2e_ip1ip2 = mol._add_suffix('int3c2e_ip1ip2')
+ cintopt = gto.moleintor.make_cintopt(mol._atm, mol._bas, mol._env, int3c2e_ip1ip2)
+ v_nj = df.incore.aux_e2(mol, fakemol, intor=int3c2e_ip1ip2, aosym='s1', cintopt=cintopt)
+ v_nj = -v_nj - v_nj.transpose(0, 2, 1, 3) # dCdC = -dAdC - dBdC
+ ref_int1e_dCdC = np.einsum('dijq,ij->dq', v_nj, dm)
+ ref_int1e_dCdC = ref_int1e_dCdC.reshape(3, 3, grid_points.shape[0])
+
+ test_int1e_dCdC = int1e_grids_ipip2(mol, grid_points, dm = dm)
+
+ assert isinstance(test_int1e_dCdC, cp.ndarray)
+ cp.testing.assert_allclose(ref_int1e_dCdC, test_int1e_dCdC, atol = integral_threshold)
+
+ def test_int1e_grids_ipip2_charge_contracted_gaussian_charge(self):
+ np.random.seed(12345)
+ dm = np.random.uniform(-2.0, 2.0, (mol_sph.nao, mol_sph.nao))
+ charge_exponents = np.random.uniform(0.5, 1.0, grid_points.shape[0])
+
+ mol = mol_sph
+ fakemol = gto.fakemol_for_charges(grid_points, expnt=charge_exponents)
+
+ int3c2e_ipip2 = mol._add_suffix('int3c2e_ipip2')
+ cintopt = gto.moleintor.make_cintopt(mol._atm, mol._bas, mol._env, int3c2e_ipip2)
+ v_nj = df.incore.aux_e2(mol, fakemol, intor=int3c2e_ipip2, aosym='s1', cintopt=cintopt)
+ ref_int1e_dCdC = np.einsum('dijq,ij->dq', v_nj, dm)
+ ref_int1e_dCdC = ref_int1e_dCdC.reshape(3, 3, grid_points.shape[0])
+
+ test_int1e_dCdC = int1e_grids_ipip2(mol, grid_points, dm = dm, charge_exponents = charge_exponents)
+
+ assert isinstance(test_int1e_dCdC, cp.ndarray)
+ cp.testing.assert_allclose(ref_int1e_dCdC, test_int1e_dCdC, atol = integral_threshold)
+
+ def test_int1e_grids_ipip2_charge_contracted_omega(self):
+ np.random.seed(12345)
+ dm = np.random.uniform(-2.0, 2.0, (mol_sph.nao, mol_sph.nao))
+
+ omega = 0.8
+ mol_sph_omega = mol_sph.copy()
+ mol_sph_omega.set_range_coulomb(omega)
+
+ mol = mol_sph_omega
+ fakemol = gto.fakemol_for_charges(grid_points)
+
+ # Note: the ipip2 (dCdC) reference cannot be computed directly due to numerical problems:
+ # PySCF treats a point charge as a very sharp Gaussian, whose second derivative cannot be evaluated reliably.
+ int3c2e_ip1ip2 = mol._add_suffix('int3c2e_ip1ip2')
+ cintopt = gto.moleintor.make_cintopt(mol._atm, mol._bas, mol._env, int3c2e_ip1ip2)
+ v_nj = df.incore.aux_e2(mol, fakemol, intor=int3c2e_ip1ip2, aosym='s1', cintopt=cintopt)
+ v_nj = -v_nj - v_nj.transpose(0, 2, 1, 3) # dCdC = -dAdC - dBdC
+ ref_int1e_dCdC = np.einsum('dijq,ij->dq', v_nj, dm)
+ ref_int1e_dCdC = ref_int1e_dCdC.reshape(3, 3, grid_points.shape[0])
+
+ test_int1e_dCdC = int1e_grids_ipip2(mol, grid_points, dm = dm)
+
+ assert isinstance(test_int1e_dCdC, cp.ndarray)
+ cp.testing.assert_allclose(ref_int1e_dCdC, test_int1e_dCdC, atol = integral_threshold)
+
+ def test_int1e_grids_ipip2_charge_contracted_gaussian_charge_omega(self):
+ np.random.seed(12345)
+ dm = np.random.uniform(-2.0, 2.0, (mol_sph.nao, mol_sph.nao))
+ charge_exponents = np.random.uniform(0.5, 1.0, grid_points.shape[0])
+
+ omega = 0.8
+ mol_sph_omega = mol_sph.copy()
+ mol_sph_omega.set_range_coulomb(omega)
+
+ mol = mol_sph_omega
+ fakemol = gto.fakemol_for_charges(grid_points, expnt=charge_exponents)
+
+ int3c2e_ipip2 = mol._add_suffix('int3c2e_ipip2')
+ cintopt = gto.moleintor.make_cintopt(mol._atm, mol._bas, mol._env, int3c2e_ipip2)
+ v_nj = df.incore.aux_e2(mol, fakemol, intor=int3c2e_ipip2, aosym='s1', cintopt=cintopt)
+ ref_int1e_dCdC = np.einsum('dijq,ij->dq', v_nj, dm)
+ ref_int1e_dCdC = ref_int1e_dCdC.reshape(3, 3, grid_points.shape[0])
+
+ test_int1e_dCdC = int1e_grids_ipip2(mol, grid_points, dm = dm, charge_exponents = charge_exponents)
+
+ assert isinstance(test_int1e_dCdC, cp.ndarray)
+ cp.testing.assert_allclose(ref_int1e_dCdC, test_int1e_dCdC, atol = integral_threshold)
+
+if __name__ == "__main__":
+ print("Full Tests for One Electron Coulomb Integrals 2nd Derivative")
+ unittest.main()
diff --git a/gpu4pyscf/hessian/jk.py b/gpu4pyscf/hessian/jk.py
index 65edff6b..5ed768c2 100644
--- a/gpu4pyscf/hessian/jk.py
+++ b/gpu4pyscf/hessian/jk.py
@@ -33,7 +33,7 @@
reduce_to_device, contract)
from gpu4pyscf.__config__ import props as gpu_specs
-from gpu4pyscf.__config__ import _streams, _num_devices
+from gpu4pyscf.__config__ import _streams, num_devices
from gpu4pyscf.lib import logger
@@ -174,7 +174,7 @@ def get_jk(mol, dm, mo_coeff, mo_occ, hermi=0, vhfopt=None,
if vhfopt is None:
# Small group size for load balance
group_size = None
- if _num_devices > 1:
+ if num_devices > 1:
group_size = jk.GROUP_SIZE
vhfopt = _VHFOpt(mol).build(group_size=group_size)
@@ -202,13 +202,13 @@ def get_jk(mol, dm, mo_coeff, mo_occ, hermi=0, vhfopt=None,
tasks.append((i,j,k,l))
tasks = np.array(tasks)
task_list = []
- for device_id in range(_num_devices):
- task_list.append(tasks[device_id::_num_devices])
+ for device_id in range(num_devices):
+ task_list.append(tasks[device_id::num_devices])
cp.cuda.get_current_stream().synchronize()
futures = []
- with ThreadPoolExecutor(max_workers=_num_devices) as executor:
- for device_id in range(_num_devices):
+ with ThreadPoolExecutor(max_workers=num_devices) as executor:
+ for device_id in range(num_devices):
future = executor.submit(
_jk_task,
mol, dms, mo_coeff, mo_occ, vhfopt, task_list[device_id], hermi=hermi,
diff --git a/gpu4pyscf/hessian/rhf.py b/gpu4pyscf/hessian/rhf.py
index 775a6e98..b39aab8e 100644
--- a/gpu4pyscf/hessian/rhf.py
+++ b/gpu4pyscf/hessian/rhf.py
@@ -32,7 +32,7 @@
contract, tag_array, sandwich_dot, transpose_sum, get_avail_mem, condense,
krylov)
from gpu4pyscf.__config__ import props as gpu_specs
-from gpu4pyscf.__config__ import _streams, _num_devices
+from gpu4pyscf.__config__ import _streams, num_devices
from gpu4pyscf.lib import logger
from gpu4pyscf.scf.jk import (
LMAX, QUEUE_DEPTH, SHM_SIZE, THREADS, GROUP_SIZE, libvhf_rys, _VHFOpt,
@@ -271,7 +271,7 @@ def _partial_ejk_ip2(mol, dm, vhfopt=None, j_factor=1., k_factor=1., verbose=Non
if vhfopt is None:
# Small group size for load balance
group_size = None
- if _num_devices > 1:
+ if num_devices > 1:
group_size = GROUP_SIZE
vhfopt = _VHFOpt(mol).build(group_size=group_size)
@@ -296,13 +296,13 @@ def _partial_ejk_ip2(mol, dm, vhfopt=None, j_factor=1., k_factor=1., verbose=Non
tasks.append((i,j,k,l))
tasks = np.array(tasks)
task_list = []
- for device_id in range(_num_devices):
- task_list.append(tasks[device_id::_num_devices])
+ for device_id in range(num_devices):
+ task_list.append(tasks[device_id::num_devices])
cp.cuda.get_current_stream().synchronize()
futures = []
- with ThreadPoolExecutor(max_workers=_num_devices) as executor:
- for device_id in range(_num_devices):
+ with ThreadPoolExecutor(max_workers=num_devices) as executor:
+ for device_id in range(num_devices):
future = executor.submit(
_ejk_ip2_task,
mol, dms, vhfopt, task_list[device_id],
@@ -494,7 +494,7 @@ def _get_jk_ip1(mol, dm, with_j=True, with_k=True, atoms_slice=None, verbose=Non
vhfopt.tile = 1
# Small group size for load balance
group_size = None
- if _num_devices > 1:
+ if num_devices > 1:
group_size = GROUP_SIZE
vhfopt.build(group_size=group_size)
@@ -532,13 +532,13 @@ def _get_jk_ip1(mol, dm, with_j=True, with_k=True, atoms_slice=None, verbose=Non
tasks.append((i,j,k,l))
tasks = np.array(tasks)
task_list = []
- for device_id in range(_num_devices):
- task_list.append(tasks[device_id::_num_devices])
+ for device_id in range(num_devices):
+ task_list.append(tasks[device_id::num_devices])
cp.cuda.get_current_stream().synchronize()
futures = []
- with ThreadPoolExecutor(max_workers=_num_devices) as executor:
- for device_id in range(_num_devices):
+ with ThreadPoolExecutor(max_workers=num_devices) as executor:
+ for device_id in range(num_devices):
future = executor.submit(
_build_jk_ip1_task,
mol, dms, vhfopt, task_list[device_id], atoms_slice,
@@ -908,7 +908,7 @@ def _get_jk_mo(hessobj, mol, dms, mo_coeff, mo_occ,
with mol.with_range_coulomb(omega):
# Small group size for load balance
group_size = None
- if _num_devices > 1:
+ if num_devices > 1:
group_size = GROUP_SIZE
vhfopt = _VHFOpt(mol, mf.direct_scf_tol).build(group_size=group_size)
mf._opt_gpu[omega] = vhfopt
diff --git a/gpu4pyscf/hessian/rks.py b/gpu4pyscf/hessian/rks.py
index a1c01079..c12ef0e2 100644
--- a/gpu4pyscf/hessian/rks.py
+++ b/gpu4pyscf/hessian/rks.py
@@ -30,7 +30,7 @@
from gpu4pyscf.lib.cupy_helper import (contract, add_sparse, get_avail_mem,
reduce_to_device, transpose_sum)
from gpu4pyscf.lib import logger
-from gpu4pyscf.__config__ import _streams, _num_devices
+from gpu4pyscf.__config__ import _streams, num_devices
from gpu4pyscf.hessian import jk
def partial_hess_elec(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None,
@@ -49,7 +49,7 @@ def partial_hess_elec(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None,
dm0 = cupy.dot(mocc, mocc.T) * 2
if mf.do_nlc():
- raise NotImplementedError
+ raise NotImplementedError("2nd derivative of NLC is not implemented.")
omega, alpha, hyb = ni.rsh_and_hybrid_coeff(mf.xc, spin=mol.spin)
with_k = ni.libxc.is_hybrid_xc(mf.xc)
@@ -524,8 +524,8 @@ def _get_vxc_deriv2(hessobj, mo_coeff, mo_occ, max_memory):
futures = []
cupy.cuda.get_current_stream().synchronize()
- with ThreadPoolExecutor(max_workers=_num_devices) as executor:
- for device_id in range(_num_devices):
+ with ThreadPoolExecutor(max_workers=num_devices) as executor:
+ for device_id in range(num_devices):
future = executor.submit(
_get_vxc_deriv2_task,
hessobj, grids, mo_coeff, mo_occ, max_memory,
@@ -550,7 +550,6 @@ def _get_vxc_deriv1_task(hessobj, grids, mo_coeff, mo_occ, max_memory, device_id
ngrids_glob = grids.coords.shape[0]
grid_start, grid_end = numint.gen_grid_range(ngrids_glob, device_id)
-
with cupy.cuda.Device(device_id), _streams[device_id]:
mo_occ = cupy.asarray(mo_occ)
mo_coeff = cupy.asarray(mo_coeff)
@@ -688,8 +687,8 @@ def _get_vxc_deriv1(hessobj, mo_coeff, mo_occ, max_memory):
futures = []
cupy.cuda.get_current_stream().synchronize()
- with ThreadPoolExecutor(max_workers=_num_devices) as executor:
- for device_id in range(_num_devices):
+ with ThreadPoolExecutor(max_workers=num_devices) as executor:
+ for device_id in range(num_devices):
future = executor.submit(
_get_vxc_deriv1_task,
hessobj, grids, mo_coeff, mo_occ, max_memory,
@@ -796,8 +795,8 @@ def nr_rks_fxc_mo(ni, mol, grids, xc_code, dm0=None, dms=None, mo_coeff=None, re
futures = []
cupy.cuda.get_current_stream().synchronize()
- with ThreadPoolExecutor(max_workers=_num_devices) as executor:
- for device_id in range(_num_devices):
+ with ThreadPoolExecutor(max_workers=num_devices) as executor:
+ for device_id in range(num_devices):
future = executor.submit(
_nr_rks_fxc_mo_task,
ni, mol, grids, xc_code, fxc, mo_coeff, mo1, mocc,
diff --git a/gpu4pyscf/hessian/uks.py b/gpu4pyscf/hessian/uks.py
index 0c2995f8..620331e0 100644
--- a/gpu4pyscf/hessian/uks.py
+++ b/gpu4pyscf/hessian/uks.py
@@ -47,8 +47,8 @@ def partial_hess_elec(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None,
dm0b = moccb.dot(moccb.T)
dm0 = cp.asarray((dm0a, dm0b))
- if mf.nlc != '':
- raise NotImplementedError
+ if mf.do_nlc():
+ raise NotImplementedError("2nd derivative of NLC is not implemented.")
omega, alpha, hyb = ni.rsh_and_hybrid_coeff(mf.xc, spin=mol.spin)
with_k = ni.libxc.is_hybrid_xc(mf.xc)
diff --git a/gpu4pyscf/lib/cupy_helper.py b/gpu4pyscf/lib/cupy_helper.py
index fe197c71..4c62d8db 100644
--- a/gpu4pyscf/lib/cupy_helper.py
+++ b/gpu4pyscf/lib/cupy_helper.py
@@ -23,8 +23,8 @@
from gpu4pyscf.gto import mole
from gpu4pyscf.lib.cutensor import contract
from gpu4pyscf.lib.cusolver import eigh, cholesky #NOQA
-from gpu4pyscf.lib.memcpy import copy_array #NOQA
-from gpu4pyscf.__config__ import _streams, _num_devices, _p2p_access
+from gpu4pyscf.lib.memcpy import copy_array, p2p_transfer #NOQA
+from gpu4pyscf.__config__ import _streams, num_devices, _p2p_access
LMAX_ON_GPU = 7
DSOLVE_LINDEP = 1e-13
@@ -81,23 +81,6 @@ def get_avail_mem():
mem_avail = cupy.cuda.runtime.memGetInfo()[0]
return mem_avail + total_mem - used_mem
-def p2p_transfer(a, b):
- ''' If the direct P2P data transfer is not available, transfer data via CPU memory
- '''
- if a.device == b.device:
- a[:] = b
- elif _p2p_access:
- a[:] = b
- '''
- elif a.strides == b.strides and a.flags.c_contiguous and a.dtype == b.dtype:
- # cupy supports a direct copy from different devices without p2p. See also
- # https://github.com/cupy/cupy/blob/v13.3.0/cupy/_core/_routines_indexing.pyx#L48
- # https://github.com/cupy/cupy/blob/v13.3.0/cupy/_core/_routines_indexing.pyx#L1015
- a[:] = b
- '''
- else:
- copy_array(b, a)
-
def concatenate(array_list):
''' Concatenate axis=0 only
'''
@@ -126,8 +109,8 @@ def reduce_to_device(array_list, inplace=False):
''' Reduce a list of ndarray in different devices to device 0
TODO: reduce memory footprint, improve throughput
'''
- assert len(array_list) == _num_devices
- if _num_devices == 1:
+ assert len(array_list) == num_devices
+ if num_devices == 1:
return array_list[0]
out_shape = array_list[0].shape
diff --git a/gpu4pyscf/lib/cusolver.py b/gpu4pyscf/lib/cusolver.py
index 5c8d2dd6..393d7d96 100644
--- a/gpu4pyscf/lib/cusolver.py
+++ b/gpu4pyscf/lib/cusolver.py
@@ -16,11 +16,13 @@
import numpy as np
import cupy
import ctypes
+from ctypes.util import find_library
from cupy_backends.cuda.libs import cusolver
from cupy_backends.cuda.libs import cublas
from cupy.cuda import device
-libcusolver = ctypes.CDLL('libcusolver.so')
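+# Resolve the cuSOLVER shared library via find_library, which also matches
+# versioned names (e.g. libcusolver.so.11) when the unversioned dev symlink is
+# not installed.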
+libcusolver = find_library('cusolver')
+libcusolver = ctypes.CDLL(libcusolver)
CUSOLVER_EIG_TYPE_1 = 1
CUSOLVER_EIG_TYPE_2 = 2
diff --git a/gpu4pyscf/lib/cutensor.py b/gpu4pyscf/lib/cutensor.py
index 0599e39a..034076ab 100644
--- a/gpu4pyscf/lib/cutensor.py
+++ b/gpu4pyscf/lib/cutensor.py
@@ -103,10 +103,11 @@ def contraction(
ws = cupy.empty(ws_size, dtype=np.int8)
out = c
- alpha = np.asarray(alpha)
- beta = np.asarray(beta)
+ alpha = np.asarray(alpha, dtype=dtype)
+ beta = np.asarray(beta, dtype=dtype)
- cutensor_backend.contract(cutensor._get_handle().ptr, plan.ptr,
+ handler = cutensor._get_handle()
+ cutensor_backend.contract(handler.ptr, plan.ptr,
alpha.ctypes.data, a.data.ptr, b.data.ptr,
beta.ctypes.data, c.data.ptr, out.data.ptr,
ws.data.ptr, ws_size)
@@ -114,13 +115,10 @@ def contraction(
return out
import os
-if 'CONTRACT_ENGINE' in os.environ:
- contract_engine = os.environ['CONTRACT_ENGINE']
-else:
- contract_engine = None
-
+contract_engine = None
if cutensor is None:
contract_engine = 'cupy' # default contraction engine
+contract_engine = os.environ.get('CONTRACT_ENGINE', contract_engine)
# override the 'contract' function if einsum is customized or cutensor is not found
if contract_engine is not None:
@@ -139,10 +137,15 @@ def contraction(
warnings.warn(f'using {contract_engine} as the tensor contraction engine.')
def contract(pattern, a, b, alpha=1.0, beta=0.0, out=None):
if out is None:
- return cupy.asarray(einsum(pattern, a, b), order='C')
+ out = einsum(pattern, a, b)
+ out *= alpha
+ elif beta == 0.:
+ out[:] = einsum(pattern, a, b)
+ out *= alpha
else:
- out[:] = alpha*einsum(pattern, a, b) + beta*out
- return cupy.asarray(out, order='C')
+ out *= beta
+ out += alpha*einsum(pattern, a, b)
+ return cupy.asarray(out, order='C')
else:
def contract(pattern, a, b, alpha=1.0, beta=0.0, out=None):
'''
diff --git a/gpu4pyscf/lib/gint/CMakeLists.txt b/gpu4pyscf/lib/gint/CMakeLists.txt
index 464efed6..7647c2c3 100644
--- a/gpu4pyscf/lib/gint/CMakeLists.txt
+++ b/gpu4pyscf/lib/gint/CMakeLists.txt
@@ -26,6 +26,7 @@ add_library(gint SHARED
nr_fill_ao_ints.cu
nr_fill_ao_int3c1e.cu
nr_fill_ao_int3c1e_ip.cu
+ nr_fill_ao_int3c1e_ipip.cu
nr_fill_ao_int3c2e.cu
nr_fill_ao_int3c2e_ip1.cu
nr_fill_ao_int3c2e_ip2.cu
diff --git a/gpu4pyscf/lib/gint/g3c1e_ipip.cu b/gpu4pyscf/lib/gint/g3c1e_ipip.cu
new file mode 100644
index 00000000..87ebb270
--- /dev/null
+++ b/gpu4pyscf/lib/gint/g3c1e_ipip.cu
@@ -0,0 +1,635 @@
+/*
+ * Copyright 2021-2024 The PySCF Developers. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "gint.h"
+
+template <int NROOTS>
+__device__
+static void GINTwrite_int3c1e_ipip1_charge_contracted(const double* g, double* local_output, const double minus_two_a, const double prefactor, const int i_l, const int j_l)
+{
+ const int *idx = c_idx;
+ const int *idy = c_idx + TOT_NF;
+ const int *idz = c_idx + TOT_NF * 2;
+
+ const int g_size = NROOTS * (i_l + 2 + 1) * (j_l + 1);
+ const double* __restrict__ gx = g;
+ const double* __restrict__ gy = g + g_size;
+ const double* __restrict__ gz = g + g_size * 2;
+
+ const int n_density_elements_i = (i_l + 1) * (i_l + 2) / 2;
+ const int n_density_elements_j = (j_l + 1) * (j_l + 2) / 2;
+ const int n_density_elements_ij = n_density_elements_i * n_density_elements_j;
+ for (int j = 0; j < n_density_elements_j; j++) {
+ for (int i = 0; i < n_density_elements_i; i++) {
+ const int loc_j = c_l_locs[j_l] + j;
+ const int loc_i = c_l_locs[i_l] + i;
+ const int ix = idx[loc_i];
+ const int iy = idy[loc_i];
+ const int iz = idz[loc_i];
+ const int jx = idx[loc_j];
+ const int jy = idy[loc_j];
+ const int jz = idz[loc_j];
+ const int gx_offset = ix + jx * (i_l + 2 + 1);
+ const int gy_offset = iy + jy * (i_l + 2 + 1);
+ const int gz_offset = iz + jz * (i_l + 2 + 1);
+
+ double d2eri_dAxdAx = 0;
+ double d2eri_dAxdAy = 0;
+ double d2eri_dAxdAz = 0;
+ double d2eri_dAydAy = 0;
+ double d2eri_dAydAz = 0;
+ double d2eri_dAzdAz = 0;
+#pragma unroll
+ for (int i_root = 0; i_root < NROOTS; i_root++) {
+ const double gx_minus_2 = (ix >= 2 ? gx[(gx_offset - 2) * NROOTS + i_root] : 0);
+ const double gy_minus_2 = (iy >= 2 ? gy[(gy_offset - 2) * NROOTS + i_root] : 0);
+ const double gz_minus_2 = (iz >= 2 ? gz[(gz_offset - 2) * NROOTS + i_root] : 0);
+ const double gx_minus_1 = (ix >= 1 ? gx[(gx_offset - 1) * NROOTS + i_root] : 0);
+ const double gy_minus_1 = (iy >= 1 ? gy[(gy_offset - 1) * NROOTS + i_root] : 0);
+ const double gz_minus_1 = (iz >= 1 ? gz[(gz_offset - 1) * NROOTS + i_root] : 0);
+ const double gx_0 = gx[gx_offset * NROOTS + i_root];
+ const double gy_0 = gy[gy_offset * NROOTS + i_root];
+ const double gz_0 = gz[gz_offset * NROOTS + i_root];
+ const double gx_1 = gx[(gx_offset + 1) * NROOTS + i_root];
+ const double gy_1 = gy[(gy_offset + 1) * NROOTS + i_root];
+ const double gz_1 = gz[(gz_offset + 1) * NROOTS + i_root];
+ const double gx_2 = gx[(gx_offset + 2) * NROOTS + i_root];
+ const double gy_2 = gy[(gy_offset + 2) * NROOTS + i_root];
+ const double gz_2 = gz[(gz_offset + 2) * NROOTS + i_root];
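+                // First and second derivatives of each 1D Cartesian factor with respect
+                // to the bra center combine neighbouring angular entries
+                // (minus_two_a = -2 * primitive exponent):
+                //   dg_i  = i * g_(i-1) + minus_two_a * g_(i+1)
+                //   d2g_i = i*(i-1) * g_(i-2) + minus_two_a*(2i+1) * g_i + minus_two_a^2 * g_(i+2)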
+ const double dgx_dAx = ix * gx_minus_1 + minus_two_a * gx_1;
+ const double dgy_dAy = iy * gy_minus_1 + minus_two_a * gy_1;
+ const double dgz_dAz = iz * gz_minus_1 + minus_two_a * gz_1;
+ const double d2gx_dAx2 = ix * (ix - 1) * gx_minus_2 + minus_two_a * (2 * ix + 1) * gx_0 + minus_two_a * minus_two_a * gx_2;
+ const double d2gy_dAy2 = iy * (iy - 1) * gy_minus_2 + minus_two_a * (2 * iy + 1) * gy_0 + minus_two_a * minus_two_a * gy_2;
+ const double d2gz_dAz2 = iz * (iz - 1) * gz_minus_2 + minus_two_a * (2 * iz + 1) * gz_0 + minus_two_a * minus_two_a * gz_2;
+ d2eri_dAxdAx += d2gx_dAx2 * gy_0 * gz_0;
+ d2eri_dAxdAy += dgx_dAx * dgy_dAy * gz_0;
+ d2eri_dAxdAz += dgx_dAx * gy_0 * dgz_dAz;
+ d2eri_dAydAy += gx_0 * d2gy_dAy2 * gz_0;
+ d2eri_dAydAz += gx_0 * dgy_dAy * dgz_dAz;
+ d2eri_dAzdAz += gx_0 * gy_0 * d2gz_dAz2;
+ }
+ local_output[i + j * n_density_elements_i + 0 * n_density_elements_ij] += d2eri_dAxdAx * prefactor;
+ local_output[i + j * n_density_elements_i + 1 * n_density_elements_ij] += d2eri_dAxdAy * prefactor;
+ local_output[i + j * n_density_elements_i + 2 * n_density_elements_ij] += d2eri_dAxdAz * prefactor;
+ local_output[i + j * n_density_elements_i + 3 * n_density_elements_ij] += d2eri_dAydAy * prefactor;
+ local_output[i + j * n_density_elements_i + 4 * n_density_elements_ij] += d2eri_dAydAz * prefactor;
+ local_output[i + j * n_density_elements_i + 5 * n_density_elements_ij] += d2eri_dAzdAz * prefactor;
+ }
+ }
+}
+
+template <int NROOTS, int GSIZE_INT3C_1E>
+__global__
+static void GINTfill_int3c1e_ipip1_charge_contracted_kernel_general(double* output, const BasisProdOffsets offsets, const int i_l, const int j_l, const int nprim_ij,
+ const int stride_j, const int stride_ij, const int ao_offsets_i, const int ao_offsets_j,
+ const double omega, const double* grid_points, const double* charge_exponents)
+{
+ const int ntasks_ij = offsets.ntasks_ij;
+ const int ngrids = offsets.ntasks_kl;
+ const int task_ij = blockIdx.x * blockDim.x + threadIdx.x;
+ if (task_ij >= ntasks_ij) {
+ return;
+ }
+
+ const int bas_ij = offsets.bas_ij + task_ij;
+ const int prim_ij = offsets.primitive_ij + task_ij * nprim_ij;
+ const int* bas_pair2bra = c_bpcache.bas_pair2bra;
+ const int* bas_pair2ket = c_bpcache.bas_pair2ket;
+ const int ish = bas_pair2bra[bas_ij];
+ const int jsh = bas_pair2ket[bas_ij];
+ const double* __restrict__ a_exponents = c_bpcache.a1;
+
+ constexpr int l_sum_max = (NROOTS - 1) * 2 + 1;
+ constexpr int l_i_max_density_elements = (l_sum_max + 1) / 2;
+ constexpr int l_j_max_density_elements = l_sum_max - l_i_max_density_elements;
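+    // Per-thread register accumulator: this thread owns one shell pair, contracts
+    // the integrals with the point charges over its strided slice of grid points,
+    // and flushes the 6 unique derivative components once at the end via atomicAdd.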
+ double output_cache[(l_i_max_density_elements + 1) * (l_i_max_density_elements + 2) / 2
+ * (l_j_max_density_elements + 1) * (l_j_max_density_elements + 2) / 2
+ * 6] { 0.0 };
+
+ for (int task_grid = blockIdx.y * blockDim.y + threadIdx.y; task_grid < ngrids; task_grid += gridDim.y * blockDim.y) {
+ const double* grid_point = grid_points + task_grid * 4;
+ const double charge = grid_point[3];
+ const double charge_exponent = (charge_exponents != NULL) ? charge_exponents[task_grid] : 0.0;
+
+ double g[GSIZE_INT3C_1E];
+
+ for (int ij = prim_ij; ij < prim_ij+nprim_ij; ++ij) {
+            GINT_g1e<NROOTS>(g, grid_point, ish, jsh, ij, i_l + 2, j_l, charge_exponent, omega);
+ const double minus_two_a = -2.0 * a_exponents[ij];
+            GINTwrite_int3c1e_ipip1_charge_contracted<NROOTS>(g, output_cache, minus_two_a, charge, i_l, j_l);
+ }
+ }
+
+ const int* ao_loc = c_bpcache.ao_loc;
+
+ const int i0 = ao_loc[ish] - ao_offsets_i;
+ const int j0 = ao_loc[jsh] - ao_offsets_j;
+ const int n_density_elements_i = (i_l + 1) * (i_l + 2) / 2;
+ const int n_density_elements_j = (j_l + 1) * (j_l + 2) / 2;
+ const int n_density_elements_ij = n_density_elements_i * n_density_elements_j;
+ for (int j = 0; j < n_density_elements_j; j++) {
+ for (int i = 0; i < n_density_elements_i; i++) {
+ const double d2eri_dAxdAx = output_cache[i + j * n_density_elements_i + 0 * n_density_elements_ij];
+ const double d2eri_dAxdAy = output_cache[i + j * n_density_elements_i + 1 * n_density_elements_ij];
+ const double d2eri_dAxdAz = output_cache[i + j * n_density_elements_i + 2 * n_density_elements_ij];
+ const double d2eri_dAydAy = output_cache[i + j * n_density_elements_i + 3 * n_density_elements_ij];
+ const double d2eri_dAydAz = output_cache[i + j * n_density_elements_i + 4 * n_density_elements_ij];
+ const double d2eri_dAzdAz = output_cache[i + j * n_density_elements_i + 5 * n_density_elements_ij];
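+            // d2/dAdA is symmetric, so only 6 unique components were accumulated;
+            // they are written to the row-major 3x3 slots 0,1,2,4,5,8 (xx,xy,xz,
+            // yy,yz,zz), and the mirrored slots (3, 6, 7) are left to the caller.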
+ atomicAdd(output + ((i + i0) + (j + j0) * stride_j + 0 * stride_ij), d2eri_dAxdAx);
+ atomicAdd(output + ((i + i0) + (j + j0) * stride_j + 1 * stride_ij), d2eri_dAxdAy);
+ atomicAdd(output + ((i + i0) + (j + j0) * stride_j + 2 * stride_ij), d2eri_dAxdAz);
+ atomicAdd(output + ((i + i0) + (j + j0) * stride_j + 4 * stride_ij), d2eri_dAydAy);
+ atomicAdd(output + ((i + i0) + (j + j0) * stride_j + 5 * stride_ij), d2eri_dAydAz);
+ atomicAdd(output + ((i + i0) + (j + j0) * stride_j + 8 * stride_ij), d2eri_dAzdAz);
+ }
+ }
+}
+
+template <int NROOTS>
+__device__
+static void GINTwrite_int3c1e_ipvip1_charge_contracted(const double* g, double* local_output, const double minus_two_a, const double minus_two_b, const double prefactor, const int i_l, const int j_l)
+{
+ const int *idx = c_idx;
+ const int *idy = c_idx + TOT_NF;
+ const int *idz = c_idx + TOT_NF * 2;
+
+ const int g_size = NROOTS * (i_l + 1 + 1) * (j_l + 1 + 1);
+ const double* __restrict__ gx = g;
+ const double* __restrict__ gy = g + g_size;
+ const double* __restrict__ gz = g + g_size * 2;
+
+ const int n_density_elements_i = (i_l + 1) * (i_l + 2) / 2;
+ const int n_density_elements_j = (j_l + 1) * (j_l + 2) / 2;
+ const int n_density_elements_ij = n_density_elements_i * n_density_elements_j;
+ for (int j = 0; j < n_density_elements_j; j++) {
+ for (int i = 0; i < n_density_elements_i; i++) {
+ const int loc_j = c_l_locs[j_l] + j;
+ const int loc_i = c_l_locs[i_l] + i;
+ const int ix = idx[loc_i];
+ const int iy = idy[loc_i];
+ const int iz = idz[loc_i];
+ const int jx = idx[loc_j];
+ const int jy = idy[loc_j];
+ const int jz = idz[loc_j];
+ const int j_offset = i_l + 1 + 1;
+
+ double d2eri_dAxdBx = 0;
+ double d2eri_dAxdBy = 0;
+ double d2eri_dAxdBz = 0;
+ double d2eri_dAydBx = 0;
+ double d2eri_dAydBy = 0;
+ double d2eri_dAydBz = 0;
+ double d2eri_dAzdBx = 0;
+ double d2eri_dAzdBy = 0;
+ double d2eri_dAzdBz = 0;
+#pragma unroll
+ for (int i_root = 0; i_root < NROOTS; i_root++) {
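+                // Naming: gx_i_{di}_j_{dj} is the 1D factor with the bra angular index
+                // shifted by di and the ket index shifted by dj, pre-multiplied by the
+                // derivative weights (ix/jx when lowering, -2a/-2b when raising).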
+ const double gx_i_minus_1_j_minus_1 = ix * jx * (ix >= 1 && jx >= 1 ? gx[(ix - 1 + (jx - 1) * j_offset) * NROOTS + i_root] : 0);
+ const double gy_i_minus_1_j_minus_1 = iy * jy * (iy >= 1 && jy >= 1 ? gy[(iy - 1 + (jy - 1) * j_offset) * NROOTS + i_root] : 0);
+ const double gz_i_minus_1_j_minus_1 = iz * jz * (iz >= 1 && jz >= 1 ? gz[(iz - 1 + (jz - 1) * j_offset) * NROOTS + i_root] : 0);
+ const double gx_i_minus_1_j_1 = ix * minus_two_b * (ix >= 1 ? gx[(ix - 1 + (jx + 1) * j_offset) * NROOTS + i_root] : 0);
+ const double gy_i_minus_1_j_1 = iy * minus_two_b * (iy >= 1 ? gy[(iy - 1 + (jy + 1) * j_offset) * NROOTS + i_root] : 0);
+ const double gz_i_minus_1_j_1 = iz * minus_two_b * (iz >= 1 ? gz[(iz - 1 + (jz + 1) * j_offset) * NROOTS + i_root] : 0);
+ const double gx_i_1_j_minus_1 = jx * minus_two_a * (jx >= 1 ? gx[(ix + 1 + (jx - 1) * j_offset) * NROOTS + i_root] : 0);
+ const double gy_i_1_j_minus_1 = jy * minus_two_a * (jy >= 1 ? gy[(iy + 1 + (jy - 1) * j_offset) * NROOTS + i_root] : 0);
+ const double gz_i_1_j_minus_1 = jz * minus_two_a * (jz >= 1 ? gz[(iz + 1 + (jz - 1) * j_offset) * NROOTS + i_root] : 0);
+ const double gx_i_1_j_1 = minus_two_a * minus_two_b * gx[(ix + 1 + (jx + 1) * j_offset) * NROOTS + i_root];
+ const double gy_i_1_j_1 = minus_two_a * minus_two_b * gy[(iy + 1 + (jy + 1) * j_offset) * NROOTS + i_root];
+ const double gz_i_1_j_1 = minus_two_a * minus_two_b * gz[(iz + 1 + (jz + 1) * j_offset) * NROOTS + i_root];
+ const double gx_0 = gx[(ix + jx * j_offset) * NROOTS + i_root];
+ const double gy_0 = gy[(iy + jy * j_offset) * NROOTS + i_root];
+ const double gz_0 = gz[(iz + jz * j_offset) * NROOTS + i_root];
+ const double gx_i_1_j_0 = minus_two_a * gx[(ix + 1 + jx * j_offset) * NROOTS + i_root];
+ const double gy_i_1_j_0 = minus_two_a * gy[(iy + 1 + jy * j_offset) * NROOTS + i_root];
+ const double gz_i_1_j_0 = minus_two_a * gz[(iz + 1 + jz * j_offset) * NROOTS + i_root];
+ const double gx_i_minus_1_j_0 = ix * (ix >= 1 ? gx[(ix - 1 + jx * j_offset) * NROOTS + i_root] : 0);
+ const double gy_i_minus_1_j_0 = iy * (iy >= 1 ? gy[(iy - 1 + jy * j_offset) * NROOTS + i_root] : 0);
+ const double gz_i_minus_1_j_0 = iz * (iz >= 1 ? gz[(iz - 1 + jz * j_offset) * NROOTS + i_root] : 0);
+ const double gx_i_0_j_1 = minus_two_b * gx[(ix + (jx + 1) * j_offset) * NROOTS + i_root];
+ const double gy_i_0_j_1 = minus_two_b * gy[(iy + (jy + 1) * j_offset) * NROOTS + i_root];
+ const double gz_i_0_j_1 = minus_two_b * gz[(iz + (jz + 1) * j_offset) * NROOTS + i_root];
+ const double gx_i_0_j_minus_1 = jx * (jx >= 1 ? gx[(ix + (jx - 1) * j_offset) * NROOTS + i_root] : 0);
+ const double gy_i_0_j_minus_1 = jy * (jy >= 1 ? gy[(iy + (jy - 1) * j_offset) * NROOTS + i_root] : 0);
+ const double gz_i_0_j_minus_1 = jz * (jz >= 1 ? gz[(iz + (jz - 1) * j_offset) * NROOTS + i_root] : 0);
+
+ d2eri_dAxdBx += (gx_i_minus_1_j_minus_1 + gx_i_minus_1_j_1 + gx_i_1_j_minus_1 + gx_i_1_j_1) * gy_0 * gz_0;
+ d2eri_dAxdBy += (gx_i_minus_1_j_0 + gx_i_1_j_0) * (gy_i_0_j_minus_1 + gy_i_0_j_1) * gz_0;
+ d2eri_dAxdBz += (gx_i_minus_1_j_0 + gx_i_1_j_0) * gy_0 * (gz_i_0_j_minus_1 + gz_i_0_j_1);
+ d2eri_dAydBx += (gx_i_0_j_minus_1 + gx_i_0_j_1) * (gy_i_minus_1_j_0 + gy_i_1_j_0) * gz_0;
+ d2eri_dAydBy += gx_0 * (gy_i_minus_1_j_minus_1 + gy_i_minus_1_j_1 + gy_i_1_j_minus_1 + gy_i_1_j_1) * gz_0;
+ d2eri_dAydBz += gx_0 * (gy_i_minus_1_j_0 + gy_i_1_j_0) * (gz_i_0_j_minus_1 + gz_i_0_j_1);
+ d2eri_dAzdBx += (gx_i_0_j_minus_1 + gx_i_0_j_1) * gy_0 * (gz_i_minus_1_j_0 + gz_i_1_j_0);
+ d2eri_dAzdBy += gx_0 * (gy_i_0_j_minus_1 + gy_i_0_j_1) * (gz_i_minus_1_j_0 + gz_i_1_j_0);
+ d2eri_dAzdBz += gx_0 * gy_0 * (gz_i_minus_1_j_minus_1 + gz_i_minus_1_j_1 + gz_i_1_j_minus_1 + gz_i_1_j_1);
+ }
+ local_output[i + j * n_density_elements_i + 0 * n_density_elements_ij] += d2eri_dAxdBx * prefactor;
+ local_output[i + j * n_density_elements_i + 1 * n_density_elements_ij] += d2eri_dAxdBy * prefactor;
+ local_output[i + j * n_density_elements_i + 2 * n_density_elements_ij] += d2eri_dAxdBz * prefactor;
+ local_output[i + j * n_density_elements_i + 3 * n_density_elements_ij] += d2eri_dAydBx * prefactor;
+ local_output[i + j * n_density_elements_i + 4 * n_density_elements_ij] += d2eri_dAydBy * prefactor;
+ local_output[i + j * n_density_elements_i + 5 * n_density_elements_ij] += d2eri_dAydBz * prefactor;
+ local_output[i + j * n_density_elements_i + 6 * n_density_elements_ij] += d2eri_dAzdBx * prefactor;
+ local_output[i + j * n_density_elements_i + 7 * n_density_elements_ij] += d2eri_dAzdBy * prefactor;
+ local_output[i + j * n_density_elements_i + 8 * n_density_elements_ij] += d2eri_dAzdBz * prefactor;
+ }
+ }
+}
+
+template <int NROOTS, int GSIZE_INT3C_1E>
+__global__
+static void GINTfill_int3c1e_ipvip1_charge_contracted_kernel_general(double* output, const BasisProdOffsets offsets, const int i_l, const int j_l, const int nprim_ij,
+ const int stride_j, const int stride_ij, const int ao_offsets_i, const int ao_offsets_j,
+ const double omega, const double* grid_points, const double* charge_exponents)
+{
+ const int ntasks_ij = offsets.ntasks_ij;
+ const int ngrids = offsets.ntasks_kl;
+ const int task_ij = blockIdx.x * blockDim.x + threadIdx.x;
+ if (task_ij >= ntasks_ij) {
+ return;
+ }
+
+ const int bas_ij = offsets.bas_ij + task_ij;
+ const int prim_ij = offsets.primitive_ij + task_ij * nprim_ij;
+ const int* bas_pair2bra = c_bpcache.bas_pair2bra;
+ const int* bas_pair2ket = c_bpcache.bas_pair2ket;
+ const int ish = bas_pair2bra[bas_ij];
+ const int jsh = bas_pair2ket[bas_ij];
+ const double* __restrict__ a_exponents = c_bpcache.a1;
+ const double* __restrict__ b_exponents = c_bpcache.a2;
+
+ constexpr int l_sum_max = (NROOTS - 1) * 2 + 1;
+ constexpr int l_i_max_density_elements = (l_sum_max + 1) / 2;
+ constexpr int l_j_max_density_elements = l_sum_max - l_i_max_density_elements;
+ double output_cache[(l_i_max_density_elements + 1) * (l_i_max_density_elements + 2) / 2
+ * (l_j_max_density_elements + 1) * (l_j_max_density_elements + 2) / 2
+ * 9] { 0.0 };
+
+ for (int task_grid = blockIdx.y * blockDim.y + threadIdx.y; task_grid < ngrids; task_grid += gridDim.y * blockDim.y) {
+ const double* grid_point = grid_points + task_grid * 4;
+ const double charge = grid_point[3];
+ const double charge_exponent = (charge_exponents != NULL) ? charge_exponents[task_grid] : 0.0;
+
+ double g[GSIZE_INT3C_1E];
+
+ for (int ij = prim_ij; ij < prim_ij+nprim_ij; ++ij) {
+            GINT_g1e<NROOTS>(g, grid_point, ish, jsh, ij, i_l + 1, j_l + 1, charge_exponent, omega);
+ const double minus_two_a = -2.0 * a_exponents[ij];
+ const double minus_two_b = -2.0 * b_exponents[ij];
+            GINTwrite_int3c1e_ipvip1_charge_contracted<NROOTS>(g, output_cache, minus_two_a, minus_two_b, charge, i_l, j_l);
+ }
+ }
+
+ const int* ao_loc = c_bpcache.ao_loc;
+
+ const int i0 = ao_loc[ish] - ao_offsets_i;
+ const int j0 = ao_loc[jsh] - ao_offsets_j;
+ const int n_density_elements_i = (i_l + 1) * (i_l + 2) / 2;
+ const int n_density_elements_j = (j_l + 1) * (j_l + 2) / 2;
+ const int n_density_elements_ij = n_density_elements_i * n_density_elements_j;
+ for (int j = 0; j < n_density_elements_j; j++) {
+ for (int i = 0; i < n_density_elements_i; i++) {
+ const double d2eri_dAxdBx = output_cache[i + j * n_density_elements_i + 0 * n_density_elements_ij];
+ const double d2eri_dAxdBy = output_cache[i + j * n_density_elements_i + 1 * n_density_elements_ij];
+ const double d2eri_dAxdBz = output_cache[i + j * n_density_elements_i + 2 * n_density_elements_ij];
+ const double d2eri_dAydBx = output_cache[i + j * n_density_elements_i + 3 * n_density_elements_ij];
+ const double d2eri_dAydBy = output_cache[i + j * n_density_elements_i + 4 * n_density_elements_ij];
+ const double d2eri_dAydBz = output_cache[i + j * n_density_elements_i + 5 * n_density_elements_ij];
+ const double d2eri_dAzdBx = output_cache[i + j * n_density_elements_i + 6 * n_density_elements_ij];
+ const double d2eri_dAzdBy = output_cache[i + j * n_density_elements_i + 7 * n_density_elements_ij];
+ const double d2eri_dAzdBz = output_cache[i + j * n_density_elements_i + 8 * n_density_elements_ij];
+ atomicAdd(output + ((i + i0) + (j + j0) * stride_j + 0 * stride_ij), d2eri_dAxdBx);
+ atomicAdd(output + ((i + i0) + (j + j0) * stride_j + 1 * stride_ij), d2eri_dAxdBy);
+ atomicAdd(output + ((i + i0) + (j + j0) * stride_j + 2 * stride_ij), d2eri_dAxdBz);
+ atomicAdd(output + ((i + i0) + (j + j0) * stride_j + 3 * stride_ij), d2eri_dAydBx);
+ atomicAdd(output + ((i + i0) + (j + j0) * stride_j + 4 * stride_ij), d2eri_dAydBy);
+ atomicAdd(output + ((i + i0) + (j + j0) * stride_j + 5 * stride_ij), d2eri_dAydBz);
+ atomicAdd(output + ((i + i0) + (j + j0) * stride_j + 6 * stride_ij), d2eri_dAzdBx);
+ atomicAdd(output + ((i + i0) + (j + j0) * stride_j + 7 * stride_ij), d2eri_dAzdBy);
+ atomicAdd(output + ((i + i0) + (j + j0) * stride_j + 8 * stride_ij), d2eri_dAzdBz);
+ }
+ }
+}
+
+template <int NROOTS>
+__device__
+static void GINTwrite_int3c1e_ip1ip2_charge_contracted(const double* g, double* local_output, const double minus_two_a, const double* u2, const double* AC, const double prefactor, const int i_l, const int j_l)
+{
+ const int *idx = c_idx;
+ const int *idy = c_idx + TOT_NF;
+ const int *idz = c_idx + TOT_NF * 2;
+
+ const int g_size = NROOTS * (i_l + 2 + 1) * (j_l + 1);
+ const double* __restrict__ gx = g;
+ const double* __restrict__ gy = g + g_size;
+ const double* __restrict__ gz = g + g_size * 2;
+
+ const double ACx = AC[0];
+ const double ACy = AC[1];
+ const double ACz = AC[2];
+
+ const int n_density_elements_i = (i_l + 1) * (i_l + 2) / 2;
+ const int n_density_elements_j = (j_l + 1) * (j_l + 2) / 2;
+ const int n_density_elements_ij = n_density_elements_i * n_density_elements_j;
+ for (int j = 0; j < n_density_elements_j; j++) {
+ for (int i = 0; i < n_density_elements_i; i++) {
+ const int loc_j = c_l_locs[j_l] + j;
+ const int loc_i = c_l_locs[i_l] + i;
+ const int ix = idx[loc_i];
+ const int iy = idy[loc_i];
+ const int iz = idz[loc_i];
+ const int jx = idx[loc_j];
+ const int jy = idy[loc_j];
+ const int jz = idz[loc_j];
+ const int gx_offset = ix + jx * (i_l + 2 + 1);
+ const int gy_offset = iy + jy * (i_l + 2 + 1);
+ const int gz_offset = iz + jz * (i_l + 2 + 1);
+
+ double d2eri_dAxdCx = 0;
+ double d2eri_dAxdCy = 0;
+ double d2eri_dAxdCz = 0;
+ double d2eri_dAydCx = 0;
+ double d2eri_dAydCy = 0;
+ double d2eri_dAydCz = 0;
+ double d2eri_dAzdCx = 0;
+ double d2eri_dAzdCy = 0;
+ double d2eri_dAzdCz = 0;
+#pragma unroll
+ for (int i_root = 0; i_root < NROOTS; i_root++) {
+ const double gx_minus_1 = (ix >= 1 ? gx[(gx_offset - 1) * NROOTS + i_root] : 0);
+ const double gy_minus_1 = (iy >= 1 ? gy[(gy_offset - 1) * NROOTS + i_root] : 0);
+ const double gz_minus_1 = (iz >= 1 ? gz[(gz_offset - 1) * NROOTS + i_root] : 0);
+ const double gx_0 = gx[gx_offset * NROOTS + i_root];
+ const double gy_0 = gy[gy_offset * NROOTS + i_root];
+ const double gz_0 = gz[gz_offset * NROOTS + i_root];
+ const double gx_1 = gx[(gx_offset + 1) * NROOTS + i_root];
+ const double gy_1 = gy[(gy_offset + 1) * NROOTS + i_root];
+ const double gz_1 = gz[(gz_offset + 1) * NROOTS + i_root];
+ const double gx_2 = gx[(gx_offset + 2) * NROOTS + i_root];
+ const double gy_2 = gy[(gy_offset + 2) * NROOTS + i_root];
+ const double gz_2 = gz[(gz_offset + 2) * NROOTS + i_root];
+
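+                // The A-derivative uses the same lowering/raising combination as in
+                // ipip1; the C-derivative enters through the Rys parameter u2:
+                //   dg/dC = 2*u2*(AC*g_0 + g_(+1))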
+ const double two_u2 = 2.0 * u2[i_root];
+ const double dgx_dAx = ix * gx_minus_1 + minus_two_a * gx_1;
+ const double dgy_dAy = iy * gy_minus_1 + minus_two_a * gy_1;
+ const double dgz_dAz = iz * gz_minus_1 + minus_two_a * gz_1;
+ const double dgx_dCx = two_u2 * (ACx * gx_0 + gx_1);
+ const double dgy_dCy = two_u2 * (ACy * gy_0 + gy_1);
+ const double dgz_dCz = two_u2 * (ACz * gz_0 + gz_1);
+ const double d2gx_dAxdCx = two_u2 * (ix * ACx * gx_minus_1 + ix * gx_0 + minus_two_a * ACx * gx_1 + minus_two_a * gx_2);
+ const double d2gy_dAydCy = two_u2 * (iy * ACy * gy_minus_1 + iy * gy_0 + minus_two_a * ACy * gy_1 + minus_two_a * gy_2);
+ const double d2gz_dAzdCz = two_u2 * (iz * ACz * gz_minus_1 + iz * gz_0 + minus_two_a * ACz * gz_1 + minus_two_a * gz_2);
+
+ d2eri_dAxdCx += - d2gx_dAxdCx * gy_0 * gz_0;
+ d2eri_dAxdCy += - dgx_dAx * dgy_dCy * gz_0;
+ d2eri_dAxdCz += - dgx_dAx * gy_0 * dgz_dCz;
+ d2eri_dAydCx += - dgx_dCx * dgy_dAy * gz_0;
+ d2eri_dAydCy += - gx_0 * d2gy_dAydCy * gz_0;
+ d2eri_dAydCz += - gx_0 * dgy_dAy * dgz_dCz;
+ d2eri_dAzdCx += - dgx_dCx * gy_0 * dgz_dAz;
+ d2eri_dAzdCy += - gx_0 * dgy_dCy * dgz_dAz;
+ d2eri_dAzdCz += - gx_0 * gy_0 * d2gz_dAzdCz;
+ }
+ local_output[i + j * n_density_elements_i + 0 * n_density_elements_ij] += d2eri_dAxdCx * prefactor;
+ local_output[i + j * n_density_elements_i + 1 * n_density_elements_ij] += d2eri_dAxdCy * prefactor;
+ local_output[i + j * n_density_elements_i + 2 * n_density_elements_ij] += d2eri_dAxdCz * prefactor;
+ local_output[i + j * n_density_elements_i + 3 * n_density_elements_ij] += d2eri_dAydCx * prefactor;
+ local_output[i + j * n_density_elements_i + 4 * n_density_elements_ij] += d2eri_dAydCy * prefactor;
+ local_output[i + j * n_density_elements_i + 5 * n_density_elements_ij] += d2eri_dAydCz * prefactor;
+ local_output[i + j * n_density_elements_i + 6 * n_density_elements_ij] += d2eri_dAzdCx * prefactor;
+ local_output[i + j * n_density_elements_i + 7 * n_density_elements_ij] += d2eri_dAzdCy * prefactor;
+ local_output[i + j * n_density_elements_i + 8 * n_density_elements_ij] += d2eri_dAzdCz * prefactor;
+ }
+ }
+}
+
+template <int NROOTS, int GSIZE_INT3C_1E>
+__global__
+static void GINTfill_int3c1e_ip1ip2_charge_contracted_kernel_general(double* output, const BasisProdOffsets offsets, const int i_l, const int j_l, const int nprim_ij,
+ const int stride_j, const int stride_ij, const int ao_offsets_i, const int ao_offsets_j,
+ const double omega, const double* grid_points, const double* charge_exponents)
+{
+ const int ntasks_ij = offsets.ntasks_ij;
+ const int ngrids = offsets.ntasks_kl;
+ const int task_ij = blockIdx.x * blockDim.x + threadIdx.x;
+ if (task_ij >= ntasks_ij) {
+ return;
+ }
+
+ const int bas_ij = offsets.bas_ij + task_ij;
+ const int prim_ij = offsets.primitive_ij + task_ij * nprim_ij;
+ const int* bas_pair2bra = c_bpcache.bas_pair2bra;
+ const int* bas_pair2ket = c_bpcache.bas_pair2ket;
+ const int ish = bas_pair2bra[bas_ij];
+ const int jsh = bas_pair2ket[bas_ij];
+ const double* __restrict__ a_exponents = c_bpcache.a1;
+
+ const int nbas = c_bpcache.nbas;
+ const double* __restrict__ bas_x = c_bpcache.bas_coords;
+ const double* __restrict__ bas_y = bas_x + nbas;
+ const double* __restrict__ bas_z = bas_y + nbas;
+ const double Ax = bas_x[ish];
+ const double Ay = bas_y[ish];
+ const double Az = bas_z[ish];
+
+ constexpr int l_sum_max = (NROOTS - 1) * 2 + 1;
+ constexpr int l_i_max_density_elements = (l_sum_max + 1) / 2;
+ constexpr int l_j_max_density_elements = l_sum_max - l_i_max_density_elements;
+ double output_cache[(l_i_max_density_elements + 1) * (l_i_max_density_elements + 2) / 2
+ * (l_j_max_density_elements + 1) * (l_j_max_density_elements + 2) / 2
+ * 9] { 0.0 };
+
+ for (int task_grid = blockIdx.y * blockDim.y + threadIdx.y; task_grid < ngrids; task_grid += gridDim.y * blockDim.y) {
+ const double* grid_point = grid_points + task_grid * 4;
+ const double Cx = grid_point[0];
+ const double Cy = grid_point[1];
+ const double Cz = grid_point[2];
+ const double charge = grid_point[3];
+ const double charge_exponent = (charge_exponents != NULL) ? charge_exponents[task_grid] : 0.0;
+
+ const double AC[3] { Ax - Cx, Ay - Cy, Az - Cz };
+
+ double g[GSIZE_INT3C_1E];
+ double u2[NROOTS];
+
+ for (int ij = prim_ij; ij < prim_ij+nprim_ij; ++ij) {
+            GINT_g1e_save_u2<NROOTS>(g, u2, grid_point, ish, jsh, ij, i_l + 2, j_l, charge_exponent, omega);
+ const double minus_two_a = -2.0 * a_exponents[ij];
+            GINTwrite_int3c1e_ip1ip2_charge_contracted<NROOTS>(g, output_cache, minus_two_a, u2, AC, charge, i_l, j_l);
+ }
+ }
+
+ const int* ao_loc = c_bpcache.ao_loc;
+
+ const int i0 = ao_loc[ish] - ao_offsets_i;
+ const int j0 = ao_loc[jsh] - ao_offsets_j;
+ const int n_density_elements_i = (i_l + 1) * (i_l + 2) / 2;
+ const int n_density_elements_j = (j_l + 1) * (j_l + 2) / 2;
+ const int n_density_elements_ij = n_density_elements_i * n_density_elements_j;
+ for (int j = 0; j < n_density_elements_j; j++) {
+ for (int i = 0; i < n_density_elements_i; i++) {
+ const double d2eri_dAxdCx = output_cache[i + j * n_density_elements_i + 0 * n_density_elements_ij];
+ const double d2eri_dAxdCy = output_cache[i + j * n_density_elements_i + 1 * n_density_elements_ij];
+ const double d2eri_dAxdCz = output_cache[i + j * n_density_elements_i + 2 * n_density_elements_ij];
+ const double d2eri_dAydCx = output_cache[i + j * n_density_elements_i + 3 * n_density_elements_ij];
+ const double d2eri_dAydCy = output_cache[i + j * n_density_elements_i + 4 * n_density_elements_ij];
+ const double d2eri_dAydCz = output_cache[i + j * n_density_elements_i + 5 * n_density_elements_ij];
+ const double d2eri_dAzdCx = output_cache[i + j * n_density_elements_i + 6 * n_density_elements_ij];
+ const double d2eri_dAzdCy = output_cache[i + j * n_density_elements_i + 7 * n_density_elements_ij];
+ const double d2eri_dAzdCz = output_cache[i + j * n_density_elements_i + 8 * n_density_elements_ij];
+ atomicAdd(output + ((i + i0) + (j + j0) * stride_j + 0 * stride_ij), d2eri_dAxdCx);
+ atomicAdd(output + ((i + i0) + (j + j0) * stride_j + 1 * stride_ij), d2eri_dAxdCy);
+ atomicAdd(output + ((i + i0) + (j + j0) * stride_j + 2 * stride_ij), d2eri_dAxdCz);
+ atomicAdd(output + ((i + i0) + (j + j0) * stride_j + 3 * stride_ij), d2eri_dAydCx);
+ atomicAdd(output + ((i + i0) + (j + j0) * stride_j + 4 * stride_ij), d2eri_dAydCy);
+ atomicAdd(output + ((i + i0) + (j + j0) * stride_j + 5 * stride_ij), d2eri_dAydCz);
+ atomicAdd(output + ((i + i0) + (j + j0) * stride_j + 6 * stride_ij), d2eri_dAzdCx);
+ atomicAdd(output + ((i + i0) + (j + j0) * stride_j + 7 * stride_ij), d2eri_dAzdCy);
+ atomicAdd(output + ((i + i0) + (j + j0) * stride_j + 8 * stride_ij), d2eri_dAzdCz);
+ }
+ }
+}
+
+template <int L_SUM>
+__global__
+static void GINTfill_int3c1e_ipip2_density_contracted_kernel_general(double* output, const double* density, const HermiteDensityOffsets hermite_density_offsets,
+ const BasisProdOffsets offsets, const int nprim_ij,
+ const double omega, const double* grid_points, const double* charge_exponents)
+{
+ constexpr int NROOTS = (L_SUM + 2) / 2 + 1;
+
+ const int ntasks_ij = offsets.ntasks_ij;
+ const int ngrids = offsets.ntasks_kl;
+ const int task_grid = blockIdx.y * blockDim.y + threadIdx.y;
+ if (task_grid >= ngrids) {
+ return;
+ }
+
+ const double* grid_point = grid_points + task_grid * 3;
+ const double Cx = grid_point[0];
+ const double Cy = grid_point[1];
+ const double Cz = grid_point[2];
+ const double charge_exponent = (charge_exponents != NULL) ? charge_exponents[task_grid] : 0.0;
+
+ double d2eri_dCxdCx_pair_sum = 0.0;
+ double d2eri_dCxdCy_pair_sum = 0.0;
+ double d2eri_dCxdCz_pair_sum = 0.0;
+ double d2eri_dCydCy_pair_sum = 0.0;
+ double d2eri_dCydCz_pair_sum = 0.0;
+ double d2eri_dCzdCz_pair_sum = 0.0;
+ for (int task_ij = blockIdx.x * blockDim.x + threadIdx.x; task_ij < ntasks_ij; task_ij += gridDim.x * blockDim.x) {
+
+ const int bas_ij = offsets.bas_ij + task_ij;
+ const int prim_ij = offsets.primitive_ij + task_ij * nprim_ij;
+ const int* bas_pair2bra = c_bpcache.bas_pair2bra;
+ // const int* bas_pair2ket = c_bpcache.bas_pair2ket;
+ const int ish = bas_pair2bra[bas_ij];
+ // const int jsh = bas_pair2ket[bas_ij];
+ const int nbas = c_bpcache.nbas;
+ const double* __restrict__ bas_x = c_bpcache.bas_coords;
+ const double* __restrict__ bas_y = bas_x + nbas;
+ const double* __restrict__ bas_z = bas_y + nbas;
+ const double Ax = bas_x[ish];
+ const double Ay = bas_y[ish];
+ const double Az = bas_z[ish];
+
+ const double ACx = Ax - Cx;
+ const double ACy = Ay - Cy;
+ const double ACz = Az - Cz;
+
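+        // Gather the Hermite expansion coefficients of the contracted density for
+        // this shell pair; the layout is described by hermite_density_offsets.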
+ double D_hermite[(L_SUM + 1) * (L_SUM + 2) * (L_SUM + 3) / 6];
+#pragma unroll
+ for (int i_t = 0; i_t < (L_SUM + 1) * (L_SUM + 2) * (L_SUM + 3) / 6; i_t++) {
+ D_hermite[i_t] = density[bas_ij - hermite_density_offsets.pair_offset_of_angular_pair + hermite_density_offsets.density_offset_of_angular_pair + i_t * hermite_density_offsets.n_pair_of_angular_pair];
+ }
+
+ double d2eri_dCxdCx = 0.0;
+ double d2eri_dCxdCy = 0.0;
+ double d2eri_dCxdCz = 0.0;
+ double d2eri_dCydCy = 0.0;
+ double d2eri_dCydCz = 0.0;
+ double d2eri_dCzdCz = 0.0;
+ for (int ij = prim_ij; ij < prim_ij+nprim_ij; ++ij) {
+ double g[NROOTS * (L_SUM + 2 + 1) * 3];
+ double u2[NROOTS];
+            GINT_g1e_without_hrr_save_u2<L_SUM + 2>(g, u2, Cx, Cy, Cz, ish, ij, charge_exponent, omega);
+
+ const double* __restrict__ gx = g;
+ const double* __restrict__ gy = g + NROOTS * (L_SUM + 2 + 1);
+ const double* __restrict__ gz = g + NROOTS * (L_SUM + 2 + 1) * 2;
+
+#pragma unroll
+ for (int i_x = 0, i_t = 0; i_x <= L_SUM; i_x++) {
+#pragma unroll
+ for (int i_y = 0; i_x + i_y <= L_SUM; i_y++) {
+#pragma unroll
+ for (int i_z = 0; i_x + i_y + i_z <= L_SUM; i_z++, i_t++) {
+ double d2eri_dCxdCx_per_hermite = 0.0;
+ double d2eri_dCxdCy_per_hermite = 0.0;
+ double d2eri_dCxdCz_per_hermite = 0.0;
+ double d2eri_dCydCy_per_hermite = 0.0;
+ double d2eri_dCydCz_per_hermite = 0.0;
+ double d2eri_dCzdCz_per_hermite = 0.0;
+#pragma unroll
+ for (int i_root = 0; i_root < NROOTS; i_root++) {
+ const double gx_0 = gx[i_root + NROOTS * i_x];
+ const double gy_0 = gy[i_root + NROOTS * i_y];
+ const double gz_0 = gz[i_root + NROOTS * i_z];
+ const double gx_1 = gx[i_root + NROOTS * (i_x + 1)];
+ const double gy_1 = gy[i_root + NROOTS * (i_y + 1)];
+ const double gz_1 = gz[i_root + NROOTS * (i_z + 1)];
+ const double gx_2 = gx[i_root + NROOTS * (i_x + 2)];
+ const double gy_2 = gy[i_root + NROOTS * (i_y + 2)];
+ const double gz_2 = gz[i_root + NROOTS * (i_z + 2)];
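+                                // Derivatives w.r.t. the grid point C enter through the
+                                // Rys parameter u2; the second derivative is
+                                //   d2g/dC2 = -2*u2*g_0 + (2*u2)^2*(g_2 + 2*AC*g_1 + AC^2*g_0)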
+ const double two_u2 = 2.0 * u2[i_root];
+ const double dgx_dCx = two_u2 * (gx_1 + ACx * gx_0);
+ const double dgy_dCy = two_u2 * (gy_1 + ACy * gy_0);
+ const double dgz_dCz = two_u2 * (gz_1 + ACz * gz_0);
+ const double d2gx_dCx2 = two_u2 * (-gx_0 + two_u2 * (gx_2 + ACx * gx_1 * 2 + ACx * ACx * gx_0));
+ const double d2gy_dCy2 = two_u2 * (-gy_0 + two_u2 * (gy_2 + ACy * gy_1 * 2 + ACy * ACy * gy_0));
+ const double d2gz_dCz2 = two_u2 * (-gz_0 + two_u2 * (gz_2 + ACz * gz_1 * 2 + ACz * ACz * gz_0));
+ d2eri_dCxdCx_per_hermite += d2gx_dCx2 * gy_0 * gz_0;
+ d2eri_dCxdCy_per_hermite += dgx_dCx * dgy_dCy * gz_0;
+ d2eri_dCxdCz_per_hermite += dgx_dCx * gy_0 * dgz_dCz;
+ d2eri_dCydCy_per_hermite += gx_0 * d2gy_dCy2 * gz_0;
+ d2eri_dCydCz_per_hermite += gx_0 * dgy_dCy * dgz_dCz;
+ d2eri_dCzdCz_per_hermite += gx_0 * gy_0 * d2gz_dCz2;
+ }
+ const double D_t = D_hermite[i_t];
+ d2eri_dCxdCx += d2eri_dCxdCx_per_hermite * D_t;
+ d2eri_dCxdCy += d2eri_dCxdCy_per_hermite * D_t;
+ d2eri_dCxdCz += d2eri_dCxdCz_per_hermite * D_t;
+ d2eri_dCydCy += d2eri_dCydCy_per_hermite * D_t;
+ d2eri_dCydCz += d2eri_dCydCz_per_hermite * D_t;
+ d2eri_dCzdCz += d2eri_dCzdCz_per_hermite * D_t;
+ }
+ }
+ }
+ }
+ d2eri_dCxdCx_pair_sum += d2eri_dCxdCx;
+ d2eri_dCxdCy_pair_sum += d2eri_dCxdCy;
+ d2eri_dCxdCz_pair_sum += d2eri_dCxdCz;
+ d2eri_dCydCy_pair_sum += d2eri_dCydCy;
+ d2eri_dCydCz_pair_sum += d2eri_dCydCz;
+ d2eri_dCzdCz_pair_sum += d2eri_dCzdCz;
+ }
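+    // d2/dCdC is symmetric, so only the 6 unique components are accumulated, at
+    // the row-major 3x3 offsets 0,1,2,4,5,8 (xx,xy,xz,yy,yz,zz); the mirrored
+    // entries (3, 6, 7) are not written here.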
+ atomicAdd(output + task_grid + ngrids * 0, d2eri_dCxdCx_pair_sum);
+ atomicAdd(output + task_grid + ngrids * 1, d2eri_dCxdCy_pair_sum);
+ atomicAdd(output + task_grid + ngrids * 2, d2eri_dCxdCz_pair_sum);
+ atomicAdd(output + task_grid + ngrids * 4, d2eri_dCydCy_pair_sum);
+ atomicAdd(output + task_grid + ngrids * 5, d2eri_dCydCz_pair_sum);
+ atomicAdd(output + task_grid + ngrids * 8, d2eri_dCzdCz_pair_sum);
+}
diff --git a/gpu4pyscf/lib/gint/nr_fill_ao_int3c1e_ipip.cu b/gpu4pyscf/lib/gint/nr_fill_ao_int3c1e_ipip.cu
new file mode 100644
index 00000000..4f3a3dee
--- /dev/null
+++ b/gpu4pyscf/lib/gint/nr_fill_ao_int3c1e_ipip.cu
@@ -0,0 +1,361 @@
+/*
+ * Copyright 2021-2024 The PySCF Developers. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+#include <cuda_runtime.h>
+
+#include "gint.h"
+#include "gint1e.h"
+#include "cuda_alloc.cuh"
+#include "cint2e.cuh"
+
+#include "rys_roots.cu"
+#include "g1e.cu"
+#include "g3c1e_ipip.cu"
+
+static int GINTfill_int3c1e_ipip1_charge_contracted_tasks(double* output, const BasisProdOffsets offsets, const int i_l, const int j_l, const int nprim_ij,
+ const int stride_j, const int stride_ij, const int ao_offsets_i, const int ao_offsets_j,
+ const double omega, const double* grid_points, const double* charge_exponents,
+ const int n_charge_sum_per_thread, const cudaStream_t stream)
+{
+ const int ntasks_ij = offsets.ntasks_ij;
+ const int ngrids = (offsets.ntasks_kl + n_charge_sum_per_thread - 1) / n_charge_sum_per_thread;
+
+ const dim3 threads(THREADSX, THREADSY);
+ const dim3 blocks((ntasks_ij+THREADSX-1)/THREADSX, (ngrids+THREADSY-1)/THREADSY);
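+    // The two derivative operators raise the total angular momentum by 2, hence
+    // the "+ 2" in the Rys root count.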
+ const int nrys_roots = (i_l + j_l + 2) / 2 + 1;
+ switch (nrys_roots) {
+        case 2: GINTfill_int3c1e_ipip1_charge_contracted_kernel_general<2, GSIZE5_INT3C_1E> <<<blocks, threads, 0, stream>>>(output, offsets, i_l, j_l, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, omega, grid_points, charge_exponents); break;
+        case 3: GINTfill_int3c1e_ipip1_charge_contracted_kernel_general<3, GSIZE6_INT3C_1E> <<<blocks, threads, 0, stream>>>(output, offsets, i_l, j_l, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, omega, grid_points, charge_exponents); break;
+        case 4: GINTfill_int3c1e_ipip1_charge_contracted_kernel_general<4, GSIZE4_INT3C_1E> <<<blocks, threads, 0, stream>>>(output, offsets, i_l, j_l, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, omega, grid_points, charge_exponents); break;
+        case 5: GINTfill_int3c1e_ipip1_charge_contracted_kernel_general<5, GSIZE5_INT3C_1E> <<<blocks, threads, 0, stream>>>(output, offsets, i_l, j_l, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, omega, grid_points, charge_exponents); break;
+        case 6: GINTfill_int3c1e_ipip1_charge_contracted_kernel_general<6, GSIZE6_INT3C_1E> <<<blocks, threads, 0, stream>>>(output, offsets, i_l, j_l, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, omega, grid_points, charge_exponents); break;
+ default:
+ fprintf(stderr, "nrys_roots = %d out of range\n", nrys_roots);
+ return 1;
+ }
+
+ cudaError_t err = cudaGetLastError();
+ if (err != cudaSuccess) {
+ fprintf(stderr, "CUDA Error in %s: %s\n", __func__, cudaGetErrorString(err));
+ return 1;
+ }
+ return 0;
+}
+
+static int GINTfill_int3c1e_ipvip1_charge_contracted_tasks(double* output, const BasisProdOffsets offsets, const int i_l, const int j_l, const int nprim_ij,
+ const int stride_j, const int stride_ij, const int ao_offsets_i, const int ao_offsets_j,
+ const double omega, const double* grid_points, const double* charge_exponents,
+ const int n_charge_sum_per_thread, const cudaStream_t stream)
+{
+ const int ntasks_ij = offsets.ntasks_ij;
+ const int ngrids = (offsets.ntasks_kl + n_charge_sum_per_thread - 1) / n_charge_sum_per_thread;
+
+ const dim3 threads(THREADSX, THREADSY);
+ const dim3 blocks((ntasks_ij+THREADSX-1)/THREADSX, (ngrids+THREADSY-1)/THREADSY);
+ const int nrys_roots = (i_l + j_l + 2) / 2 + 1;
+ switch (nrys_roots) {
+        case 2: GINTfill_int3c1e_ipvip1_charge_contracted_kernel_general<2, GSIZE5_INT3C_1E> <<<blocks, threads, 0, stream>>>(output, offsets, i_l, j_l, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, omega, grid_points, charge_exponents); break;
+        case 3: GINTfill_int3c1e_ipvip1_charge_contracted_kernel_general<3, GSIZE6_INT3C_1E> <<<blocks, threads, 0, stream>>>(output, offsets, i_l, j_l, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, omega, grid_points, charge_exponents); break;
+        case 4: GINTfill_int3c1e_ipvip1_charge_contracted_kernel_general<4, GSIZE4_INT3C_1E> <<<blocks, threads, 0, stream>>>(output, offsets, i_l, j_l, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, omega, grid_points, charge_exponents); break;
+        case 5: GINTfill_int3c1e_ipvip1_charge_contracted_kernel_general<5, GSIZE5_INT3C_1E> <<<blocks, threads, 0, stream>>>(output, offsets, i_l, j_l, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, omega, grid_points, charge_exponents); break;
+        case 6: GINTfill_int3c1e_ipvip1_charge_contracted_kernel_general<6, GSIZE6_INT3C_1E> <<<blocks, threads, 0, stream>>>(output, offsets, i_l, j_l, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, omega, grid_points, charge_exponents); break;
+ default:
+ fprintf(stderr, "nrys_roots = %d out of range\n", nrys_roots);
+ return 1;
+ }
+
+ cudaError_t err = cudaGetLastError();
+ if (err != cudaSuccess) {
+ fprintf(stderr, "CUDA Error in %s: %s\n", __func__, cudaGetErrorString(err));
+ return 1;
+ }
+ return 0;
+}
+
+static int GINTfill_int3c1e_ip1ip2_charge_contracted_tasks(double* output, const BasisProdOffsets offsets, const int i_l, const int j_l, const int nprim_ij,
+ const int stride_j, const int stride_ij, const int ao_offsets_i, const int ao_offsets_j,
+ const double omega, const double* grid_points, const double* charge_exponents,
+ const int n_charge_sum_per_thread, const cudaStream_t stream)
+{
+ const int ntasks_ij = offsets.ntasks_ij;
+ const int ngrids = (offsets.ntasks_kl + n_charge_sum_per_thread - 1) / n_charge_sum_per_thread;
+
+ const dim3 threads(THREADSX, THREADSY);
+ const dim3 blocks((ntasks_ij+THREADSX-1)/THREADSX, (ngrids+THREADSY-1)/THREADSY);
+ const int nrys_roots = (i_l + j_l + 2) / 2 + 1;
+ switch (nrys_roots) {
+        case 2: GINTfill_int3c1e_ip1ip2_charge_contracted_kernel_general<2, GSIZE5_INT3C_1E> <<<blocks, threads, 0, stream>>>(output, offsets, i_l, j_l, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, omega, grid_points, charge_exponents); break;
+        case 3: GINTfill_int3c1e_ip1ip2_charge_contracted_kernel_general<3, GSIZE6_INT3C_1E> <<<blocks, threads, 0, stream>>>(output, offsets, i_l, j_l, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, omega, grid_points, charge_exponents); break;
+        case 4: GINTfill_int3c1e_ip1ip2_charge_contracted_kernel_general<4, GSIZE4_INT3C_1E> <<<blocks, threads, 0, stream>>>(output, offsets, i_l, j_l, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, omega, grid_points, charge_exponents); break;
+        case 5: GINTfill_int3c1e_ip1ip2_charge_contracted_kernel_general<5, GSIZE5_INT3C_1E> <<<blocks, threads, 0, stream>>>(output, offsets, i_l, j_l, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, omega, grid_points, charge_exponents); break;
+        case 6: GINTfill_int3c1e_ip1ip2_charge_contracted_kernel_general<6, GSIZE6_INT3C_1E> <<<blocks, threads, 0, stream>>>(output, offsets, i_l, j_l, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, omega, grid_points, charge_exponents); break;
+ default:
+ fprintf(stderr, "nrys_roots = %d out of range\n", nrys_roots);
+ return 1;
+ }
+
+ cudaError_t err = cudaGetLastError();
+ if (err != cudaSuccess) {
+ fprintf(stderr, "CUDA Error in %s: %s\n", __func__, cudaGetErrorString(err));
+ return 1;
+ }
+ return 0;
+}
+
+static int GINTfill_int3c1e_ipip2_density_contracted_tasks(double* output, const double* density, const HermiteDensityOffsets hermite_density_offsets,
+ const BasisProdOffsets offsets, const int i_l, const int j_l, const int nprim_ij,
+ const double omega, const double* grid_points, const double* charge_exponents,
+ const int n_pair_sum_per_thread, const cudaStream_t stream)
+{
+ const int ntasks_ij = (offsets.ntasks_ij + n_pair_sum_per_thread - 1) / n_pair_sum_per_thread;
+ const int ngrids = offsets.ntasks_kl;
+
+ const dim3 threads(THREADSX, THREADSY);
+ const dim3 blocks((ntasks_ij+THREADSX-1)/THREADSX, (ngrids+THREADSY-1)/THREADSY);
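+    // Dispatch on the total angular momentum of the shell pair; the kernel derives
+    // its Rys root count internally as (L_SUM + 2) / 2 + 1.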
+ switch (i_l + j_l) {
+        case 0: GINTfill_int3c1e_ipip2_density_contracted_kernel_general< 0> <<<blocks, threads, 0, stream>>>(output, density, hermite_density_offsets, offsets, nprim_ij, omega, grid_points, charge_exponents); break;
+        case 1: GINTfill_int3c1e_ipip2_density_contracted_kernel_general< 1> <<<blocks, threads, 0, stream>>>(output, density, hermite_density_offsets, offsets, nprim_ij, omega, grid_points, charge_exponents); break;
+        case 2: GINTfill_int3c1e_ipip2_density_contracted_kernel_general< 2> <<<blocks, threads, 0, stream>>>(output, density, hermite_density_offsets, offsets, nprim_ij, omega, grid_points, charge_exponents); break;
+        case 3: GINTfill_int3c1e_ipip2_density_contracted_kernel_general< 3> <<<blocks, threads, 0, stream>>>(output, density, hermite_density_offsets, offsets, nprim_ij, omega, grid_points, charge_exponents); break;
+        case 4: GINTfill_int3c1e_ipip2_density_contracted_kernel_general< 4> <<<blocks, threads, 0, stream>>>(output, density, hermite_density_offsets, offsets, nprim_ij, omega, grid_points, charge_exponents); break;
+        case 5: GINTfill_int3c1e_ipip2_density_contracted_kernel_general< 5> <<<blocks, threads, 0, stream>>>(output, density, hermite_density_offsets, offsets, nprim_ij, omega, grid_points, charge_exponents); break;
+        case 6: GINTfill_int3c1e_ipip2_density_contracted_kernel_general< 6> <<<blocks, threads, 0, stream>>>(output, density, hermite_density_offsets, offsets, nprim_ij, omega, grid_points, charge_exponents); break;
+        case 7: GINTfill_int3c1e_ipip2_density_contracted_kernel_general< 7> <<<blocks, threads, 0, stream>>>(output, density, hermite_density_offsets, offsets, nprim_ij, omega, grid_points, charge_exponents); break;
+        case 8: GINTfill_int3c1e_ipip2_density_contracted_kernel_general< 8> <<<blocks, threads, 0, stream>>>(output, density, hermite_density_offsets, offsets, nprim_ij, omega, grid_points, charge_exponents); break;
+ // Up to g + g = 8 now
+ default:
+ fprintf(stderr, "i_l + j_l = %d out of range\n", i_l + j_l);
+ return 1;
+ }
+
+ cudaError_t err = cudaGetLastError();
+ if (err != cudaSuccess) {
+ fprintf(stderr, "CUDA Error in %s: %s\n", __func__, cudaGetErrorString(err));
+ return 1;
+ }
+ return 0;
+}
+
+extern "C" {
+int GINTfill_int3c1e_ipip1_charge_contracted(const cudaStream_t stream, const BasisProdCache* bpcache,
+ const double* grid_points, const double* charge_exponents, const int ngrids,
+ double* integral_charge_contracted,
+ const int* strides, const int* ao_offsets,
+ const int* bins_locs_ij, const int nbins,
+ const int cp_ij_id, const double omega, const int n_charge_sum_per_thread)
+{
+ const ContractionProdType *cp_ij = bpcache->cptype + cp_ij_id;
+ const int i_l = cp_ij->l_bra;
+ const int j_l = cp_ij->l_ket;
+ const int nrys_roots = (i_l + j_l + 2) / 2 + 1;
+ const int nprim_ij = cp_ij->nprim_12;
+
+ if (nrys_roots > MAX_NROOTS_INT3C_1E + 2) {
+ fprintf(stderr, "nrys_roots = %d too high\n", nrys_roots);
+ return 2;
+ }
+
+ checkCudaErrors(cudaMemcpyToSymbol(c_bpcache, bpcache, sizeof(BasisProdCache)));
+
+ const int* bas_pairs_locs = bpcache->bas_pairs_locs;
+ const int* primitive_pairs_locs = bpcache->primitive_pairs_locs;
+ for (int ij_bin = 0; ij_bin < nbins; ij_bin++) {
+ const int bas_ij0 = bins_locs_ij[ij_bin];
+ const int bas_ij1 = bins_locs_ij[ij_bin + 1];
+ const int ntasks_ij = bas_ij1 - bas_ij0;
+ if (ntasks_ij <= 0) {
+ continue;
+ }
+
+ BasisProdOffsets offsets;
+ offsets.ntasks_ij = ntasks_ij;
+ offsets.ntasks_kl = ngrids;
+ offsets.bas_ij = bas_pairs_locs[cp_ij_id] + bas_ij0;
+ offsets.bas_kl = -1;
+ offsets.primitive_ij = primitive_pairs_locs[cp_ij_id] + bas_ij0 * nprim_ij;
+ offsets.primitive_kl = -1;
+
+ const int err = GINTfill_int3c1e_ipip1_charge_contracted_tasks(integral_charge_contracted, offsets, i_l, j_l, nprim_ij,
+ strides[0], strides[1], ao_offsets[0], ao_offsets[1],
+ omega, grid_points, charge_exponents, n_charge_sum_per_thread, stream);
+
+ if (err != 0) {
+ return err;
+ }
+ }
+
+ return 0;
+}
+
+int GINTfill_int3c1e_ipvip1_charge_contracted(const cudaStream_t stream, const BasisProdCache* bpcache,
+ const double* grid_points, const double* charge_exponents, const int ngrids,
+ double* integral_charge_contracted,
+ const int* strides, const int* ao_offsets,
+ const int* bins_locs_ij, const int nbins,
+ const int cp_ij_id, const double omega, const int n_charge_sum_per_thread)
+{
+ const ContractionProdType *cp_ij = bpcache->cptype + cp_ij_id;
+ const int i_l = cp_ij->l_bra;
+ const int j_l = cp_ij->l_ket;
+ const int nrys_roots = (i_l + j_l + 2) / 2 + 1;
+ const int nprim_ij = cp_ij->nprim_12;
+
+ if (nrys_roots > MAX_NROOTS_INT3C_1E + 2) {
+ fprintf(stderr, "nrys_roots = %d too high\n", nrys_roots);
+ return 2;
+ }
+
+ checkCudaErrors(cudaMemcpyToSymbol(c_bpcache, bpcache, sizeof(BasisProdCache)));
+
+ const int* bas_pairs_locs = bpcache->bas_pairs_locs;
+ const int* primitive_pairs_locs = bpcache->primitive_pairs_locs;
+ for (int ij_bin = 0; ij_bin < nbins; ij_bin++) {
+ const int bas_ij0 = bins_locs_ij[ij_bin];
+ const int bas_ij1 = bins_locs_ij[ij_bin + 1];
+ const int ntasks_ij = bas_ij1 - bas_ij0;
+ if (ntasks_ij <= 0) {
+ continue;
+ }
+
+ BasisProdOffsets offsets;
+ offsets.ntasks_ij = ntasks_ij;
+ offsets.ntasks_kl = ngrids;
+ offsets.bas_ij = bas_pairs_locs[cp_ij_id] + bas_ij0;
+ offsets.bas_kl = -1;
+ offsets.primitive_ij = primitive_pairs_locs[cp_ij_id] + bas_ij0 * nprim_ij;
+ offsets.primitive_kl = -1;
+
+ const int err = GINTfill_int3c1e_ipvip1_charge_contracted_tasks(integral_charge_contracted, offsets, i_l, j_l, nprim_ij,
+ strides[0], strides[1], ao_offsets[0], ao_offsets[1],
+ omega, grid_points, charge_exponents, n_charge_sum_per_thread, stream);
+
+ if (err != 0) {
+ return err;
+ }
+ }
+
+ return 0;
+}
+
+int GINTfill_int3c1e_ip1ip2_charge_contracted(const cudaStream_t stream, const BasisProdCache* bpcache,
+ const double* grid_points, const double* charge_exponents, const int ngrids,
+ double* integral_charge_contracted,
+ const int* strides, const int* ao_offsets,
+ const int* bins_locs_ij, const int nbins,
+ const int cp_ij_id, const double omega, const int n_charge_sum_per_thread)
+{
+ const ContractionProdType *cp_ij = bpcache->cptype + cp_ij_id;
+ const int i_l = cp_ij->l_bra;
+ const int j_l = cp_ij->l_ket;
+ const int nrys_roots = (i_l + j_l + 2) / 2 + 1;
+ const int nprim_ij = cp_ij->nprim_12;
+
+ if (nrys_roots > MAX_NROOTS_INT3C_1E + 2) {
+ fprintf(stderr, "nrys_roots = %d too high\n", nrys_roots);
+ return 2;
+ }
+
+ checkCudaErrors(cudaMemcpyToSymbol(c_bpcache, bpcache, sizeof(BasisProdCache)));
+
+ const int* bas_pairs_locs = bpcache->bas_pairs_locs;
+ const int* primitive_pairs_locs = bpcache->primitive_pairs_locs;
+ for (int ij_bin = 0; ij_bin < nbins; ij_bin++) {
+ const int bas_ij0 = bins_locs_ij[ij_bin];
+ const int bas_ij1 = bins_locs_ij[ij_bin + 1];
+ const int ntasks_ij = bas_ij1 - bas_ij0;
+ if (ntasks_ij <= 0) {
+ continue;
+ }
+
+ BasisProdOffsets offsets;
+ offsets.ntasks_ij = ntasks_ij;
+ offsets.ntasks_kl = ngrids;
+ offsets.bas_ij = bas_pairs_locs[cp_ij_id] + bas_ij0;
+ offsets.bas_kl = -1;
+ offsets.primitive_ij = primitive_pairs_locs[cp_ij_id] + bas_ij0 * nprim_ij;
+ offsets.primitive_kl = -1;
+
+ const int err = GINTfill_int3c1e_ip1ip2_charge_contracted_tasks(integral_charge_contracted, offsets, i_l, j_l, nprim_ij,
+ strides[0], strides[1], ao_offsets[0], ao_offsets[1],
+ omega, grid_points, charge_exponents, n_charge_sum_per_thread, stream);
+
+ if (err != 0) {
+ return err;
+ }
+ }
+
+ return 0;
+}
+
+int GINTfill_int3c1e_ipip2_density_contracted(const cudaStream_t stream, const BasisProdCache* bpcache,
+ const double* grid_points, const double* charge_exponents, const int ngrids,
+ const double* dm_pair_ordered, const int* density_offset,
+ double* integral_density_contracted,
+ const int* bins_locs_ij, const int nbins,
+ const int cp_ij_id, const double omega, const int n_pair_sum_per_thread)
+{
+ const ContractionProdType *cp_ij = bpcache->cptype + cp_ij_id;
+ const int i_l = cp_ij->l_bra;
+ const int j_l = cp_ij->l_ket;
+ const int nrys_roots = (i_l + j_l + 2) / 2 + 1;
+ const int nprim_ij = cp_ij->nprim_12;
+
+ if (nrys_roots > MAX_NROOTS_INT3C_1E + 2) {
+ fprintf(stderr, "nrys_roots = %d too high\n", nrys_roots);
+ return 2;
+ }
+
+ checkCudaErrors(cudaMemcpyToSymbol(c_bpcache, bpcache, sizeof(BasisProdCache)));
+
+ const int* bas_pairs_locs = bpcache->bas_pairs_locs;
+ const int* primitive_pairs_locs = bpcache->primitive_pairs_locs;
+ for (int ij_bin = 0; ij_bin < nbins; ij_bin++) {
+ const int bas_ij0 = bins_locs_ij[ij_bin];
+ const int bas_ij1 = bins_locs_ij[ij_bin + 1];
+ const int ntasks_ij = bas_ij1 - bas_ij0;
+ if (ntasks_ij <= 0) {
+ continue;
+ }
+
+ BasisProdOffsets offsets;
+ offsets.ntasks_ij = ntasks_ij;
+ offsets.ntasks_kl = ngrids;
+ offsets.bas_ij = bas_pairs_locs[cp_ij_id] + bas_ij0;
+ offsets.bas_kl = -1;
+ offsets.primitive_ij = primitive_pairs_locs[cp_ij_id] + bas_ij0 * nprim_ij;
+ offsets.primitive_kl = -1;
+
+ HermiteDensityOffsets hermite_density_offsets;
+ hermite_density_offsets.density_offset_of_angular_pair = density_offset[cp_ij_id];
+ hermite_density_offsets.pair_offset_of_angular_pair = bas_pairs_locs[cp_ij_id];
+ hermite_density_offsets.n_pair_of_angular_pair = bas_pairs_locs[cp_ij_id + 1] - bas_pairs_locs[cp_ij_id];
+
+ const int err = GINTfill_int3c1e_ipip2_density_contracted_tasks(integral_density_contracted, dm_pair_ordered, hermite_density_offsets,
+ offsets, i_l, j_l, nprim_ij,
+ omega, grid_points, charge_exponents, n_pair_sum_per_thread, stream);
+
+ if (err != 0) {
+ return err;
+ }
+ }
+
+ return 0;
+}
+}
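
Note: the four host wrappers in this file share one dispatch pattern: loop over the ij bins, skip empty ones, assemble a BasisProdOffsets record, and launch the corresponding *_tasks routine on the caller's stream. A minimal Python model of that loop (launch_tasks is a hypothetical stand-in for the kernel launcher; the dict mirrors the BasisProdOffsets fields filled above):

    # Hedged sketch of the per-bin dispatch used by the GINTfill_int3c1e_* wrappers.
    def dispatch_bins(bins_locs_ij, nbins, bas_pairs_loc, prim_pairs_loc,
                      nprim_ij, ngrids, launch_tasks):
        for ij_bin in range(nbins):
            bas_ij0 = bins_locs_ij[ij_bin]
            bas_ij1 = bins_locs_ij[ij_bin + 1]
            ntasks_ij = bas_ij1 - bas_ij0
            if ntasks_ij <= 0:
                continue                      # empty bin, nothing to launch
            offsets = {
                'ntasks_ij': ntasks_ij,
                'ntasks_kl': ngrids,          # one "kl" task per grid point
                'bas_ij': bas_pairs_loc + bas_ij0,
                'primitive_ij': prim_pairs_loc + bas_ij0 * nprim_ij,
            }
            err = launch_tasks(offsets)
            if err != 0:                      # propagate the first failure
                return err
        return 0
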
diff --git a/gpu4pyscf/lib/gvhf-rys/cart2xyz.c b/gpu4pyscf/lib/gvhf-rys/cart2xyz.c
index ee564cf9..ba10aca6 100644
--- a/gpu4pyscf/lib/gvhf-rys/cart2xyz.c
+++ b/gpu4pyscf/lib/gvhf-rys/cart2xyz.c
@@ -3,6 +3,9 @@
#include <string.h>
#include "vhf.cuh"
+// up to l=7
+#define L_SLOTS 8
+
static int _LEN_CART0[] = {
0, 1, 3, 6, 10, 15, 21, 28, 36, 45, 55, 66, 78, 91, 105, 120, 136
};
@@ -32,9 +35,9 @@ static void _get_dm_to_dm_xyz_coeff(double* pcx, double* rij, int lmax)
{
int lmax1 = lmax + 1;
int l, lx;
- double rx_pow[LMAX1];
- double ry_pow[LMAX1];
- double rz_pow[LMAX1];
+ double rx_pow[L_SLOTS];
+ double ry_pow[L_SLOTS];
+ double rz_pow[L_SLOTS];
rx_pow[0] = 1.0;
ry_pow[0] = 1.0;
@@ -67,7 +70,7 @@ static void _dm_to_dm_xyz(double* dm_xyz, double* dm, int nao, int li, int lj, d
int lij = li + lj;
int l1 = lij + 1;
int l1l1 = l1 * l1;
- double pcx[LMAX1*LMAX1*3];
+ double pcx[L_SLOTS*L_SLOTS*3];
double *pcy = pcx + lj1 * lj1;
double *pcz = pcy + lj1 * lj1;
_get_dm_to_dm_xyz_coeff(pcx, rij, lj);
@@ -116,7 +119,7 @@ static void _dm_xyz_to_dm(double* dm_xyz, double* dm, int nao, int li, int lj, d
int lj1 = lj + 1;
int l1 = li + lj + 1;
int l1l1 = l1 * l1;
- double pcx[LMAX1*LMAX1*3];
+ double pcx[L_SLOTS*L_SLOTS*3];
double *pcy = pcx + lj1 * lj1;
double *pcz = pcy + lj1 * lj1;
_get_dm_to_dm_xyz_coeff(pcx, rij, lj);
@@ -152,7 +155,7 @@ void transform_cart_to_xyz(double *dm_xyz, double *dm, int *ao_loc, int *pair_lo
int *bas, int nbas, double *env)
{
int nao = ao_loc[nbas];
- double cache[(LMAX*2+1)*(LMAX*2+1)*(LMAX*2+1)];
+ double cache[L_SLOTS*L_SLOTS*L_SLOTS*8];
for (int ish = 0; ish < nbas; ish++) {
int i0 = ao_loc[ish];
int li = bas[ish*BAS_SLOTS+ANG_OF];
@@ -182,7 +185,7 @@ void transform_xyz_to_cart(double *vj, double *vj_xyz, int *ao_loc, int *pair_lo
int *bas, int nbas, double *env)
{
int nao = ao_loc[nbas];
- double cache[(LMAX*2+1)*(LMAX*2+1)*(LMAX*2+1)];
+ double cache[L_SLOTS*L_SLOTS*L_SLOTS*8];
for (int ish = 0; ish < nbas; ish++) {
int i0 = ao_loc[ish];
int li = bas[ish*BAS_SLOTS+ANG_OF];
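
Note: the fixed L_SLOTS = 8 sizing (angular momentum up to l = 7) replaces the LMAX-derived bounds above. A quick Python check that the largest buffer user, the (li+lj+1)**3 cache in transform_cart_to_xyz / transform_xyz_to_cart, still fits in the new L_SLOTS**3 * 8 doubles, and that pcx fits as well:

    L_SLOTS = 8
    # cache[(li+lj+1)**3] must fit in L_SLOTS**3 * 8 = 4096 doubles
    assert all((li + lj + 1)**3 <= L_SLOTS**3 * 8
               for li in range(L_SLOTS) for lj in range(L_SLOTS))
    # pcx/pcy/pcz need 3*(lj+1)**2 doubles, within L_SLOTS**2 * 3
    assert all(3 * (lj + 1)**2 <= L_SLOTS**2 * 3 for lj in range(L_SLOTS))
    # worst case: li = lj = 7 gives 15**3 = 3375 <= 4096
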
diff --git a/gpu4pyscf/lib/gvhf-rys/create_tasks.cu b/gpu4pyscf/lib/gvhf-rys/create_tasks.cu
index ae8ef8ad..262f9de0 100644
--- a/gpu4pyscf/lib/gvhf-rys/create_tasks.cu
+++ b/gpu4pyscf/lib/gvhf-rys/create_tasks.cu
@@ -97,39 +97,35 @@ static int _fill_jk_tasks(ShellQuartet *shl_quartet_idx,
}
// https://developer.nvidia.com/gpugems/gpugems3/part-vi-gpu-computing/chapter-39-parallel-prefix-sum-scan-cuda
- extern __shared__ int thread_offsets[];
- thread_offsets[t_id] = count;
+ extern __shared__ int cum_count[];
+ cum_count[t_id] = count;
// Up-sweep phase
for (int stride = 1; stride < threads; stride *= 2) {
__syncthreads();
int index = (t_id + 1) * stride * 2 - 1;
if (index < threads) {
- thread_offsets[index] += thread_offsets[index-stride];
+ cum_count[index] += cum_count[index-stride];
}
}
__syncthreads();
- if (t_id == threads-1) { thread_offsets[threads-1] = 0; }
// Down-sweep phase
- for (int stride = threads/2; stride > 0; stride /= 2) {
+ for (int stride = threads/4; stride > 0; stride /= 2) {
__syncthreads();
int index = (t_id + 1) * stride * 2 - 1;
- if (index < threads) {
- int temp = thread_offsets[index - stride];
- thread_offsets[index - stride] = thread_offsets[index];
- thread_offsets[index] += temp;
+ if (index + stride < threads) {
+ cum_count[index + stride] += cum_count[index];
}
}
__syncthreads();
- __shared__ int ntasks;
- if (t_id == threads-1) {
- ntasks = thread_offsets[threads-1] + count;
- }
- __syncthreads();
+ int ntasks = cum_count[threads-1];
if (ntasks == 0) {
return ntasks;
}
- int offset = thread_offsets[t_id];
+ int offset = 0;
+ if (t_id > 0) {
+ offset = cum_count[t_id-1];
+ }
for (int t_kl_id = t_kl0+t_id; t_kl_id < t_kl1; t_kl_id += threads) {
int tile_kl = tile_kl_mapping[t_kl_id];
if (tile_q_ij + tile_q_cond[tile_kl] < cutoff) {
@@ -311,7 +307,7 @@ static int _fill_sr_jk_tasks(ShellQuartet *shl_quartet_idx,
float ypq = yij - ykl;
float zpq = zij - zkl;
float rr = xpq*xpq + ypq*ypq + zpq*zpq;
- float theta_rr = logf(rr + 1e-30f) + theta * rr;
+ float theta_rr = logf(rr + 1.f) + theta * rr;
d_cutoff = skl_cutoff - s_estimator[bas_kl] + theta_rr;
if (d_cutoff > 0) {
continue;
@@ -332,39 +328,35 @@ static int _fill_sr_jk_tasks(ShellQuartet *shl_quartet_idx,
}
// https://developer.nvidia.com/gpugems/gpugems3/part-vi-gpu-computing/chapter-39-parallel-prefix-sum-scan-cuda
- extern __shared__ int thread_offsets[];
- thread_offsets[t_id] = count;
+ extern __shared__ int cum_count[];
+ cum_count[t_id] = count;
// Up-sweep phase
for (int stride = 1; stride < threads; stride *= 2) {
__syncthreads();
int index = (t_id + 1) * stride * 2 - 1;
if (index < threads) {
- thread_offsets[index] += thread_offsets[index-stride];
+ cum_count[index] += cum_count[index-stride];
}
}
__syncthreads();
- if (t_id == threads-1) { thread_offsets[threads-1] = 0; }
// Down-sweep phase
- for (int stride = threads/2; stride > 0; stride /= 2) {
+ for (int stride = threads/4; stride > 0; stride /= 2) {
__syncthreads();
int index = (t_id + 1) * stride * 2 - 1;
- if (index < threads) {
- int temp = thread_offsets[index - stride];
- thread_offsets[index - stride] = thread_offsets[index];
- thread_offsets[index] += temp;
+ if (index + stride < threads) {
+ cum_count[index + stride] += cum_count[index];
}
}
__syncthreads();
- __shared__ int ntasks;
- if (t_id == threads-1) {
- ntasks = thread_offsets[threads-1] + count;
- }
- __syncthreads();
+ int ntasks = cum_count[threads-1];
if (ntasks == 0) {
return ntasks;
}
- int offset = thread_offsets[t_id];
+ int offset = 0;
+ if (t_id > 0) {
+ offset = cum_count[t_id-1];
+ }
for (int t_kl_id = t_kl0+t_id; t_kl_id < t_kl1; t_kl_id += threads) {
int tile_kl = tile_kl_mapping[t_kl_id];
if (tile_q_ij + tile_q_cond[tile_kl] < cutoff) {
@@ -457,7 +449,7 @@ static int _fill_sr_jk_tasks(ShellQuartet *shl_quartet_idx,
float ypq = yij - ykl;
float zpq = zij - zkl;
float rr = xpq*xpq + ypq*ypq + zpq*zpq;
- float theta_rr = logf(rr + 1e-30f) + theta * rr;
+ float theta_rr = logf(rr + 1.f) + theta * rr;
d_cutoff = skl_cutoff - s_estimator[bas_kl] + theta_rr;
if (d_cutoff > 0) {
continue;
diff --git a/gpu4pyscf/lib/gvhf-rys/create_tasks_ip1.cu b/gpu4pyscf/lib/gvhf-rys/create_tasks_ip1.cu
index 83803180..6ec7132e 100644
--- a/gpu4pyscf/lib/gvhf-rys/create_tasks_ip1.cu
+++ b/gpu4pyscf/lib/gvhf-rys/create_tasks_ip1.cu
@@ -93,40 +93,35 @@ static int _fill_ejk_tasks(ShellQuartet *shl_quartet_idx,
}
}
- // https://developer.nvidia.com/gpugems/gpugems3/part-vi-gpu-computing/chapter-39-parallel-prefix-sum-scan-cuda
- extern __shared__ int thread_offsets[];
- thread_offsets[t_id] = count;
+ extern __shared__ int cum_count[];
+ cum_count[t_id] = count;
// Up-sweep phase
for (int stride = 1; stride < threads; stride *= 2) {
__syncthreads();
int index = (t_id + 1) * stride * 2 - 1;
if (index < threads) {
- thread_offsets[index] += thread_offsets[index-stride];
+ cum_count[index] += cum_count[index-stride];
}
}
__syncthreads();
- if (t_id == threads-1) { thread_offsets[threads-1] = 0; }
// Down-sweep phase
- for (int stride = threads/2; stride > 0; stride /= 2) {
+ for (int stride = threads/4; stride > 0; stride /= 2) {
__syncthreads();
int index = (t_id + 1) * stride * 2 - 1;
- if (index < threads) {
- int temp = thread_offsets[index - stride];
- thread_offsets[index - stride] = thread_offsets[index];
- thread_offsets[index] += temp;
+ if (index + stride < threads) {
+ cum_count[index + stride] += cum_count[index];
}
}
__syncthreads();
- __shared__ int ntasks;
- if (t_id == threads-1) {
- ntasks = thread_offsets[threads-1] + count;
- }
- __syncthreads();
+ int ntasks = cum_count[threads-1];
if (ntasks == 0) {
return ntasks;
}
- int offset = thread_offsets[t_id];
+ int offset = 0;
+ if (t_id > 0) {
+ offset = cum_count[t_id-1];
+ }
for (int t_kl_id = t_kl0+t_id; t_kl_id < t_kl1; t_kl_id += threads) {
int tile_kl = tile_kl_mapping[t_kl_id];
if (tile_q_ij + tile_q_cond[tile_kl] < cutoff) {
@@ -317,40 +312,35 @@ static int _fill_sr_ejk_tasks(ShellQuartet *shl_quartet_idx,
}
}
- // https://developer.nvidia.com/gpugems/gpugems3/part-vi-gpu-computing/chapter-39-parallel-prefix-sum-scan-cuda
- extern __shared__ int thread_offsets[];
- thread_offsets[t_id] = count;
+ extern __shared__ int cum_count[];
+ cum_count[t_id] = count;
// Up-sweep phase
for (int stride = 1; stride < threads; stride *= 2) {
__syncthreads();
int index = (t_id + 1) * stride * 2 - 1;
if (index < threads) {
- thread_offsets[index] += thread_offsets[index-stride];
+ cum_count[index] += cum_count[index-stride];
}
}
__syncthreads();
- if (t_id == threads-1) { thread_offsets[threads-1] = 0; }
// Down-sweep phase
- for (int stride = threads/2; stride > 0; stride /= 2) {
+ for (int stride = threads/4; stride > 0; stride /= 2) {
__syncthreads();
int index = (t_id + 1) * stride * 2 - 1;
- if (index < threads) {
- int temp = thread_offsets[index - stride];
- thread_offsets[index - stride] = thread_offsets[index];
- thread_offsets[index] += temp;
+ if (index + stride < threads) {
+ cum_count[index + stride] += cum_count[index];
}
}
__syncthreads();
- __shared__ int ntasks;
- if (t_id == threads-1) {
- ntasks = thread_offsets[threads-1] + count;
- }
- __syncthreads();
+ int ntasks = cum_count[threads-1];
if (ntasks == 0) {
return ntasks;
}
- int offset = thread_offsets[t_id];
+ int offset = 0;
+ if (t_id > 0) {
+ offset = cum_count[t_id-1];
+ }
for (int t_kl_id = t_kl0+t_id; t_kl_id < t_kl1; t_kl_id += threads) {
int tile_kl = tile_kl_mapping[t_kl_id];
if (tile_q_ij + tile_q_cond[tile_kl] < cutoff) {
@@ -504,40 +494,35 @@ static int _fill_jk_tasks_s2kl(ShellQuartet *shl_quartet_idx,
}
}
- // https://developer.nvidia.com/gpugems/gpugems3/part-vi-gpu-computing/chapter-39-parallel-prefix-sum-scan-cuda
- extern __shared__ int thread_offsets[];
- thread_offsets[t_id] = count;
+ extern __shared__ int cum_count[];
+ cum_count[t_id] = count;
// Up-sweep phase
for (int stride = 1; stride < threads; stride *= 2) {
__syncthreads();
int index = (t_id + 1) * stride * 2 - 1;
if (index < threads) {
- thread_offsets[index] += thread_offsets[index-stride];
+ cum_count[index] += cum_count[index-stride];
}
}
__syncthreads();
- if (t_id == threads-1) { thread_offsets[threads-1] = 0; }
// Down-sweep phase
- for (int stride = threads/2; stride > 0; stride /= 2) {
+ for (int stride = threads/4; stride > 0; stride /= 2) {
__syncthreads();
int index = (t_id + 1) * stride * 2 - 1;
- if (index < threads) {
- int temp = thread_offsets[index - stride];
- thread_offsets[index - stride] = thread_offsets[index];
- thread_offsets[index] += temp;
+ if (index + stride < threads) {
+ cum_count[index + stride] += cum_count[index];
}
}
__syncthreads();
- __shared__ int ntasks;
- if (t_id == threads-1) {
- ntasks = thread_offsets[threads-1] + count;
- }
- __syncthreads();
+ int ntasks = cum_count[threads-1];
if (ntasks == 0) {
return ntasks;
}
- int offset = thread_offsets[t_id];
+ int offset = 0;
+ if (t_id > 0) {
+ offset = cum_count[t_id-1];
+ }
ShellQuartet sq = {(uint16_t)ish, (uint16_t)jsh};
for (int t_kl_id = t_kl0+t_id; t_kl_id < t_kl1; t_kl_id += threads) {
int bas_kl = pair_kl_mapping[t_kl_id];
@@ -562,156 +547,3 @@ static int _fill_jk_tasks_s2kl(ShellQuartet *shl_quartet_idx,
}
return ntasks;
}
-
-__device__
-static int _fill_ejk_tasks_tmp(ShellQuartet *shl_quartet_idx,
- RysIntEnvVars envs, JKMatrix jk, BoundsInfo bounds,
- int batch_ij, int batch_kl)
-{
- int nbas = envs.nbas;
- int *tile_ij_mapping = bounds.tile_ij_mapping;
- int *tile_kl_mapping = bounds.tile_kl_mapping;
- float *q_cond = bounds.q_cond;
- float *tile_q_cond = bounds.tile_q_cond;
- float *dm_cond = bounds.dm_cond;
- float cutoff = bounds.cutoff;
- int t_id = threadIdx.y * blockDim.x + threadIdx.x;
- int t_kl0 = batch_kl * TILES_IN_BATCH;
- int t_kl1 = MIN(t_kl0 + TILES_IN_BATCH, bounds.ntile_kl_pairs);
- int threads = blockDim.x * blockDim.y;
-
- int tile_ij = tile_ij_mapping[batch_ij];
- int nbas_tiles = nbas / TILE;
- int tile_i = tile_ij / nbas_tiles;
- int tile_j = tile_ij % nbas_tiles;
- int ish0 = tile_i * TILE;
- int jsh0 = tile_j * TILE;
- int ish1 = ish0 + TILE;
- int jsh1 = jsh0 + TILE;
- int do_j = jk.vj != NULL;
- int do_k = jk.vk != NULL;
-
- int count = 0;
- float tile_q_ij = tile_q_cond[tile_ij];
- for (int t_kl_id = t_kl0+t_id; t_kl_id < t_kl1; t_kl_id += threads) {
- int tile_kl = tile_kl_mapping[t_kl_id];
- if (tile_q_ij + tile_q_cond[tile_kl] < cutoff) {
- break;
- }
- int tile_k = tile_kl / nbas_tiles;
- int tile_l = tile_kl % nbas_tiles;
- int ksh0 = tile_k * TILE;
- int lsh0 = tile_l * TILE;
- int ksh1 = ksh0 + TILE;
- int lsh1 = lsh0 + TILE;
- for (int ish = ish0; ish < ish1; ++ish) {
- for (int jsh = jsh0; jsh < MIN(ish+1, jsh1); ++jsh) {
- float q_ij = q_cond [ish*nbas+jsh];
- float d_ij = dm_cond[ish*nbas+jsh];
- int bas_ij = ish * nbas + jsh;
- for (int ksh = ksh0; ksh < MIN(ish+1, ksh1); ++ksh) {
- float d_ik = dm_cond[ish*nbas+ksh];
- float d_jk = dm_cond[jsh*nbas+ksh];
- for (int lsh = lsh0; lsh < MIN(ksh+1, lsh1); ++lsh) {
- int bas_kl = ksh * nbas + lsh;
- if (bas_ij < bas_kl) {
- continue;
- }
- float q_ijkl = q_ij + q_cond[ksh*nbas+lsh];
- if (q_ijkl < cutoff) {
- continue;
- }
- float d_cutoff = cutoff - q_ijkl;
- if ((do_k && (d_ik+dm_cond[jsh*nbas+lsh] > d_cutoff ||
- d_jk+dm_cond[ish*nbas+lsh] > d_cutoff)) ||
- (do_j && d_ij+dm_cond[ksh*nbas+lsh] > d_cutoff)) {
- ++count;
- }
- }
- }
- }
- }
- }
-
- // https://developer.nvidia.com/gpugems/gpugems3/part-vi-gpu-computing/chapter-39-parallel-prefix-sum-scan-cuda
- extern __shared__ int thread_offsets[];
- thread_offsets[t_id] = count;
- // Up-sweep phase
- for (int stride = 1; stride < threads; stride *= 2) {
- __syncthreads();
- int index = (t_id + 1) * stride * 2 - 1;
- if (index < threads) {
- thread_offsets[index] += thread_offsets[index-stride];
- }
- }
- __syncthreads();
- if (t_id == threads-1) { thread_offsets[threads-1] = 0; }
- // Down-sweep phase
- for (int stride = threads/2; stride > 0; stride /= 2) {
- __syncthreads();
- int index = (t_id + 1) * stride * 2 - 1;
- if (index < threads) {
- int temp = thread_offsets[index - stride];
- thread_offsets[index - stride] = thread_offsets[index];
- thread_offsets[index] += temp;
- }
- }
- __syncthreads();
- __shared__ int ntasks;
- if (t_id == threads-1) {
- ntasks = thread_offsets[threads-1] + count;
- }
- __syncthreads();
- if (ntasks == 0) {
- return ntasks;
- }
-
- int offset = thread_offsets[t_id];
- for (int t_kl_id = t_kl0+t_id; t_kl_id < t_kl1; t_kl_id += threads) {
- int tile_kl = tile_kl_mapping[t_kl_id];
- if (tile_q_ij + tile_q_cond[tile_kl] < cutoff) {
- break;
- }
- int tile_k = tile_kl / nbas_tiles;
- int tile_l = tile_kl % nbas_tiles;
- int ksh0 = tile_k * TILE;
- int lsh0 = tile_l * TILE;
- int ksh1 = ksh0 + TILE;
- int lsh1 = lsh0 + TILE;
- ShellQuartet sq;
- for (int ish = ish0; ish < ish1; ++ish) {
- for (int jsh = jsh0; jsh < MIN(ish+1, jsh1); ++jsh) {
- float q_ij = q_cond [ish*nbas+jsh];
- float d_ij = dm_cond[ish*nbas+jsh];
- int bas_ij = ish * nbas + jsh;
- sq.i = ish;
- sq.j = jsh;
- for (int ksh = ksh0; ksh < MIN(ish+1, ksh1); ++ksh) {
- float d_ik = dm_cond[ish*nbas+ksh];
- float d_jk = dm_cond[jsh*nbas+ksh];
- for (int lsh = lsh0; lsh < MIN(ksh+1, lsh1); ++lsh) {
- int bas_kl = ksh * nbas + lsh;
- if (bas_ij < bas_kl) {
- continue;
- }
- float q_ijkl = q_ij + q_cond[ksh*nbas+lsh];
- if (q_ijkl < cutoff) {
- continue;
- }
- float d_cutoff = cutoff - q_ijkl;
- if ((do_k && (d_ik+dm_cond[jsh*nbas+lsh] > d_cutoff ||
- d_jk+dm_cond[ish*nbas+lsh] > d_cutoff)) ||
- (do_j && d_ij+dm_cond[ksh*nbas+lsh] > d_cutoff)) {
- sq.k = ksh;
- sq.l = lsh;
- shl_quartet_idx[offset] = sq;
- ++offset;
- }
- }
- }
- }
- }
- }
- return ntasks;
-}
-
diff --git a/gpu4pyscf/lib/gvhf-rys/create_tasks_ip2.cu b/gpu4pyscf/lib/gvhf-rys/create_tasks_ip2.cu
index ef62227a..df22b535 100644
--- a/gpu4pyscf/lib/gvhf-rys/create_tasks_ip2.cu
+++ b/gpu4pyscf/lib/gvhf-rys/create_tasks_ip2.cu
@@ -71,40 +71,35 @@ static int _fill_ejk_ip2_type2_tasks(ShellQuartet *shl_quartet_idx,
}
}
- // https://developer.nvidia.com/gpugems/gpugems3/part-vi-gpu-computing/chapter-39-parallel-prefix-sum-scan-cuda
- extern __shared__ int thread_offsets[];
- thread_offsets[t_id] = count;
+ extern __shared__ int cum_count[];
+ cum_count[t_id] = count;
// Up-sweep phase
for (int stride = 1; stride < threads; stride *= 2) {
__syncthreads();
int index = (t_id + 1) * stride * 2 - 1;
if (index < threads) {
- thread_offsets[index] += thread_offsets[index-stride];
+ cum_count[index] += cum_count[index-stride];
}
}
__syncthreads();
- if (t_id == threads-1) { thread_offsets[threads-1] = 0; }
// Down-sweep phase
- for (int stride = threads/2; stride > 0; stride /= 2) {
+ for (int stride = threads/4; stride > 0; stride /= 2) {
__syncthreads();
int index = (t_id + 1) * stride * 2 - 1;
- if (index < threads) {
- int temp = thread_offsets[index - stride];
- thread_offsets[index - stride] = thread_offsets[index];
- thread_offsets[index] += temp;
+ if (index + stride < threads) {
+ cum_count[index + stride] += cum_count[index];
}
}
__syncthreads();
- __shared__ int ntasks;
- if (t_id == threads-1) {
- ntasks = thread_offsets[threads-1] + count;
- }
- __syncthreads();
+ int ntasks = cum_count[threads-1];
if (ntasks == 0) {
return ntasks;
}
- int offset = thread_offsets[t_id];
+ int offset = 0;
+ if (t_id > 0) {
+ offset = cum_count[t_id-1];
+ }
for (int t_kl_id = t_kl0+t_id; t_kl_id < t_kl1; t_kl_id += threads) {
int tile_kl = tile_kl_mapping[t_kl_id];
if (tile_q_ij + tile_q_cond[tile_kl] < cutoff) {
@@ -218,40 +213,35 @@ static int _fill_ejk_ip2_type3_tasks(ShellQuartet *shl_quartet_idx,
}
}
- // https://developer.nvidia.com/gpugems/gpugems3/part-vi-gpu-computing/chapter-39-parallel-prefix-sum-scan-cuda
- extern __shared__ int thread_offsets[];
- thread_offsets[t_id] = count;
+ extern __shared__ int cum_count[];
+ cum_count[t_id] = count;
// Up-sweep phase
for (int stride = 1; stride < threads; stride *= 2) {
__syncthreads();
int index = (t_id + 1) * stride * 2 - 1;
if (index < threads) {
- thread_offsets[index] += thread_offsets[index-stride];
+ cum_count[index] += cum_count[index-stride];
}
}
__syncthreads();
- if (t_id == threads-1) { thread_offsets[threads-1] = 0; }
// Down-sweep phase
- for (int stride = threads/2; stride > 0; stride /= 2) {
+ for (int stride = threads/4; stride > 0; stride /= 2) {
__syncthreads();
int index = (t_id + 1) * stride * 2 - 1;
- if (index < threads) {
- int temp = thread_offsets[index - stride];
- thread_offsets[index - stride] = thread_offsets[index];
- thread_offsets[index] += temp;
+ if (index + stride < threads) {
+ cum_count[index + stride] += cum_count[index];
}
}
__syncthreads();
- __shared__ int ntasks;
- if (t_id == threads-1) {
- ntasks = thread_offsets[threads-1] + count;
- }
- __syncthreads();
+ int ntasks = cum_count[threads-1];
if (ntasks == 0) {
return ntasks;
}
- int offset = thread_offsets[t_id];
+ int offset = 0;
+ if (t_id > 0) {
+ offset = cum_count[t_id-1];
+ }
for (int t_kl_id = t_kl0+t_id; t_kl_id < t_kl1; t_kl_id += threads) {
int tile_kl = tile_kl_mapping[t_kl_id];
if (tile_q_ij + tile_q_cond[tile_kl] < cutoff) {
diff --git a/gpu4pyscf/lib/gvhf-rys/rys_contract_jk.cu b/gpu4pyscf/lib/gvhf-rys/rys_contract_jk.cu
index 1b2b79b3..6cbd22a5 100644
--- a/gpu4pyscf/lib/gvhf-rys/rys_contract_jk.cu
+++ b/gpu4pyscf/lib/gvhf-rys/rys_contract_jk.cu
@@ -23,6 +23,9 @@
#include "rys_roots.cu"
#include "create_tasks.cu"
+// TODO: benchmark performance for 34, 36, 41, 43, 45, 47, 51, 57
+#define GOUT_WIDTH 42
+
__device__
static void rys_jk_general(RysIntEnvVars envs, JKMatrix jk, BoundsInfo bounds,
ShellQuartet *shl_quartet_idx, int ntasks)
@@ -69,7 +72,7 @@ static void rys_jk_general(RysIntEnvVars envs, JKMatrix jk, BoundsInfo bounds,
double *g = rw + nsq_per_block * nroots*2;
double *Rpa_cicj = g + nsq_per_block * g_size*3;
double Rqc[3], Rpq[3];
- double gout[GWIDTH];
+ double gout[GOUT_WIDTH];
for (int task0 = 0; task0 < ntasks; task0 += nsq_per_block) {
__syncthreads();
@@ -126,9 +129,10 @@ static void rys_jk_general(RysIntEnvVars envs, JKMatrix jk, BoundsInfo bounds,
double Kab = exp(-theta_ij * (xjxi*xjxi+yjyi*yjyi+zjzi*zjzi));
Rpa[sq_id+3*nsq_per_block] = fac_sym * ci[ip] * cj[jp] * Kab;
}
- for (int gout_start = 0; gout_start < nfij*nfkl; gout_start+=gout_stride*GWIDTH) {
+ for (int gout_start = 0; gout_start < nfij*nfkl;
+ gout_start+=gout_stride*GOUT_WIDTH) {
#pragma unroll
- for (int n = 0; n < GWIDTH; ++n) { gout[n] = 0; }
+ for (int n = 0; n < GOUT_WIDTH; ++n) { gout[n] = 0; }
for (int klp = 0; klp < kprim*lprim; ++klp) {
int kp = klp / lprim;
@@ -197,11 +201,6 @@ static void rys_jk_general(RysIntEnvVars envs, JKMatrix jk, BoundsInfo bounds,
}
double rt = rw[sq_id + irys*2*nsq_per_block];
double rt_aa = rt / (aij + akl);
- double rt_aij = rt_aa * akl;
- double rt_akl = rt_aa * aij;
- double b00 = .5 * rt_aa;
- double b10 = .5/aij * (1 - rt_aij);
- double b01 = .5/akl * (1 - rt_akl);
// TRR
//for i in range(lij):
@@ -211,6 +210,8 @@ static void rys_jk_general(RysIntEnvVars envs, JKMatrix jk, BoundsInfo bounds,
// trr(i,k+1) = c0p * trr(i,k) + k*b01 * trr(i,k-1) + i*b00 * trr(i-1,k)
if (lij > 0) {
__syncthreads();
+ double rt_aij = rt_aa * akl;
+ double b10 = .5/aij * (1 - rt_aij);
// gx(0,n+1) = c0*gx(0,n) + n*b10*gx(0,n-1)
for (int n = gout_id; n < 3; n += gout_stride) {
double *_gx = g + n * g_size * nsq_per_block;
@@ -230,6 +231,9 @@ static void rys_jk_general(RysIntEnvVars envs, JKMatrix jk, BoundsInfo bounds,
if (lkl > 0) {
int lij3 = (lij+1)*3;
+ double rt_akl = rt_aa * aij;
+ double b00 = .5 * rt_aa;
+ double b01 = .5/akl * (1 - rt_akl);
for (int n = gout_id; n < lij3+gout_id; n += gout_stride) {
__syncthreads();
int i = n / 3; //for i in range(lij+1):
@@ -315,7 +319,7 @@ static void rys_jk_general(RysIntEnvVars envs, JKMatrix jk, BoundsInfo bounds,
double *gy = gx + nsq_per_block * g_size;
double *gz = gy + nsq_per_block * g_size;
#pragma unroll
- for (int n = 0; n < GWIDTH; ++n) {
+ for (int n = 0; n < GOUT_WIDTH; ++n) {
int ijkl = (gout_start + n*gout_stride+gout_id);
int kl = ijkl / nfij;
int ij = ijkl % nfij;
@@ -338,7 +342,7 @@ static void rys_jk_general(RysIntEnvVars envs, JKMatrix jk, BoundsInfo bounds,
int do_k = vk != NULL;
for (int i_dm = 0; i_dm < jk.n_dm; ++i_dm) {
#pragma unroll
- for (int n = 0; n < GWIDTH; ++n) {
+ for (int n = 0; n < GOUT_WIDTH; ++n) {
int ijkl = (gout_start + n*gout_stride+gout_id);
int kl = ijkl / nfij;
int ij = ijkl % nfij;
@@ -422,7 +426,7 @@ static void rys_sr_jk_general(RysIntEnvVars envs, JKMatrix jk, BoundsInfo bounds
double *g = rw + nsq_per_block * nroots*2;
double *Rpa_cicj = g + nsq_per_block * g_size*3;
double Rqc[3], Rpq[3];
- double gout[GWIDTH];
+ double gout[GOUT_WIDTH];
for (int task0 = 0; task0 < ntasks; task0 += nsq_per_block) {
__syncthreads();
@@ -479,9 +483,10 @@ static void rys_sr_jk_general(RysIntEnvVars envs, JKMatrix jk, BoundsInfo bounds
double Kab = exp(-theta_ij * (xjxi*xjxi+yjyi*yjyi+zjzi*zjzi));
Rpa[sq_id+3*nsq_per_block] = fac_sym * ci[ip] * cj[jp] * Kab;
}
- for (int gout_start = 0; gout_start < nfij*nfkl; gout_start+=gout_stride*GWIDTH) {
+ for (int gout_start = 0; gout_start < nfij*nfkl;
+ gout_start+=gout_stride*GOUT_WIDTH) {
#pragma unroll
- for (int n = 0; n < GWIDTH; ++n) { gout[n] = 0; }
+ for (int n = 0; n < GOUT_WIDTH; ++n) { gout[n] = 0; }
for (int klp = 0; klp < kprim*lprim; ++klp) {
int kp = klp / lprim;
@@ -669,7 +674,7 @@ static void rys_sr_jk_general(RysIntEnvVars envs, JKMatrix jk, BoundsInfo bounds
double *gy = gx + nsq_per_block * g_size;
double *gz = gy + nsq_per_block * g_size;
#pragma unroll
- for (int n = 0; n < GWIDTH; ++n) {
+ for (int n = 0; n < GOUT_WIDTH; ++n) {
int ijkl = gout_start + n*gout_stride+gout_id;
int kl = ijkl / nfij;
int ij = ijkl % nfij;
@@ -692,7 +697,7 @@ static void rys_sr_jk_general(RysIntEnvVars envs, JKMatrix jk, BoundsInfo bounds
int do_k = vk != NULL;
for (int i_dm = 0; i_dm < jk.n_dm; ++i_dm) {
#pragma unroll
- for (int n = 0; n < GWIDTH; ++n) {
+ for (int n = 0; n < GOUT_WIDTH; ++n) {
int ijkl = (gout_start + n*gout_stride+gout_id);
int kl = ijkl / nfij;
int ij = ijkl % nfij;
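
Note: GWIDTH becomes a locally defined GOUT_WIDTH (42, with a TODO to benchmark nearby values). Each thread accumulates GOUT_WIDTH products in registers, and the gout_start loop tiles the nfij*nfkl output components across gout_stride threads. A quick Python check that this tiling covers every component exactly once, for any width (the sizes below are arbitrary):

    def covered(nfij, nfkl, gout_stride, gout_width):
        seen = []
        for gout_start in range(0, nfij*nfkl, gout_stride*gout_width):
            for gout_id in range(gout_stride):      # one thread per gout_id
                for n in range(gout_width):         # registers per thread
                    ijkl = gout_start + n*gout_stride + gout_id
                    if ijkl < nfij*nfkl:            # kernel breaks past the end
                        seen.append(ijkl)
        return sorted(seen) == list(range(nfij*nfkl))

    assert covered(nfij=15, nfkl=21, gout_stride=4, gout_width=42)
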
diff --git a/gpu4pyscf/lib/gvhf-rys/rys_jk_driver.cu b/gpu4pyscf/lib/gvhf-rys/rys_jk_driver.cu
index 04c6d3ee..ba3c14a5 100644
--- a/gpu4pyscf/lib/gvhf-rys/rys_jk_driver.cu
+++ b/gpu4pyscf/lib/gvhf-rys/rys_jk_driver.cu
@@ -201,7 +201,7 @@ int RYS_build_jk(double *vj, double *vk, double *dm, int n_dm, int nao,
int gout_stride = scheme[1];
int ij_prims = iprim * jprim;
dim3 threads(quartets_per_block, gout_stride);
- int buflen = (nroots*4 + g_size*3 + ij_prims*4) * quartets_per_block;// + ij_prims*4*TILE2;
+ int buflen = (nroots*2 + g_size*3 + ij_prims*4) * quartets_per_block;// + ij_prims*4*TILE2;
rys_sr_jk_kernel<<<blocks, threads, buflen*sizeof(double)>>>(envs, jk, bounds, pool, batch_head);
}
cudaError_t err = cudaGetLastError();
@@ -329,7 +329,7 @@ int RYS_per_atom_jk_ip1(double *ejk, double j_factor, double k_factor,
int ij_prims = iprim * jprim;
dim3 threads(quartets_per_block, gout_stride);
int buflen = (nroots*2 + g_size*3 + ij_prims*4) * quartets_per_block;
- buflen = MAX(buflen, 9*gout_stride*quartets_per_block);
+ buflen = MAX(buflen, 12*gout_stride*quartets_per_block);
rys_ejk_ip1_kernel<<<blocks, threads, buflen*sizeof(double)>>>(envs, jk, bounds, pool, batch_head);
}
cudaError_t err = cudaGetLastError();
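
Note: two shared-memory budgets change in this driver: the short-range kernel's rw buffer shrinks from nroots*4 to nroots*2 slots per quartet, matching the rw layout used in rys_sr_jk_general, and the ip1 fallback term grows from 9 to 12 words per gout thread. A hedged Python sketch of the resulting byte count (names taken from the diff; the MAX term appears only in RYS_per_atom_jk_ip1, and the sizes below are placeholders):

    def ip1_buflen(nroots, g_size, ij_prims, quartets_per_block, gout_stride):
        buflen = (nroots*2 + g_size*3 + ij_prims*4) * quartets_per_block
        return max(buflen, 12 * gout_stride * quartets_per_block)

    # dynamic shared memory request, in bytes (sizeof(double) == 8)
    nbytes = ip1_buflen(nroots=3, g_size=48, ij_prims=9,
                        quartets_per_block=32, gout_stride=4) * 8
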
diff --git a/gpu4pyscf/lib/logger.py b/gpu4pyscf/lib/logger.py
index c715976e..54713c43 100644
--- a/gpu4pyscf/lib/logger.py
+++ b/gpu4pyscf/lib/logger.py
@@ -17,9 +17,6 @@
import cupy
from pyscf import lib
-from pyscf.lib import parameters as param
-import pyscf.__config__
-
INFO = lib.logger.INFO
NOTE = lib.logger.NOTE
WARN = lib.logger.WARN
@@ -29,66 +26,63 @@
TIMER_LEVEL = lib.logger.TIMER_LEVEL
flush = lib.logger.flush
-if sys.version_info < (3, 0):
- process_clock = time.clock
- perf_counter = time.time
-else:
- process_clock = time.process_time
- perf_counter = time.perf_counter
+process_clock = time.process_time
+perf_counter = time.perf_counter
def init_timer(rec):
- if rec.verbose >= TIMER_LEVEL:
- e0 = cupy.cuda.Event()
- e0.record()
- return (process_clock(), perf_counter(), e0)
- elif rec.verbose >= DEBUG:
- return (process_clock(), perf_counter())
- else:
- return process_clock(),
+ e0 = cupy.cuda.Event()
+ e0.record()
+ return (process_clock(), perf_counter(), e0)
def timer(rec, msg, cpu0=None, wall0=None, gpu0=None):
- if cpu0 is None:
- cpu0 = rec._t0
- if wall0 and gpu0:
- rec._t0, rec._w0, rec._e0 = process_clock(), perf_counter(), cupy.cuda.Event()
+ if gpu0:
+ t0, w0, e0 = process_clock(), perf_counter(), cupy.cuda.Event()
+ e0.record()
if rec.verbose >= TIMER_LEVEL:
- rec._e0.record()
- rec._e0.synchronize()
-
+ e0.synchronize()
flush(rec, ' CPU time for %-50s %9.2f sec, wall time %9.2f sec, GPU time %9.2f ms'
- % (msg, rec._t0-cpu0, rec._w0-wall0, cupy.cuda.get_elapsed_time(gpu0,rec._e0)))
- return rec._t0, rec._w0, rec._e0
+ % (msg, t0-cpu0, w0-wall0, cupy.cuda.get_elapsed_time(gpu0,e0)))
+ return t0, w0, e0
elif wall0:
- rec._t0, rec._w0 = process_clock(), perf_counter()
+ t0, w0 = process_clock(), perf_counter()
if rec.verbose >= TIMER_LEVEL:
flush(rec, ' CPU time for %s %9.2f sec, wall time %9.2f sec'
- % (msg, rec._t0-cpu0, rec._w0-wall0))
- return rec._t0, rec._w0
+ % (msg, t0-cpu0, w0-wall0))
+ return t0, w0
else:
- rec._t0 = process_clock()
+ t0 = process_clock()
if rec.verbose >= TIMER_LEVEL:
- flush(rec, ' CPU time for %s %9.2f sec' % (msg, rec._t0-cpu0))
- return rec._t0,
+ flush(rec, ' CPU time for %s %9.2f sec' % (msg, t0-cpu0))
+ return t0,
def _timer_debug1(rec, msg, cpu0=None, wall0=None, gpu0=None, sync=True):
if rec.verbose >= DEBUG1:
return timer(rec, msg, cpu0, wall0, gpu0)
- elif wall0 and gpu0:
- rec._t0, rec._w0, rec._e0 = process_clock(), perf_counter(), cupy.cuda.Event()
- rec._e0.record()
- return rec._t0, rec._w0, rec._e0
+ elif gpu0:
+ t0, w0, e0 = process_clock(), perf_counter(), cupy.cuda.Event()
+ e0.record()
+ return t0, w0, e0
elif wall0:
- rec._t0, rec._w0 = process_clock(), perf_counter()
- return rec._t0, rec._w0
+ t0, w0 = process_clock(), perf_counter()
+ return t0, w0
else:
- rec._t0 = process_clock()
- return rec._t0,
+ t0 = process_clock()
+ return t0,
def _timer_debug2(rec, msg, cpu0=None, wall0=None, gpu0=None, sync=True):
if rec.verbose >= DEBUG2:
return timer(rec, msg, cpu0, wall0, gpu0)
- return cpu0, wall0, gpu0
+ elif gpu0:
+ t0, w0, e0 = process_clock(), perf_counter(), cupy.cuda.Event()
+ e0.record()
+ return t0, w0, e0
+ elif wall0:
+ t0, w0 = process_clock(), perf_counter()
+ return t0, w0
+ else:
+ t0 = process_clock()
+ return t0,
info = lib.logger.info
note = lib.logger.note
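
Note: the logger diff drops the Python 2 clock shims and the rec._t0/rec._w0/rec._e0 side effects: init_timer now always records a CUDA event, and timer() threads the (cpu, wall, gpu_event) tuple through explicitly. A usage sketch; mf is assumed to be any object carrying a .verbose attribute (for example a mean-field object), and the message string is arbitrary:

    from gpu4pyscf.lib import logger

    t0 = logger.init_timer(mf)              # (process_clock, perf_counter, Event)
    # ... enqueue some GPU work ...
    t0 = logger.timer(mf, 'build JK', *t0)  # logs CPU/wall/GPU time, returns new t0
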
diff --git a/gpu4pyscf/lib/memcpy.py b/gpu4pyscf/lib/memcpy.py
index c961a9a2..ce52046a 100644
--- a/gpu4pyscf/lib/memcpy.py
+++ b/gpu4pyscf/lib/memcpy.py
@@ -15,6 +15,27 @@
import cupy
import numpy as np
+from gpu4pyscf.__config__ import _p2p_access
+
+__all__ = ['p2p_transfer', 'copy_array']
+
+def p2p_transfer(a, b):
+ '''Copy array b into array a. Use the direct P2P transfer between the two
+ devices when available; otherwise route the data through CPU memory.
+ '''
+ if a.device == b.device:
+ a[:] = b
+ elif _p2p_access:
+ a[:] = b
+ # elif a.strides == b.strides and a.flags.c_contiguous and a.dtype == b.dtype:
+ #     # cupy supports a direct copy between different devices without p2p. See
+ #     # https://github.com/cupy/cupy/blob/v13.3.0/cupy/_core/_routines_indexing.pyx#L48
+ #     # https://github.com/cupy/cupy/blob/v13.3.0/cupy/_core/_routines_indexing.pyx#L1015
+ #     a[:] = b
+ else:
+ copy_array(b, a)
+ return a
def find_contiguous_chunks(shape, h_strides, d_strides):
"""
diff --git a/gpu4pyscf/lib/multi_gpu.py b/gpu4pyscf/lib/multi_gpu.py
new file mode 100644
index 00000000..f9e1e8ee
--- /dev/null
+++ b/gpu4pyscf/lib/multi_gpu.py
@@ -0,0 +1,153 @@
+# Copyright 2025 The PySCF Developers. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from concurrent.futures import ThreadPoolExecutor
+import cupy as cp
+import numpy as np
+from pyscf.lib import prange
+from gpu4pyscf.lib.memcpy import p2p_transfer
+from gpu4pyscf.__config__ import num_devices
+
+def run(func, args=(), kwargs={}, non_blocking=False):
+ '''Execute a function on each GPU.
+
+ Kwargs:
+ non_blocking: If `True`, the function calls are executed concurrently, one thread per device.
+ '''
+ if num_devices == 1:
+ return [func(*args, **kwargs)]
+
+ def proc(device_id):
+ with cp.cuda.Device(device_id):
+ return func(*args, **kwargs)
+
+ if not non_blocking:
+ return [proc(i) for i in range(num_devices)]
+
+ with ThreadPoolExecutor(max_workers=num_devices) as ex:
+ futures = [ex.submit(proc, i) for i in range(num_devices)]
+ return [fut.result() for fut in futures]
+
+def map(func, tasks, args=(), kwargs={}, schedule='dynamic') -> list:
+ '''Distributes tasks to multiple GPU devices for parallel computation.
+
+ Kwargs:
+ schedule: controls how tasks are distributed. Can be 'static' or 'dynamic'.
+ If 'static', tasks are assigned to devices in round-robin fashion;
+ if 'dynamic', tasks are scheduled dynamically for better load balance.
+ '''
+ if num_devices == 1:
+ return [func(t, *args, **kwargs) for t in tasks]
+
+ tasks = list(enumerate(tasks))
+ result = [None] * len(tasks)
+
+ def consumer():
+ if schedule == 'dynamic':
+ stream = cp.cuda.stream.get_current_stream()
+ while tasks:
+ try:
+ key, t = tasks.pop()
+ except IndexError:
+ return
+ result[key] = func(t, *args, **kwargs)
+ stream.synchronize()
+ else:
+ device_id = cp.cuda.device.get_device_id()
+ for key, t in tasks[device_id::num_devices]:
+ result[key] = func(t, *args, **kwargs)
+
+ run(consumer, non_blocking=True)
+ return result
+
+def reduce(func, tasks, args=(), kwargs={}, schedule='dynamic'):
+ '''Processes tasks on multiple GPU devices and returns the sum of the results.
+ '''
+ result = map(func, tasks, args, kwargs)
+ dtype = cp.result_type(*result)
+ if num_devices == 1:
+ out = result[0].astype(dtype=dtype, copy=False)
+ for r in result[1:]:
+ out += r
+ return out
+
+ groups = [None] * num_devices
+ for r in result:
+ device_id = r.device.id
+ if groups[device_id] is None:
+ groups[device_id] = r.astype(dtype, copy=False)
+ else:
+ groups[device_id] += r
+
+ for i in range(num_devices):
+ if groups[i] is None:
+ with cp.cuda.Device(i):
+ groups[i] = cp.zeros(result[0].shape, dtype=dtype)
+ return array_reduce(groups, inplace=True)
+
+def array_broadcast(a):
+ '''Broadcast a cupy ndarray to all devices, return a list of cupy ndarrays.
+ '''
+ if num_devices == 1:
+ return [a]
+
+ out = [None] * num_devices
+ out[0] = a
+
+ # Tree broadcast
+ step = num_devices >> 1
+ while step > 0:
+ for device_id in range(0, num_devices, 2*step):
+ if device_id + step < num_devices:
+ with cp.cuda.Device(device_id+step):
+ out[device_id+step] = dst = cp.empty_like(a)
+ p2p_transfer(dst, out[device_id])  # tree: copy from the nearest filled device
+ step >>= 1
+ return out
+
+def array_reduce(array_list, inplace=False):
+ '''Sum the cupy ndarrays from all devices onto device 0.
+ '''
+ assert len(array_list) == num_devices
+ if num_devices == 1:
+ return array_list[0]
+
+ a0 = array_list[0]
+ out_shape = a0.shape
+ size = a0.size
+ dtype = a0.dtype
+ assert all(x.dtype == dtype for x in array_list)
+
+ array_list = list(array_list)
+ for device_id in range(num_devices):
+ with cp.cuda.Device(device_id):
+ if inplace or device_id % 2 == 1:
+ array_list[device_id] = array_list[device_id].ravel()
+ else:
+ array_list[device_id] = array_list[device_id].copy().ravel()
+
+ blksize = 1024*1024*1024 // dtype.itemsize # 1GB
+ # Tree-reduce
+ step = 1
+ while step < num_devices:
+ for device_id in range(0, num_devices, 2*step):
+ if device_id + step < num_devices:
+ with cp.cuda.Device(device_id):
+ dst = array_list[device_id]
+ src = array_list[device_id+step]
+ buf = cp.empty_like(dst[:blksize])
+ for p0, p1 in prange(0, size, blksize):
+ dst[p0:p1] += p2p_transfer(buf[:p1-p0], src[p0:p1])
+ step *= 2
+ return array_list[0].reshape(out_shape)
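
Note: a usage sketch for the new multi_gpu helpers; the lambdas are arbitrary placeholders. run executes one call per device, map distributes a task list over devices, and reduce tree-sums the per-device results onto device 0:

    import cupy as cp
    from gpu4pyscf.lib import multi_gpu

    ones = multi_gpu.run(lambda: cp.ones(4), non_blocking=True)    # one per device
    squares = multi_gpu.map(lambda t: cp.asarray(t)**2, [1, 2, 3, 4])
    total = multi_gpu.reduce(lambda t: cp.full(4, float(t)), [1, 2, 3])
    # total lives on device 0 and equals cp.full(4, 6.)
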
diff --git a/gpu4pyscf/lib/pbc/CMakeLists.txt b/gpu4pyscf/lib/pbc/CMakeLists.txt
index a961cbc2..f8d7a842 100644
--- a/gpu4pyscf/lib/pbc/CMakeLists.txt
+++ b/gpu4pyscf/lib/pbc/CMakeLists.txt
@@ -2,6 +2,7 @@ set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --ptxas-options=-v")# -maxrregcount=12
add_library(pbc SHARED
pbc_driver.cu ft_ao.cu unrolled_ft_ao.cu
+ fill_int3c2e.cu unrolled_int3c2e.cu
)
set_target_properties(pbc PROPERTIES
diff --git a/gpu4pyscf/lib/pbc/fill_int3c2e.cu b/gpu4pyscf/lib/pbc/fill_int3c2e.cu
new file mode 100644
index 00000000..55aa3fcc
--- /dev/null
+++ b/gpu4pyscf/lib/pbc/fill_int3c2e.cu
@@ -0,0 +1,702 @@
+/*
+ * Copyright 2024-2025 The PySCF Developers. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <cuda_runtime.h>
+
+#include "gvhf-rys/vhf.cuh"
+#include "rys_roots.cu"
+#include "int3c2e.cuh"
+
+#define THREADS (WARP_SIZE*WARPS)
+// TODO: benchmark performance for 32, 38, 40, 45, 54
+#define GOUT_WIDTH 45
+
+__global__
+void pbc_int3c2e_kernel(double *out, PBCInt3c2eEnvVars envs, PBCInt3c2eBounds bounds)
+{
+ int nksh_per_block = blockDim.x;
+ int gout_stride = blockDim.y;
+ int nsp_per_block = blockDim.z;
+ int ksh_id = threadIdx.x;
+ int gout_id = threadIdx.y;
+ int sp_id = threadIdx.z;
+ int sp_block_id = blockIdx.x;
+ int ksh_block_id = blockIdx.y;
+
+ int nksp_per_block = nksh_per_block * nsp_per_block;
+ int ksp_id = nksh_per_block * sp_id + ksh_id;
+ int thread_id = (threadIdx.z * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x;
+ int warp_id = thread_id / WARP_SIZE;
+ int nimgs = envs.nimgs;
+ int sp0_this_block = sp_block_id * nsp_per_block * SPTAKS_PER_BLOCK;
+ int ksh0_this_block = ksh_block_id * nksh_per_block;
+ int nksh = MIN(bounds.nksh - ksh0_this_block, nksh_per_block);
+ int ksh0 = ksh0_this_block + bounds.ksh0;
+
+ int li = bounds.li;
+ int lj = bounds.lj;
+ int lk = bounds.lk;
+ int lij = li + lj;
+ int nroots = bounds.nroots;
+ int nfi = bounds.nfi;
+ int nfij = bounds.nfij;
+ int nfk = bounds.nfk;
+ int iprim = bounds.iprim;
+ int jprim = bounds.jprim;
+ int kprim = bounds.kprim;
+ int ijprim = iprim * jprim;
+ int ijkprim = ijprim * kprim;
+ int stride_j = bounds.stride_j;
+ int stride_k = bounds.stride_k;
+ int g_size = bounds.g_size;
+ int *idx_ij = c_g_pair_idx + c_g_pair_offsets[li*LMAX1+lj];
+ int *idy_ij = idx_ij + nfij;
+ int *idz_ij = idy_ij + nfij;
+ int lk_offset = lk * (lk + 1) * (lk + 2) / 2;
+ int *idx_k = c_g_cart_idx + lk_offset;
+ int *idy_k = idx_k + nfk;
+ int *idz_k = idy_k + nfk;
+ int *bas = envs.bas;
+ double *env = envs.env;
+ double *img_coords = envs.img_coords;
+ int *img_idx = bounds.img_idx;
+ int *sp_img_offsets = bounds.img_offsets;
+ double omega = env[PTR_RANGE_OMEGA];
+
+ int gx_len = g_size * nksp_per_block;
+ extern __shared__ double rw_buffer[];
+ double *rw = rw_buffer + ksp_id;
+ double *g = rw + nksp_per_block * nroots*2;
+ double *gx = g;
+ double *gy = gx + gx_len;
+ double *gz = gy + gx_len;
+ double *rjri = gz + gx_len;
+ double *Rpq = rjri + nksp_per_block * 3;
+ __shared__ int img_counts_in_warp[WARPS];
+ double gout[GOUT_WIDTH];
+
+ int ntasks = nksh * nsp_per_block * SPTAKS_PER_BLOCK;
+ for (int task_id = 0; task_id < ntasks; task_id += nksp_per_block) {
+ // convert task_id to ish, jsh, ksh
+ int ijk_idx = task_id + ksp_id;
+ int ksh = ijk_idx % nksh + ksh0;
+ int pair_ij_idx = ijk_idx / nksh + sp0_this_block;
+ int img1 = 1;
+ int pair_ij = pair_ij_idx;
+ if (pair_ij_idx >= bounds.npairs_ij) {
+ pair_ij = sp0_this_block;
+ } else {
+ img1 = sp_img_offsets[pair_ij_idx+1];
+ }
+ int bas_ij = bounds.bas_ij_idx[pair_ij];
+ int img0 = sp_img_offsets[pair_ij];
+ int thread_id_in_warp = thread_id % WARP_SIZE;
+ if (thread_id_in_warp == 0) {
+ img_counts_in_warp[warp_id] = 0;
+ }
+ atomicMax(&img_counts_in_warp[warp_id], img1-img0);
+ __syncthreads();
+
+ int nbas = envs.cell0_nbas * envs.bvk_ncells;
+ int ish = bas_ij / nbas;
+ int jsh = bas_ij % nbas;
+ double *expi = env + bas[ish*BAS_SLOTS+PTR_EXP];
+ double *expj = env + bas[jsh*BAS_SLOTS+PTR_EXP];
+ double *expk = env + bas[ksh*BAS_SLOTS+PTR_EXP];
+ double *ci = env + bas[ish*BAS_SLOTS+PTR_COEFF];
+ double *cj = env + bas[jsh*BAS_SLOTS+PTR_COEFF];
+ double *ck = env + bas[ksh*BAS_SLOTS+PTR_COEFF];
+ double *ri = env + bas[ish*BAS_SLOTS+PTR_BAS_COORD];
+ double *rj = env + bas[jsh*BAS_SLOTS+PTR_BAS_COORD];
+ double *rk = env + bas[ksh*BAS_SLOTS+PTR_BAS_COORD];
+
+ for (int gout_start = 0; gout_start < nfij*nfk;
+ gout_start+=gout_stride*GOUT_WIDTH) {
+#pragma unroll
+ for (int n = 0; n < GOUT_WIDTH; ++n) { gout[n] = 0; }
+
+ for (int ijkp = 0; ijkp < ijkprim; ++ijkp) {
+ int ijp = ijkp / kprim;
+ int kp = ijkp % kprim;
+ int ip = ijp / jprim;
+ int jp = ijp % jprim;
+ double ai = expi[ip];
+ double aj = expj[jp];
+ double ak = expk[kp];
+ double aij = ai + aj;
+ double cijk = ci[ip] * cj[jp] * ck[kp];
+ __syncthreads();
+ if (gout_id == 0) {
+ double fac = PI_FAC * cijk / (aij*ak*sqrt(aij+ak));
+ gy[0] = fac;
+ }
+ int img_counts = img_counts_in_warp[warp_id];
+ for (int img = 0; img < img_counts; ++img) {
+ int img_id = img0 + img;
+ __syncthreads();
+ if (img_id >= img1) {
+ // ensure every thread in the warp processes the same number of images
+ img_id = img0;
+ if (gout_id == 0) {
+ gy[0] = 0.;
+ }
+ }
+ int img_ij = img_idx[img_id];
+ int iL = img_ij / nimgs;
+ int jL = img_ij % nimgs;
+ double xi = ri[0] + img_coords[iL*3+0];
+ double yi = ri[1] + img_coords[iL*3+1];
+ double zi = ri[2] + img_coords[iL*3+2];
+ double xj = rj[0] + img_coords[jL*3+0];
+ double yj = rj[1] + img_coords[jL*3+1];
+ double zj = rj[2] + img_coords[jL*3+2];
+ double xjxi = xj - xi;
+ double yjyi = yj - yi;
+ double zjzi = zj - zi;
+ double rr_ij = xjxi*xjxi + yjyi*yjyi + zjzi*zjzi;
+ double aj_aij = aj / aij;
+ double theta_ij = ai * aj_aij;
+ double Kab = theta_ij * rr_ij;
+
+ double xij = xjxi * aj_aij + xi;
+ double yij = yjyi * aj_aij + yi;
+ double zij = zjzi * aj_aij + zi;
+ double xpq = xij - rk[0];
+ double ypq = yij - rk[1];
+ double zpq = zij - rk[2];
+ double rr = xpq*xpq + ypq*ypq + zpq*zpq;
+ double theta = aij * ak / (aij + ak);
+ double omega2 = omega * omega;
+ double theta_fac = omega2 / (omega2 + theta);
+ double theta_rr = theta * rr;
+// In practice this screening test does not filter out many integrals;
+// more benchmarks are needed before enabling it.
+#if 0
+ __shared__ int8_t img_mask[WARPS];
+ if (thread_id_in_warp == 0) {
+ img_mask[warp_id] = 0;
+ }
+ float Kab_f32 = Kab;
+ // IMPORTANT: run the screening test on each warp.
+ // When nksh_per_block*gout_stride>32, gout is evaluated across warps.
+ // If tests are skipped for some warps, g[xyz] vectors and
+ // gout on these warps will never be evaluated. These warps
+ // may proceed to a wrong __syncthreads() barrier and
+ // produce wrong g[xyz].
+ float log_cutoff = envs.log_cutoff;
+ if ((thread_id_in_warp / nksh_per_block == 0) &&
+ img0+img < img1 && 5.f+2.f*lij-Kab_f32 > log_cutoff) {
+ // check any not vanished integrals
+ float ai_f32 = ai;
+ float aj_f32 = aj;
+ float aij_f32 = aij;
+ float ak_f32 = ak;
+ float fi = ai_f32 / aij_f32;
+ float fj = aj_f32 / aij_f32;
+ // fac_guess = log(sqrt(2.x/(omega*sqrt(pi))) * ((2*li+1)*(2*lj+1)*(2*lk+1))**.5/(4*pi)**1.5)
+ // ~ between [0, 2]
+ float fac_guess = 1.f;
+ // fac in Eq 63 of arXiv:2302.11307 ~ log(ci*cj*ck * (pi^2/(aij*ak))**1.5)
+ float log_fac = logf(fabs(cijk)) + 3.434f - 1.5f*logf(aij_f32*ak_f32) + fac_guess;
+ float theta_fac_rr = (float)theta_fac * (float)theta_rr;
+ float rt_aa = sqrtf((float)rr) / (aij_f32+ak_f32) + 1e-9f;
+ float rt_aij = rt_aa * ak_f32;
+ float rt_akl = rt_aa * aij_f32;
+ float r = sqrtf((float)rr_ij);
+ float ti = fj * r + rt_aij;
+ float tj = fi * r + rt_aij;
+ float ti_fac = .5f*li * logf(ti*ti + .5f*li/aij_f32);
+ float tj_fac = .5f*lj * logf(tj*tj + .5f*lj/aij_f32);
+ float tk_fac = .5f*lk * logf(rt_akl*rt_akl + .5f*lk/ak_f32);
+ float estimator = log_fac + ti_fac + tj_fac + tk_fac - Kab_f32 - theta_fac_rr;
+ if (estimator > log_cutoff) {
+ img_mask[warp_id] = 1;
+ }
+ }
+ __syncthreads();
+ if (img_mask[warp_id] == 0) {
+ continue;
+ }
+#endif
+ if (gout_id == 0) {
+ rjri[0*nksp_per_block] = xjxi;
+ rjri[1*nksp_per_block] = yjyi;
+ rjri[2*nksp_per_block] = zjzi;
+ Rpq[0*nksp_per_block] = xpq;
+ Rpq[1*nksp_per_block] = ypq;
+ Rpq[2*nksp_per_block] = zpq;
+ gx[0] = exp(-Kab);
+ }
+ int _nroots = nroots/2;
+ rys_roots(_nroots, theta_rr, rw+nroots*nksp_per_block,
+ nksp_per_block, gout_id, gout_stride);
+ rys_roots(_nroots, theta_fac*theta_rr, rw,
+ nksp_per_block, gout_id, gout_stride);
+ __syncthreads();
+ double sqrt_theta_fac = -sqrt(theta_fac);
+ for (int irys = gout_id; irys < _nroots; irys+=gout_stride) {
+ rw[ irys*2 *nksp_per_block] *= theta_fac;
+ rw[(irys*2+1)*nksp_per_block] *= sqrt_theta_fac;
+ }
+ double s0x, s1x, s2x;
+ for (int irys = 0; irys < nroots; ++irys) {
+ __syncthreads();
+ if (gout_id == 0) {
+ gz[0] = rw[(irys*2+1)*nksp_per_block];
+ }
+ double rt = rw[ irys*2 *nksp_per_block];
+ double rt_aa = rt / (aij + ak);
+
+ if (lij > 0) {
+ __syncthreads();
+ double rt_aij = rt_aa * ak;
+ double b10 = .5/aij * (1 - rt_aij);
+ // gx(0,n+1) = c0*gx(0,n) + n*b10*gx(0,n-1)
+ for (int n = gout_id; n < 3; n += gout_stride) {
+ double *_gx = gx + n * gx_len;
+ double xpa = rjri[n*nksp_per_block] * aj_aij;
+ //double c0x = Rpa[ir] - rt_aij * Rpq[n];
+ double c0x = xpa - rt_aij * Rpq[n*nksp_per_block];
+ s0x = _gx[0];
+ s1x = c0x * s0x;
+ _gx[nksp_per_block] = s1x;
+ for (int i = 1; i < lij; ++i) {
+ s2x = c0x * s1x + i * b10 * s0x;
+ _gx[(i+1)*nksp_per_block] = s2x;
+ s0x = s1x;
+ s1x = s2x;
+ }
+ }
+ }
+
+ if (lk > 0) {
+ int lij3 = (lij+1)*3;
+ double rt_ak = rt_aa * aij;
+ double b00 = .5 * rt_aa;
+ double b01 = .5/ak * (1 - rt_ak );
+ for (int n = gout_id; n < lij3+gout_id; n += gout_stride) {
+ __syncthreads();
+ int i = n / 3; //for i in range(lij+1):
+ int _ix = n % 3; // TODO: remove _ix for nroots > 2
+ double *_gx = gx + (i + _ix * g_size) * nksp_per_block;
+ double cpx = rt_ak * Rpq[_ix*nksp_per_block];
+ //for i in range(lij+1):
+ // trr(i,1) = c0p * trr(i,0) + i*b00 * trr(i-1,0)
+ if (n < lij3) {
+ s0x = _gx[0];
+ s1x = cpx * s0x;
+ if (i > 0) {
+ s1x += i * b00 * _gx[-nksp_per_block];
+ }
+ _gx[stride_k*nksp_per_block] = s1x;
+ }
+ //for k in range(1, lk):
+ // for i in range(lij+1):
+ // trr(i,k+1) = cp * trr(i,k) + k*b01 * trr(i,k-1) + i*b00 * trr(i-1,k)
+ for (int k = 1; k < lk; ++k) {
+ __syncthreads();
+ if (n < lij3) {
+ s2x = cpx*s1x + k*b01*s0x;
+ if (i > 0) {
+ s2x += i * b00 * _gx[(k*stride_k-1)*nksp_per_block];
+ }
+ _gx[(k*stride_k+stride_k)*nksp_per_block] = s2x;
+ s0x = s1x;
+ s1x = s2x;
+ }
+ }
+ }
+ }
+
+ // hrr
+ // g(i,j+1) = rirj * g(i,j) + g(i+1,j)
+ // g(...,k,l+1) = rkrl * g(...,k,l) + g(...,k+1,l)
+ if (lj > 0) {
+ __syncthreads();
+ if (task_id < ntasks) {
+ int lk3 = (lk+1)*3;
+ for (int m = gout_id; m < lk3; m += gout_stride) {
+ int k = m / 3;
+ int _ix = m % 3;
+ double xjxi = rjri[_ix*nksp_per_block];
+ double *_gx = g + (_ix*g_size + k*stride_k) * nksp_per_block;
+ for (int j = 0; j < lj; ++j) {
+ int ij = (lij-j) + j*stride_j;
+ s1x = _gx[ij*nksp_per_block];
+ for (--ij; ij >= j*stride_j; --ij) {
+ s0x = _gx[ij*nksp_per_block];
+ _gx[(ij+stride_j)*nksp_per_block] = s1x - xjxi * s0x;
+ s1x = s0x;
+ }
+ }
+ }
+ }
+ }
+
+ __syncthreads();
+#pragma unroll
+ for (int n = 0; n < GOUT_WIDTH; ++n) {
+ int ijk = gout_start + n*gout_stride+gout_id;
+ int k = ijk / nfij;
+ int ij = ijk % nfij;
+ if (k >= nfk) break;
+ int addrx = (idx_ij[ij] + idx_k[k] * stride_k) * nksp_per_block;
+ int addry = (idy_ij[ij] + idy_k[k] * stride_k) * nksp_per_block;
+ int addrz = (idz_ij[ij] + idz_k[k] * stride_k) * nksp_per_block;
+ gout[n] += gx[addrx] * gy[addry] * gz[addrz];
+ }
+ }
+ }
+ }
+
+ if (pair_ij_idx < bounds.npairs_ij) {
+ int *ao_loc = envs.ao_loc;
+ int nbasp = envs.cell0_nbas;
+ int ncells = envs.bvk_ncells;
+ int cell_i = ish / nbasp;
+ int cell0_ish = ish % nbasp;
+ int cell_j = jsh / nbasp;
+ int cell0_jsh = jsh % nbasp;
+ int nrow = bounds.nrow;
+ int ncol = bounds.ncol;
+ size_t naux = bounds.naux;
+ int i0 = ao_loc[cell0_ish] - ao_loc[bounds.ish0];
+ int j0 = ao_loc[cell0_jsh] - ao_loc[bounds.jsh0];
+ int k0 = ao_loc[ksh] - ao_loc[bounds.ksh0];
+ double *eri_tensor = out + (((cell_i * nrow + i0) * ncells +
+ cell_j) * ncol + j0) * naux + k0;
+ int nKj = ncells * ncol;
+ for (int n = 0; n < GOUT_WIDTH; ++n) {
+ int ijk = gout_start + n*gout_stride+gout_id;
+ size_t k = ijk / nfij;
+ size_t ij = ijk % nfij;
+ if (k >= nfk) break;
+ size_t i = ij % nfi;
+ size_t j = ij / nfi;
+ size_t addr = (i*nKj+j)*naux + k;
+ eri_tensor[addr] = gout[n];
+ }
+ }
+ }
+ }
+}
+
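
Note: the hrr comment in the kernel above, g(i,j+1) = rirj*g(i,j) + g(i+1,j), follows from splitting (x - xj) as (x - xi) + (xi - xj) in the integrand; the identity holds pointwise, so any quadrature satisfies it exactly up to rounding. A small NumPy check for 1D Gaussian-pair moments (parameters arbitrary):

    import numpy as np

    ai, aj, xi, xj = 0.8, 1.3, 0.1, 0.7
    x, dx = np.linspace(-10.0, 10.0, 100001, retstep=True)
    w = np.exp(-ai*(x - xi)**2 - aj*(x - xj)**2)

    def s(i, j):
        # discrete moment s(i,j) = sum_x (x-xi)^i (x-xj)^j w(x) dx
        return float(np.sum((x - xi)**i * (x - xj)**j * w) * dx)

    for i in range(3):
        for j in range(3):
            # s(i,j+1) = s(i+1,j) + (xi-xj)*s(i,j), i.e. the hrr coded above
            assert abs(s(i, j+1) - (s(i+1, j) + (xi - xj)*s(i, j))) < 1e-9
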
+__global__
+void sr_int3c2e_img_counts_kernel(int *img_counts, PBCInt3c2eEnvVars envs,
+ float *exps, float *log_coeff, float *aux_exps,
+ int ish0, int jsh0, int nish, int njsh)
+{
+ int Ki = blockIdx.x;
+ int Kj = blockIdx.y;
+ int cell_i = Ki / nish;
+ int cell_j = Kj / njsh;
+ int cell0_ish = Ki % nish + ish0;
+ int cell0_jsh = Kj % njsh + jsh0;
+ int nbasp = envs.cell0_nbas;
+ int ish = cell_i * nbasp + cell0_ish;
+ int jsh = cell_j * nbasp + cell0_jsh;
+ int ncells = envs.bvk_ncells;
+ int nKj = ncells * njsh;
+ int thread_id = threadIdx.x;
+ int threads = blockDim.x;
+ int nimgs = envs.nimgs;
+ int nimgs2 = nimgs * nimgs;
+ int cell0_natm = envs.cell0_natm;
+ int *atm = envs.atm;
+ int *bas = envs.bas;
+ double *env = envs.env;
+ double *img_coords = envs.img_coords;
+ extern __shared__ float x_cache[];
+ float *y_cache = x_cache + cell0_natm;
+ float *z_cache = y_cache + cell0_natm;
+ for (int k = thread_id; k < cell0_natm; k += threads) {
+ double *rk = env + atm[k*ATM_SLOTS+PTR_COORD];
+ x_cache[k] = rk[0];
+ y_cache[k] = rk[1];
+ z_cache[k] = rk[2];
+ }
+ __syncthreads();
+
+ int li = bas[ANG_OF + ish0*BAS_SLOTS];
+ int lj = bas[ANG_OF + jsh0*BAS_SLOTS];
+ float ai = exps[cell0_ish];
+ float aj = exps[cell0_jsh];
+ float log_ci = log_coeff[cell0_ish];
+ float log_cj = log_coeff[cell0_jsh];
+ float aij = ai + aj;
+ float u = .5f / aij;
+ float fi = ai / aij;
+ float fj = aj / aij;
+ float theta_ij = ai * aj / aij;
+ float omega = env[PTR_RANGE_OMEGA];
+ if (omega == 0) {
+ omega = 0.1f;
+ }
+ float omega2 = omega * omega;
+ double *ri = env + bas[ish*BAS_SLOTS+PTR_BAS_COORD];
+ double *rj = env + bas[jsh*BAS_SLOTS+PTR_BAS_COORD];
+ float xi = ri[0];
+ float yi = ri[1];
+ float zi = ri[2];
+ float xj = rj[0];
+ float yj = rj[1];
+ float zj = rj[2];
+ float log_cutoff = envs.log_cutoff;
+
+ // fac_guess = log(sqrt(2.x/(omega*sqrt(pi))) * ((2*li+1)*(2*lj+1)*(2*lk+1))**.5/(4*pi)**1.5)
+ // ~ between [0, 2]
+ float fac_guess = .5f - logf(omega2)/4;
+ float log_fac = log_ci + log_cj + 1.717f - 1.5f*logf(aij) + fac_guess;
+
+ int count = 0;
+ for (int ijL = thread_id; ijL < nimgs2; ijL += threads) {
+ int iL = ijL / nimgs;
+ int jL = ijL % nimgs;
+ float xiL = xi + img_coords[iL*3+0];
+ float yiL = yi + img_coords[iL*3+1];
+ float ziL = zi + img_coords[iL*3+2];
+ float xjL = xj + img_coords[jL*3+0];
+ float yjL = yj + img_coords[jL*3+1];
+ float zjL = zj + img_coords[jL*3+2];
+ float xjxi = xjL - xiL;
+ float yjyi = yjL - yiL;
+ float zjzi = zjL - ziL;
+ float xij = xjxi * fj + xiL;
+ float yij = yjyi * fj + yiL;
+ float zij = zjzi * fj + ziL;
+ float theta = (omega2 * aij) / (omega2 + aij);
+ float rr_min = 1e3f;
+ float theta_rr_min = 1e6f;
+ for (int k = 0; k < cell0_natm; ++k) {
+ float dx = xij - x_cache[k];
+ float dy = yij - y_cache[k];
+ float dz = zij - z_cache[k];
+ float rr = dx * dx + dy * dy + dz * dz;
+ float ak = aux_exps[k];
+ float theta_k = theta * ak / (theta + ak);
+ float theta_rr = theta_k * rr;
+ if (theta_rr < theta_rr_min) {
+ theta_rr_min = theta_rr;
+ rr_min = rr;
+ }
+ }
+
+ // exp(- 1/(1/aij+1/ak+1/omega^2) * r_guess^2) < 1e-9
+ // => ~ exp(- omega^2 * r_guess^2) < 1e-9
+ // => r_guess > 5/omega
+ // 1/(1/aij+1/ak+1/omega^2)*r_guess/aij in Eq 64 of arXiv:2302.11307
+ // ~ omega^2*r_guess/aij ~ omega/aij * 5.f
+ //float rt_aij = fabs(omega)/aij * 5.;
+ float rt_aij = omega2 * sqrtf(rr_min) / aij + 1e-9f;
+ float rr_ij = xjxi * xjxi + yjyi * yjyi + zjzi * zjzi;
+ float dr = sqrtf(rr_ij);
+ float dri = fj * dr + rt_aij;
+ float drj = fi * dr + rt_aij;
+ float dri_fac = .5f*li * logf(dri*dri + li*u);
+ float drj_fac = .5f*lj * logf(drj*drj + lj*u);
+ float estimator = log_fac + dri_fac + drj_fac - theta_ij*rr_ij - theta_rr_min;
+ if (estimator > log_cutoff) {
+ count += 1;
+ }
+ }
+
+ extern __shared__ int counts[];
+ counts[thread_id] = count;
+ __syncthreads();
+ for (int stride = threads / 2; stride > 0; stride /= 2) {
+ if (thread_id < stride) {
+ counts[thread_id] += counts[thread_id + stride];
+ }
+ __syncthreads();
+ }
+ if (thread_id == 0) {
+ img_counts[Ki*nKj+Kj] = counts[0];
+ }
+}
+
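
Note: the image-screening kernels in this file share the log-space magnitude estimator sketched in the comments (cf. Eq 63/64 of arXiv:2302.11307). A hedged Python transcription for a single image pair, handy for inspecting cutoff behavior on the CPU; all arguments are plain floats mirroring the kernel's local variables (li, lj: angular momenta; ai, aj: exponents; log_ci, log_cj: log contraction coefficients; rij2: squared ij image distance; rr_min, theta_rr_min: the minima over auxiliary atoms computed in the kernels):

    import math

    def keep_image_pair(li, lj, ai, aj, log_ci, log_cj, omega,
                        rij2, rr_min, theta_rr_min, log_cutoff):
        aij = ai + aj
        u = .5 / aij
        fi, fj = ai/aij, aj/aij
        theta_ij = ai * aj / aij
        omega2 = omega * omega
        fac_guess = .5 - math.log(omega2)/4
        log_fac = log_ci + log_cj + 1.717 - 1.5*math.log(aij) + fac_guess
        rt_aij = omega2 * math.sqrt(rr_min) / aij + 1e-9
        dr = math.sqrt(rij2)
        dri = fj*dr + rt_aij
        drj = fi*dr + rt_aij
        dri_fac = .5*li * math.log(dri*dri + li*u)
        drj_fac = .5*lj * math.log(drj*drj + lj*u)
        estimator = log_fac + dri_fac + drj_fac - theta_ij*rij2 - theta_rr_min
        return estimator > log_cutoff   # keep the image pair if it survives
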
+__global__
+void sr_int3c2e_img_idx_kernel(int *img_idx, int *img_offsets, int *bas_mapping,
+ PBCInt3c2eEnvVars envs,
+ float *exps, float *log_coeff, float *aux_exps,
+ int ish0, int jsh0, int nish, int njsh)
+{
+ int thread_id = threadIdx.x;
+ int threads = blockDim.x;
+ int ncells = envs.bvk_ncells;
+ int nKj = ncells * njsh;
+ int row_id = blockIdx.x;
+ int bas_ij = bas_mapping[row_id];
+ int Ki = bas_ij / nKj;
+ int Kj = bas_ij % nKj;
+ int cell_i = Ki / nish;
+ int cell_j = Kj / njsh;
+ int cell0_ish = Ki % nish + ish0;
+ int cell0_jsh = Kj % njsh + jsh0;
+ int nbasp = envs.cell0_nbas;
+ int ish = cell_i * nbasp + cell0_ish;
+ int jsh = cell_j * nbasp + cell0_jsh;
+ int nimgs = envs.nimgs;
+ int nimgs2 = nimgs * nimgs;
+ int cell0_natm = envs.cell0_natm;
+ int *atm = envs.atm;
+ int *bas = envs.bas;
+ double *env = envs.env;
+ double *img_coords = envs.img_coords;
+ extern __shared__ int8_t mask[];
+ uint16_t* cum_count = (uint16_t *)(mask + IMG_BLOCK);
+ float *x_cache = (float *)(cum_count + threads);
+ float *y_cache = x_cache + cell0_natm;
+ float *z_cache = y_cache + cell0_natm;
+ for (int k = thread_id; k < cell0_natm; k += threads) {
+ double *rk = env + atm[k*ATM_SLOTS+PTR_COORD];
+ x_cache[k] = rk[0];
+ y_cache[k] = rk[1];
+ z_cache[k] = rk[2];
+ }
+ for (int i = thread_id; i < IMG_BLOCK; i += threads) {
+ mask[i] = 0;
+ }
+ __syncthreads();
+
+ int li = bas[ANG_OF + ish0*BAS_SLOTS];
+ int lj = bas[ANG_OF + jsh0*BAS_SLOTS];
+ float ai = exps[cell0_ish];
+ float aj = exps[cell0_jsh];
+ float log_ci = log_coeff[cell0_ish];
+ float log_cj = log_coeff[cell0_jsh];
+ float aij = ai + aj;
+ float u = .5f / aij;
+ float fi = ai / aij;
+ float fj = aj / aij;
+ float theta_ij = ai * aj / aij;
+ float omega = env[PTR_RANGE_OMEGA];
+ if (omega == 0) {
+ omega = 0.1f;
+ }
+ float omega2 = omega * omega;
+ double *ri = env + bas[ish*BAS_SLOTS+PTR_BAS_COORD];
+ double *rj = env + bas[jsh*BAS_SLOTS+PTR_BAS_COORD];
+ float xi = ri[0];
+ float yi = ri[1];
+ float zi = ri[2];
+ float xj = rj[0];
+ float yj = rj[1];
+ float zj = rj[2];
+ float log_cutoff = envs.log_cutoff;
+
+ // fac_guess = log(sqrt(2.x/(omega*sqrt(pi))) * ((2*li+1)*(2*lj+1)*(2*lk+1))**.5/(4*pi)**1.5)
+ // ~ between [0, 2]
+ float fac_guess = .5f - logf(omega2)/4;
+ float log_fac = log_ci + log_cj + 1.717f - 1.5f*logf(aij) + fac_guess;
+ int offset_start = img_offsets[row_id];
+
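+ // Image pairs are processed in IMG_BLOCK tiles; each thread scans a
+ // contiguous chunk so that surviving indices are emitted in ascending order.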
+ for (int img_start = 0; img_start < nimgs2; img_start += IMG_BLOCK) {
+ int block_nimgs2 = MIN(IMG_BLOCK, nimgs2-img_start);
+ int batch_size = (block_nimgs2 + threads - 1) / threads;
+ int ij0 = img_start + thread_id * batch_size;
+ int ij1 = MIN(ij0 + batch_size, nimgs2);
+
+ int count = 0;
+ for (int ijL = ij0; ijL < ij1; ++ijL) {
+ int iL = ijL / nimgs;
+ int jL = ijL % nimgs;
+ float xiL = xi + img_coords[iL*3+0];
+ float yiL = yi + img_coords[iL*3+1];
+ float ziL = zi + img_coords[iL*3+2];
+ float xjL = xj + img_coords[jL*3+0];
+ float yjL = yj + img_coords[jL*3+1];
+ float zjL = zj + img_coords[jL*3+2];
+ float xjxi = xjL - xiL;
+ float yjyi = yjL - yiL;
+ float zjzi = zjL - ziL;
+ float xij = xjxi * fj + xiL;
+ float yij = yjyi * fj + yiL;
+ float zij = zjzi * fj + ziL;
+ float theta = (omega2 * aij) / (omega2 + aij);
+ float rr_min = 1e3f;
+ float theta_rr_min = 1e6f;
+ for (int k = 0; k < cell0_natm; ++k) {
+ float dx = xij - x_cache[k];
+ float dy = yij - y_cache[k];
+ float dz = zij - z_cache[k];
+ float rr = dx * dx + dy * dy + dz * dz;
+ float ak = aux_exps[k];
+ float theta_k = theta * ak / (theta + ak);
+ float theta_rr = theta_k * rr;
+ if (theta_rr < theta_rr_min) {
+ theta_rr_min = theta_rr;
+ rr_min = rr;
+ }
+ }
+
+ // exp(- 1/(1/aij+1/ak+1/omega^2) * r_guess^2) < 1e-9
+ // => ~ exp(- omega^2 * r_guess^2) < 1e-9
+ // => r_guess > 5/omega
+ // 1/(1/aij+1/ak+1/omega^2)*r_guess/aij in Eq 64 of arXiv:2302.11307
+ // ~ omega^2*r_guess/aij ~ omega/aij * 5.f
+ //float rt_aij = fabs(omega)/aij * 5.;
+ float rt_aij = omega2 * sqrtf(rr_min) / aij + 1e-9f;
+ float rr_ij = xjxi * xjxi + yjyi * yjyi + zjzi * zjzi;
+ float dr = sqrtf(rr_ij);
+ float dri = fj * dr + rt_aij;
+ float drj = fi * dr + rt_aij;
+ float dri_fac = .5f*li * logf(dri*dri + li*u);
+ float drj_fac = .5f*lj * logf(drj*drj + lj*u);
+ float estimator = log_fac + dri_fac + drj_fac - theta_ij*rr_ij - theta_rr_min;
+ if (estimator > log_cutoff) {
+ mask[ijL - img_start] = 1;
+ count += 1;
+ }
+ }
+
+ cum_count[thread_id] = count;
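+ // Blelloch-style up-/down-sweep: an inclusive prefix sum of the
+ // per-thread counts, used to place each thread's writes below.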
+ // Up-sweep phase
+ for (int stride = 1; stride < threads; stride *= 2) {
+ __syncthreads();
+ int index = (thread_id + 1) * stride * 2 - 1;
+ if (index < threads) {
+ cum_count[index] += cum_count[index-stride];
+ }
+ }
+ __syncthreads();
+ // Down-sweep phase
+ for (int stride = threads/4; stride > 0; stride /= 2) {
+ __syncthreads();
+ int index = (thread_id + 1) * stride * 2 - 1;
+ if (index + stride < threads) {
+ cum_count[index + stride] += cum_count[index];
+ }
+ }
+ __syncthreads();
+
+ int offset = offset_start;
+ if (thread_id > 0) {
+ offset += cum_count[thread_id-1];
+ }
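+ // Stream compaction: thread t writes its surviving image-pair indices
+ // starting at offset_start plus the exclusive prefix cum_count[t-1].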
+ for (int ijL = ij0; ijL < ij1; ++ijL) {
+ if (mask[ijL-img_start]) {
+ img_idx[offset] = ijL;
+ mask[ijL-img_start] = 0;
+ ++offset;
+ }
+ }
+ offset_start += cum_count[threads-1];
+ __syncthreads();
+ }
+}
diff --git a/gpu4pyscf/lib/pbc/ft_ao.cu b/gpu4pyscf/lib/pbc/ft_ao.cu
index d9b6d5e2..40438340 100644
--- a/gpu4pyscf/lib/pbc/ft_ao.cu
+++ b/gpu4pyscf/lib/pbc/ft_ao.cu
@@ -20,7 +20,7 @@
#include
#include "gvhf-rys/vhf.cuh"
-#include "ft_ao.h"
+#include "ft_ao.cuh"
#define GOUT_WIDTH 19
// pi^1.5
@@ -204,7 +204,7 @@ void ft_aopair_kernel(double *out, AFTIntEnvVars envs, AFTBoundsInfo bounds)
#pragma unroll
for (int n = 0; n < GOUT_WIDTH; ++n) {
int ij = n*gout_stride + gout_id;
- if (ij >= nfij) continue;
+ if (ij >= nfij) break;
int addrx = idx_ij[ij] * nGv_per_block;
int addry = idy_ij[ij] * nGv_per_block;
int addrz = idz_ij[ij] * nGv_per_block;
@@ -237,7 +237,7 @@ void ft_aopair_kernel(double *out, AFTIntEnvVars envs, AFTBoundsInfo bounds)
+ Gv_block_id*nGv_per_block + Gv_id) * OF_COMPLEX;
for (int n = 0; n < GOUT_WIDTH; ++n) {
int ij = n*gout_stride + gout_id;
- if (ij >= nfij) continue;
+ if (ij >= nfij) break;
size_t i = ij % nfi;
size_t j = ij / nfi;
size_t addr = (i*nao+j)*nGv;
diff --git a/gpu4pyscf/lib/pbc/ft_ao.h b/gpu4pyscf/lib/pbc/ft_ao.cuh
similarity index 100%
rename from gpu4pyscf/lib/pbc/ft_ao.h
rename to gpu4pyscf/lib/pbc/ft_ao.cuh
diff --git a/gpu4pyscf/lib/pbc/int3c2e.cuh b/gpu4pyscf/lib/pbc/int3c2e.cuh
new file mode 100644
index 00000000..746fa138
--- /dev/null
+++ b/gpu4pyscf/lib/pbc/int3c2e.cuh
@@ -0,0 +1,75 @@
+/*
+ * Copyright 2024 The PySCF Developers. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <stdint.h>
+
+#define WARP_SIZE 32
+// corresponding to 256 threads
+#define WARPS 8
+#define IMG_MASK_SLOTS 1024
+#define L_AUX_MAX 6
+#define SPTASKS_PER_BLOCK 32
+#define IMG_BLOCK 16384
+
+#ifndef HAVE_DEFINED_PBCINT3CENVVARS_H
+#define HAVE_DEFINED_PBCINT3CENVVARS_H
+typedef struct {
+ uint16_t cell0_natm; // in the reference cell
+ uint16_t cell0_nbas; // in the reference cell
+ uint16_t bvk_ncells; // in bvk-cell
+ uint16_t nimgs; // number of images in lattice sum
+ int *atm;
+ int *bas;
+ double *env;
+ int *ao_loc; // in bvk-cell
+ double *img_coords; // vectors in lattice sum
+ float log_cutoff;
+} PBCInt3c2eEnvVars;
+
+typedef struct {
+ uint8_t li;
+ uint8_t lj;
+ uint8_t lk;
+ uint8_t nroots;
+ uint8_t nfi;
+ uint8_t nfij;
+ uint8_t nfk;
+ uint8_t iprim;
+ uint8_t jprim;
+ uint8_t kprim;
+ uint8_t stride_i;
+ uint8_t stride_j;
+ uint8_t stride_k;
+ uint8_t g_size;
+ uint16_t nrow;
+ uint16_t ncol;
+ uint16_t naux;
+ uint16_t nksh;
+ uint16_t ish0;
+ uint16_t jsh0;
+ uint16_t ksh0;
+ int npairs_ij;
+ int *bas_ij_idx;
+ int *img_idx; // indices of img_coords in each shell-pair
+ int *img_offsets; // offset img_idx for each shell-pair
+} PBCInt3c2eBounds;
+
+#ifdef __CUDACC__
+extern __constant__ int c_g_pair_idx[];
+extern __constant__ int c_g_pair_offsets[];
+extern __constant__ int c_g_cart_idx[];
+#endif
+#endif
diff --git a/gpu4pyscf/lib/pbc/pbc_driver.cu b/gpu4pyscf/lib/pbc/pbc_driver.cu
index b800efd6..45ab84cb 100644
--- a/gpu4pyscf/lib/pbc/pbc_driver.cu
+++ b/gpu4pyscf/lib/pbc/pbc_driver.cu
@@ -1,3 +1,19 @@
+/*
+ * Copyright 2024-2025 The PySCF Developers. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
#include
#include
#include
@@ -5,23 +21,37 @@
#include
#include "gvhf-rys/vhf.cuh"
-#include "ft_ao.h"
+#include "int3c2e.cuh"
+#include "ft_ao.cuh"
-__constant__ int c_g_pair_idx[3675];
+__constant__ int c_g_pair_idx[3675]; // corresponding to LMAX=4
__constant__ int c_g_pair_offsets[LMAX1*LMAX1];
+__constant__ int c_g_cart_idx[252]; // corresponding to LMAX=6
extern __global__
void ft_aopair_kernel(double *out, AFTIntEnvVars envs, AFTBoundsInfo bounds);
extern __global__
void ft_aopair_fill_triu(double *out, int *conj_mapping, int bvk_ncells, int nGv);
+extern __global__
+void pbc_int3c2e_kernel(double *out, PBCInt3c2eEnvVars envs, PBCInt3c2eBounds bounds);
+extern __global__
+void sr_int3c2e_img_counts_kernel(int *img_counts, PBCInt3c2eEnvVars envs,
+ float *exps, float *log_coeff, float *aux_exps,
+ int ish0, int jsh0, int nish, int njsh);
+extern __global__
+void sr_int3c2e_img_idx_kernel(int *img_idx, int *img_offsets, int *bas_mapping,
+ PBCInt3c2eEnvVars envs,
+ float *exps, float *log_coeff, float *aux_exps,
+ int ish0, int jsh0, int nish, int njsh);
int ft_ao_unrolled(double *out, AFTIntEnvVars *envs, AFTBoundsInfo *bounds, int *scheme);
+int int3c2e_unrolled(double *out, PBCInt3c2eEnvVars *envs, PBCInt3c2eBounds *bounds);
extern "C" {
-int PBC_build_ft_ao(double *out, AFTIntEnvVars *envs,
- int *scheme, int *shls_slice, int npairs_ij, int ngrids,
- int *ish_in_pair, int *jsh_in_pair, double *grids,
- int *atm, int natm, int *bas, int nbas, double *env)
+int build_ft_ao(double *out, AFTIntEnvVars *envs,
+ int *scheme, int *shls_slice, int npairs_ij, int ngrids,
+ int *ish_in_pair, int *jsh_in_pair, double *grids,
+ int *atm, int natm, int *bas, int nbas, double *env)
{
uint16_t ish0 = shls_slice[0];
uint16_t jsh0 = shls_slice[2];
@@ -53,13 +83,13 @@ int PBC_build_ft_ao(double *out, AFTIntEnvVars *envs,
}
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess) {
- fprintf(stderr, "CUDA Error in PBC_build_ft_ao: %s\n", cudaGetErrorString(err));
+ fprintf(stderr, "CUDA Error in build_ft_ao: %s\n", cudaGetErrorString(err));
return 1;
}
return 0;
}
-int PBC_ft_aopair_fill_triu(double *out, int *conj_mapping, int nao, int bvk_ncells, int nGv)
+int ft_aopair_fill_triu(double *out, int *conj_mapping, int nao, int bvk_ncells, int nGv)
{
int nGv2 = nGv * 2; // *2 for complex number
int threads = 1024;
@@ -67,18 +97,147 @@ int PBC_ft_aopair_fill_triu(double *out, int *conj_mapping, int nao, int bvk_nce
ft_aopair_fill_triu<<>>(out, conj_mapping, bvk_ncells, nGv2);
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess) {
- fprintf(stderr, "CUDA Error in PBC_ft_aopair_fill_triu: %s\n", cudaGetErrorString(err));
+ fprintf(stderr, "CUDA Error in ft_aopair_fill_triu: %s\n", cudaGetErrorString(err));
+ return 1;
+ }
+ return 0;
+}
+
+int fill_int3c2e(double *out, PBCInt3c2eEnvVars *envs,
+ int *scheme, int *shls_slice, int bvk_ncells,
+ int nrow, int ncol, int naux, int npairs_ij,
+ int *bas_ij_idx, int *img_idx, int *img_offsets,
+ int *atm, int natm, int *bas, int nbas, double *env)
+{
+ uint16_t ish0 = shls_slice[0];
+ uint16_t jsh0 = shls_slice[2];
+ uint16_t ksh0 = shls_slice[4] + nbas;
+ uint16_t ksh1 = shls_slice[5] + nbas;
+ uint16_t nksh = ksh1 - ksh0;
+ uint8_t li = bas[ANG_OF + ish0*BAS_SLOTS];
+ uint8_t lj = bas[ANG_OF + jsh0*BAS_SLOTS];
+ uint8_t lk = bas[ANG_OF + ksh0*BAS_SLOTS];
+ uint8_t iprim = bas[NPRIM_OF + ish0*BAS_SLOTS];
+ uint8_t jprim = bas[NPRIM_OF + jsh0*BAS_SLOTS];
+ uint8_t kprim = bas[NPRIM_OF + ksh0*BAS_SLOTS];
+ uint8_t nfi = (li+1)*(li+2)/2;
+ uint8_t nfj = (lj+1)*(lj+2)/2;
+ uint8_t nfk = (lk+1)*(lk+2)/2;
+ uint8_t nfij = nfi * nfj;
+ uint8_t order = li + lj + lk;
+ uint8_t nroots = order / 2 + 1;
+ double omega = env[PTR_RANGE_OMEGA];
+ if (omega < 0) { // SR ERIs
+ nroots *= 2;
+ }
+ uint8_t stride_i = 1;
+ uint8_t stride_j = li + 1;
+ uint8_t stride_k = stride_j * (lj + 1);
+ // up to (gg|i)
+ uint8_t g_size = stride_k * (lk + 1);
+ PBCInt3c2eBounds bounds = {li, lj, lk, nroots, nfi, nfij, nfk,
+ iprim, jprim, kprim, stride_i, stride_j, stride_k, g_size,
+ (uint16_t)nrow, (uint16_t)ncol, (uint16_t)naux, nksh, ish0, jsh0, ksh0,
+ npairs_ij, bas_ij_idx, img_idx, img_offsets};
+
+ if (!int3c2e_unrolled(out, envs, &bounds)) {
+ int nksh_per_block = scheme[0];
+ int gout_stride = scheme[1];
+ int nsp_per_block = scheme[2];
+ dim3 threads(nksh_per_block, gout_stride, nsp_per_block);
+ int tasks_per_block = SPTASKS_PER_BLOCK * nsp_per_block;
+ int sp_blocks = (npairs_ij + tasks_per_block - 1) / tasks_per_block;
+ int ksh_blocks = (nksh + nksh_per_block - 1) / nksh_per_block;
+ dim3 blocks(sp_blocks, ksh_blocks);
+ int buflen = (nroots*2+g_size*3+6) * (nksh_per_block * nsp_per_block) * sizeof(double);
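+ // dynamic shared memory: (nroots*2 + g_size*3 + 6) doubles for each of
+ // the nksh_per_block * nsp_per_block workers in a block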
+ pbc_int3c2e_kernel<<<blocks, threads, buflen>>>(out, *envs, bounds);
+ }
+ cudaError_t err = cudaGetLastError();
+ if (err != cudaSuccess) {
+ fprintf(stderr, "CUDA Error in fill_int3c2e: %s\n", cudaGetErrorString(err));
+ return 1;
+ }
+ return 0;
+}
+
+int int3c2e_img_counts(int *img_counts, PBCInt3c2eEnvVars *envs,
+ int *shls_slice, float *exps, float *log_cs, float *aux_exps,
+ int bvk_ncells, int cell0_natm)
+{
+ int ish0 = shls_slice[0];
+ int ish1 = shls_slice[1];
+ int jsh0 = shls_slice[2];
+ int jsh1 = shls_slice[3];
+ int nish = ish1 - ish0;
+ int njsh = jsh1 - jsh0;
+ dim3 blocks(bvk_ncells*nish, bvk_ncells*njsh);
+ int buflen = cell0_natm * 3 * sizeof(float);
+ int threads = 512;
+ buflen = MAX(buflen, threads*sizeof(int));
+ sr_int3c2e_img_counts_kernel<<<blocks, threads, buflen>>>(
+ img_counts, *envs, exps, log_cs, aux_exps, ish0, jsh0, nish, njsh);
+ cudaError_t err = cudaGetLastError();
+ if (err != cudaSuccess) {
+ fprintf(stderr, "CUDA Error in int3c2e_q_mask: %s\n", cudaGetErrorString(err));
return 1;
}
return 0;
}
-int PBC_FT_init_constant(int *g_pair_idx, int *offsets,
- double *env, int env_size, int shm_size)
+int int3c2e_img_idx(int *img_idx, int *img_offsets, int *bas_mapping, int nrow,
+ PBCInt3c2eEnvVars *envs,
+ int *shls_slice, float *exps, float *log_cs, float *aux_exps,
+ int bvk_ncells, int cell0_natm)
+
+{
+ int ish0 = shls_slice[0];
+ int ish1 = shls_slice[1];
+ int jsh0 = shls_slice[2];
+ int jsh1 = shls_slice[3];
+ int nish = ish1 - ish0;
+ int njsh = jsh1 - jsh0;
+ dim3 blocks(bvk_ncells*nish, bvk_ncells*njsh);
+ int buflen = cell0_natm * 3 * sizeof(float);
+ int threads = 512;
+ buflen = buflen + threads*sizeof(uint16_t) + IMG_BLOCK;
+ sr_int3c2e_img_idx_kernel<<<nrow, threads, buflen>>>(
+ img_idx, img_offsets, bas_mapping, *envs,
+ exps, log_cs, aux_exps, ish0, jsh0, nish, njsh);
+ cudaError_t err = cudaGetLastError();
+ if (err != cudaSuccess) {
+ fprintf(stderr, "CUDA Error in int3c2e_img_idx: %s\n", cudaGetErrorString(err));
+ return 1;
+ }
+ return 0;
+}
+
+int init_constant(int *g_pair_idx, int *offsets,
+ double *env, int env_size, int shm_size)
{
cudaMemcpyToSymbol(c_g_pair_idx, g_pair_idx, 3675*sizeof(int));
cudaMemcpyToSymbol(c_g_pair_offsets, offsets, sizeof(int) * LMAX1*LMAX1);
+
+ int *g_cart_idx = (int *)malloc(252*sizeof(int));
+ int *idx, *idy, *idz;
+ idx = g_cart_idx;
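+ // Enumerate Cartesian powers (ix,iy,iz) for each l up to L_AUX_MAX in
+ // x-major descending order; the layout per l is [idx|idy|idz] blocks of
+ // nf entries each, 252 ints in total.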
+ for (int l = 0; l <= L_AUX_MAX; ++l) {
+ int nf = (l + 1) * (l + 2) / 2;
+ idy = idx + nf;
+ idz = idy + nf;
+ for (int i = 0, ix = l; ix >= 0; --ix) {
+ for (int iy = l - ix; iy >= 0; --iy, ++i) {
+ int iz = l - ix - iy;
+ idx[i] = ix;
+ idy[i] = iy;
+ idz[i] = iz;
+ } }
+ idx += nf * 3;
+ }
+ cudaMemcpyToSymbol(c_g_cart_idx, g_cart_idx, 252*sizeof(int));
+ free(g_cart_idx);
+
cudaFuncSetAttribute(ft_aopair_kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, shm_size);
+ cudaFuncSetAttribute(pbc_int3c2e_kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, shm_size);
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess) {
fprintf(stderr, "Failed to set CUDA shm size %d: %s\n", shm_size,
diff --git a/gpu4pyscf/lib/pbc/rys_roots.cu b/gpu4pyscf/lib/pbc/rys_roots.cu
new file mode 100644
index 00000000..a8700bd2
--- /dev/null
+++ b/gpu4pyscf/lib/pbc/rys_roots.cu
@@ -0,0 +1,84 @@
+/*
+ * Copyright 2024-2025 The PySCF Developers. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "gvhf-rys/rys_roots.cuh"
+
+#define SQRTPIE4 .8862269254527580136
+#define PIE4 .7853981633974483096
+
+__device__
+static void rys_roots(int nroots, double x, double *rw,
+ int block_size, int worker_id, int workers)
+{
+ if (x < 3.e-7){
+ int off = nroots * (nroots - 1) / 2;
+ for (int i = worker_id; i < nroots; i += workers) {
+ rw[(i*2 )*block_size] = ROOT_SMALLX_R0[off+i] + ROOT_SMALLX_R1[off+i] * x;
+ rw[(i*2+1)*block_size] = ROOT_SMALLX_W0[off+i] + ROOT_SMALLX_W1[off+i] * x;
+ }
+ return;
+ }
+
+ if (nroots == 1) {
+ if (worker_id == 0) {
+ double tt = sqrt(x);
+ double fmt0 = SQRTPIE4 / tt * erf(tt);
+ rw[block_size] = fmt0;
+ double e = exp(-x);
+ double b = .5 / x;
+ double fmt1 = b * (fmt0 - e);
+ rw[0] = fmt1 / fmt0;
+ }
+ return;
+ }
+
+ if (x > 35+nroots*5) {
+ int off = nroots * (nroots - 1) / 2;
+ double t = sqrt(PIE4/x);
+ for (int i = worker_id; i < nroots; i += workers) {
+ rw[(i*2 )*block_size] = ROOT_LARGEX_R_DATA[off+i] / x;
+ rw[(i*2+1)*block_size] = ROOT_LARGEX_W_DATA[off+i] * t;
+ }
+ return;
+ }
+
+ double *datax = ROOT_RW_DATA + DEGREE1*INTERVALS * nroots*(nroots-1);
+ int it = (int)(x * .4);
+ double u = (x - it * 2.5) * 0.8 - 1.;
+ double u2 = u * 2.;
+ for (int rt_id = worker_id; rt_id < nroots*2; rt_id += workers) {
+ double *c = datax + rt_id * DEGREE1 * INTERVALS;
+ //for i in range(2, degree + 1):
+ // c0, c1 = c[degree-i] - c1, c0 + c1*u2
+ double c0 = c[it + DEGREE *INTERVALS];
+ double c1 = c[it +(DEGREE-1)*INTERVALS];
+ double c2, c3;
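+ // Clenshaw evaluation of the Chebyshev expansion tabulated per interval;
+ // two recurrence steps per loop iteration (scalar form in the comment above).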
+#pragma unroll
+ for (int n = DEGREE-2; n > 0; n-=2) {
+ c2 = c[it + n *INTERVALS] - c1;
+ c3 = c0 + c1*u2;
+ c1 = c2 + c3*u2;
+ c0 = c[it +(n-1)*INTERVALS] - c3;
+ }
+ if (DEGREE % 2 == 0) {
+ c2 = c[it] - c1;
+ c3 = c0 + c1*u2;
+ rw[rt_id*block_size] = c2 + c3*u;
+ } else {
+ rw[rt_id*block_size] = c0 + c1*u;
+ }
+ }
+}
diff --git a/gpu4pyscf/lib/pbc/unrolled_ft_ao.cu b/gpu4pyscf/lib/pbc/unrolled_ft_ao.cu
index d2845274..d95d22b2 100644
--- a/gpu4pyscf/lib/pbc/unrolled_ft_ao.cu
+++ b/gpu4pyscf/lib/pbc/unrolled_ft_ao.cu
@@ -1,5 +1,5 @@
/*
- * Copyright 2024 The PySCF Developers. All Rights Reserved.
+ * Copyright 2024-2025 The PySCF Developers. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -19,7 +19,7 @@
#include
#include
#include "gvhf-rys/vhf.cuh"
-#include "ft_ao.h"
+#include "ft_ao.cuh"
#define OVERLAP_FAC 5.56832799683170787
#define OF_COMPLEX 2
diff --git a/gpu4pyscf/lib/pbc/unrolled_int3c2e.cu b/gpu4pyscf/lib/pbc/unrolled_int3c2e.cu
new file mode 100644
index 00000000..0c7e0174
--- /dev/null
+++ b/gpu4pyscf/lib/pbc/unrolled_int3c2e.cu
@@ -0,0 +1,22 @@
+/*
+ * Copyright 2024-2025 The PySCF Developers. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "int3c2e.cuh"
+
+int int3c2e_unrolled(double *out, PBCInt3c2eEnvVars *envs, PBCInt3c2eBounds *bounds)
+{
+ return 0;
+}
diff --git a/gpu4pyscf/lib/solvent/pcm.cu b/gpu4pyscf/lib/solvent/pcm.cu
index 7615f314..4d34ce97 100644
--- a/gpu4pyscf/lib/solvent/pcm.cu
+++ b/gpu4pyscf/lib/solvent/pcm.cu
@@ -78,9 +78,9 @@ static void _pcm_d_s(double *matrix_d, double *matrix_s,
__global__
static void _pcm_dD_dS(double *matrix_dd, double *matrix_ds,
- const double *coords, const double *norm_vec, const double *r_vdw,
- const double *charge_exp, const double *switch_fun,
- int n)
+ const double *coords, const double *norm_vec,
+ const double *charge_exp,
+ int n)
{
int i = blockIdx.x * blockDim.x + threadIdx.x;
int j = blockIdx.y * blockDim.y + threadIdx.y;
@@ -130,6 +130,127 @@ static void _pcm_dD_dS(double *matrix_dd, double *matrix_ds,
}
}
+__global__
+static void _pcm_d2D_d2S(double *matrix_d2D, double *matrix_d2S,
+ const double *coords, const double *norm_vec,
+ const double *charge_exp,
+ int n)
+{
+ const int i = blockIdx.x * blockDim.x + threadIdx.x;
+ const int j = blockIdx.y * blockDim.y + threadIdx.y;
+ if (i >= n || j >= n) {
+ return;
+ }
+
+ // calculate xi
+ const double ei = charge_exp[i];
+ const double ej = charge_exp[j];
+ const double eij = ei * ej / sqrt(ei*ei + ej*ej);
+
+ // calculate r
+ const double dx = coords[3*i] - coords[3*j];
+ const double dy = coords[3*i+1] - coords[3*j+1];
+ const double dz = coords[3*i+2] - coords[3*j+2];
+ const double rij = norm3d(dx, dy, dz);
+ const double rij_1 = (i != j) ? (1.0 / rij) : 0.0; // This guarantees that if i == j, all matrix elements = 0
+ const double rij_2 = rij_1 * rij_1;
+ const double rij_3 = rij_2 * rij_1;
+ const double rij_4 = rij_2 * rij_2;
+ const double rij_5 = rij_2 * rij_3;
+ const double eij2 = eij * eij;
+
+ const double eij_rij = eij * rij;
+ const double erf_eij_rij = erf(eij_rij);
+ const double exp_minus_eij2_rij2 = exp(-eij_rij * eij_rij);
+ const double two_eij_over_sqrt_pi = 2.0 * eij / SQRT_PI;
+ const double two_eij_over_sqrt_pi_exp_minus_eij2_rij2 = exp_minus_eij2_rij2 * two_eij_over_sqrt_pi;
+
+ const double S_direct_product_prefactor = -two_eij_over_sqrt_pi_exp_minus_eij2_rij2 * (3 * rij_4 + 2 * eij2 * rij_2)
+ + 3 * rij_5 * erf_eij_rij;
+ const double S_xyz_diagonal_prefactor = two_eij_over_sqrt_pi_exp_minus_eij2_rij2 * rij_2 - rij_3 * erf_eij_rij;
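+
+ // Analytic form: S_ij = erf(eij*rij)/rij = f(rij). Its Hessian is
+ // (f''/r^2 - f'/r^3) * dr_a*dr_b + (f'/r) * delta_ab, which the two
+ // prefactors above implement.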
+
+ const int n2 = n * n;
+ matrix_d2S[i*n + j ] = dx * dx * S_direct_product_prefactor + S_xyz_diagonal_prefactor;
+ matrix_d2S[i*n + j + n2 ] = dx * dy * S_direct_product_prefactor;
+ matrix_d2S[i*n + j + n2 * 2] = dx * dz * S_direct_product_prefactor;
+ matrix_d2S[i*n + j + n2 * 3] = dy * dx * S_direct_product_prefactor;
+ matrix_d2S[i*n + j + n2 * 4] = dy * dy * S_direct_product_prefactor + S_xyz_diagonal_prefactor;
+ matrix_d2S[i*n + j + n2 * 5] = dy * dz * S_direct_product_prefactor;
+ matrix_d2S[i*n + j + n2 * 6] = dz * dx * S_direct_product_prefactor;
+ matrix_d2S[i*n + j + n2 * 7] = dz * dy * S_direct_product_prefactor;
+ matrix_d2S[i*n + j + n2 * 8] = dz * dz * S_direct_product_prefactor + S_xyz_diagonal_prefactor;
+
+ if (matrix_d2D != NULL) {
+ const double nxj = norm_vec[3*j];
+ const double nyj = norm_vec[3*j+1];
+ const double nzj = norm_vec[3*j+2];
+ const double nj_rij = dx * nxj + dy * nyj + dz * nzj;
+
+ const double eij4 = eij2 * eij2;
+ const double rij_6 = rij_4 * rij_2;
+ const double rij_7 = rij_4 * rij_3;
+
+ const double D_direct_product_prefactor = (-two_eij_over_sqrt_pi_exp_minus_eij2_rij2 * (15 * rij_6 + 10 * eij2 * rij_4 + 4 * eij4 * rij_2)
+ + 15 * rij_7 * erf_eij_rij) * nj_rij;
+ matrix_d2D[i*n + j ] = D_direct_product_prefactor * dx * dx - S_direct_product_prefactor * (dx * nxj + dx * nxj + nj_rij);
+ matrix_d2D[i*n + j + n2 ] = D_direct_product_prefactor * dx * dy - S_direct_product_prefactor * (dy * nxj + dx * nyj);
+ matrix_d2D[i*n + j + n2 * 2] = D_direct_product_prefactor * dx * dz - S_direct_product_prefactor * (dz * nxj + dx * nzj);
+ matrix_d2D[i*n + j + n2 * 3] = D_direct_product_prefactor * dy * dx - S_direct_product_prefactor * (dx * nyj + dy * nxj);
+ matrix_d2D[i*n + j + n2 * 4] = D_direct_product_prefactor * dy * dy - S_direct_product_prefactor * (dy * nyj + dy * nyj + nj_rij);
+ matrix_d2D[i*n + j + n2 * 5] = D_direct_product_prefactor * dy * dz - S_direct_product_prefactor * (dz * nyj + dy * nzj);
+ matrix_d2D[i*n + j + n2 * 6] = D_direct_product_prefactor * dz * dx - S_direct_product_prefactor * (dx * nzj + dz * nxj);
+ matrix_d2D[i*n + j + n2 * 7] = D_direct_product_prefactor * dz * dy - S_direct_product_prefactor * (dy * nzj + dz * nyj);
+ matrix_d2D[i*n + j + n2 * 8] = D_direct_product_prefactor * dz * dz - S_direct_product_prefactor * (dz * nzj + dz * nzj + nj_rij);
+ }
+}
+
+__global__
+static void _pcm_d2F_to_d2Sii(const double* F, const double* dF, const double* d2F, const double* charge_exp,
+ double* d2Sii, const int n_atom, const int n_grid)
+{
+ const int i_grid = blockIdx.x * blockDim.x + threadIdx.x;
+ const int ij_atom = blockIdx.y * blockDim.y + threadIdx.y;
+ if (i_grid >= n_grid || ij_atom >= n_atom * n_atom) {
+ return;
+ }
+
+ const int i_atom = ij_atom / n_atom;
+ const int j_atom = ij_atom % n_atom;
+
+ const double zeta = charge_exp[i_grid];
+ const double F_value = F[i_grid];
+ const double F_1 = 1.0 / F_value;
+ const double F_2 = F_1 * F_1;
+ const double combined_factor = SQRT2_PI * zeta * F_2;
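+ // Diagonal elements S_ii = zeta*sqrt(2/pi)/F; the chain rule gives
+ // d2(S_ii) = SQRT2_PI*zeta*(2*dF_i*dF_j/F^3 - d2F_ij/F^2),
+ // i.e. combined_factor * (2*F_1*dF_i*dF_j - d2F_ij) below.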
+
+ const double dFix = dF[(i_atom * 3 ) * n_grid + i_grid];
+ const double dFiy = dF[(i_atom * 3 + 1) * n_grid + i_grid];
+ const double dFiz = dF[(i_atom * 3 + 2) * n_grid + i_grid];
+ const double dFjx = dF[(j_atom * 3 ) * n_grid + i_grid];
+ const double dFjy = dF[(j_atom * 3 + 1) * n_grid + i_grid];
+ const double dFjz = dF[(j_atom * 3 + 2) * n_grid + i_grid];
+
+ const double d2Fixjx = d2F[((i_atom * n_atom + j_atom) * 9 + 0 * 3 ) * n_grid + i_grid];
+ const double d2Fixjy = d2F[((i_atom * n_atom + j_atom) * 9 + 0 * 3 + 1) * n_grid + i_grid];
+ const double d2Fixjz = d2F[((i_atom * n_atom + j_atom) * 9 + 0 * 3 + 2) * n_grid + i_grid];
+ const double d2Fiyjx = d2F[((i_atom * n_atom + j_atom) * 9 + 1 * 3 ) * n_grid + i_grid];
+ const double d2Fiyjy = d2F[((i_atom * n_atom + j_atom) * 9 + 1 * 3 + 1) * n_grid + i_grid];
+ const double d2Fiyjz = d2F[((i_atom * n_atom + j_atom) * 9 + 1 * 3 + 2) * n_grid + i_grid];
+ const double d2Fizjx = d2F[((i_atom * n_atom + j_atom) * 9 + 2 * 3 ) * n_grid + i_grid];
+ const double d2Fizjy = d2F[((i_atom * n_atom + j_atom) * 9 + 2 * 3 + 1) * n_grid + i_grid];
+ const double d2Fizjz = d2F[((i_atom * n_atom + j_atom) * 9 + 2 * 3 + 2) * n_grid + i_grid];
+
+ d2Sii[((i_atom * n_atom + j_atom) * 9 + 0 * 3 ) * n_grid + i_grid] = combined_factor * (2 * F_1 * dFix * dFjx - d2Fixjx);
+ d2Sii[((i_atom * n_atom + j_atom) * 9 + 0 * 3 + 1) * n_grid + i_grid] = combined_factor * (2 * F_1 * dFix * dFjy - d2Fixjy);
+ d2Sii[((i_atom * n_atom + j_atom) * 9 + 0 * 3 + 2) * n_grid + i_grid] = combined_factor * (2 * F_1 * dFix * dFjz - d2Fixjz);
+ d2Sii[((i_atom * n_atom + j_atom) * 9 + 1 * 3 ) * n_grid + i_grid] = combined_factor * (2 * F_1 * dFiy * dFjx - d2Fiyjx);
+ d2Sii[((i_atom * n_atom + j_atom) * 9 + 1 * 3 + 1) * n_grid + i_grid] = combined_factor * (2 * F_1 * dFiy * dFjy - d2Fiyjy);
+ d2Sii[((i_atom * n_atom + j_atom) * 9 + 1 * 3 + 2) * n_grid + i_grid] = combined_factor * (2 * F_1 * dFiy * dFjz - d2Fiyjz);
+ d2Sii[((i_atom * n_atom + j_atom) * 9 + 2 * 3 ) * n_grid + i_grid] = combined_factor * (2 * F_1 * dFiz * dFjx - d2Fizjx);
+ d2Sii[((i_atom * n_atom + j_atom) * 9 + 2 * 3 + 1) * n_grid + i_grid] = combined_factor * (2 * F_1 * dFiz * dFjy - d2Fizjy);
+ d2Sii[((i_atom * n_atom + j_atom) * 9 + 2 * 3 + 2) * n_grid + i_grid] = combined_factor * (2 * F_1 * dFiz * dFjz - d2Fizjz);
+}
+
extern "C" {
int pcm_d_s(cudaStream_t stream, double *matrix_d, double *matrix_s,
const double *coords, const double *norm_vec, const double *r_vdw,
@@ -149,15 +270,47 @@ int pcm_d_s(cudaStream_t stream, double *matrix_d, double *matrix_s,
}
int pcm_dd_ds(cudaStream_t stream, double *matrix_dD, double *matrix_dS,
- const double *coords, const double *norm_vec, const double *r_vdw,
- const double *charge_exp, const double *switch_fun,
- int n)
+ const double *coords, const double *norm_vec,
+ const double *charge_exp,
+ int n)
{
int ntilex = (n + THREADS - 1) / THREADS;
int ntiley = (n + THREADS - 1) / THREADS;
dim3 threads(THREADS, THREADS);
dim3 blocks(ntilex, ntiley);
- _pcm_dD_dS<<>>(matrix_dD, matrix_dS, coords, norm_vec, r_vdw, charge_exp, switch_fun, n);
+ _pcm_dD_dS<<>>(matrix_dD, matrix_dS, coords, norm_vec, charge_exp, n);
+ cudaError_t err = cudaGetLastError();
+ if (err != cudaSuccess) {
+ return 1;
+ }
+ return 0;
+}
+
+int pcm_d2d_d2s(cudaStream_t stream, double *matrix_d2D, double *matrix_d2S,
+ const double *coords, const double *norm_vec,
+ const double *charge_exp,
+ int n)
+{
+ const int ntilex = (n + THREADS - 1) / THREADS;
+ const int ntiley = (n + THREADS - 1) / THREADS;
+ const dim3 threads(THREADS, THREADS);
+ const dim3 blocks(ntilex, ntiley);
+ _pcm_d2D_d2S<<>>(matrix_d2D, matrix_d2S, coords, norm_vec, charge_exp, n);
+ cudaError_t err = cudaGetLastError();
+ if (err != cudaSuccess) {
+ return 1;
+ }
+ return 0;
+}
+
+int pcm_d2f_to_d2sii(cudaStream_t stream, const double* F, const double* dF, const double* d2F, const double* charge_exp,
+ double* d2Sii, const int n_atom, const int n_grid)
+{
+ const int ntilex = (n_grid + THREADS - 1) / THREADS;
+ const int ntiley = (n_atom * n_atom + THREADS - 1) / THREADS;
+ const dim3 threads(THREADS, THREADS);
+ const dim3 blocks(ntilex, ntiley);
+ _pcm_d2F_to_d2Sii<<>>(F, dF, d2F, charge_exp, d2Sii, n_atom, n_grid);
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess) {
return 1;
diff --git a/gpu4pyscf/lib/utils.py b/gpu4pyscf/lib/utils.py
index 0b7c613f..5f38a29c 100644
--- a/gpu4pyscf/lib/utils.py
+++ b/gpu4pyscf/lib/utils.py
@@ -105,6 +105,7 @@ def device(obj):
def format_sys_info():
'''Format a list of system information for printing.'''
from cupyx._runtime import get_runtime_info
+ from gpu4pyscf.__config__ import num_devices, mem_fraction, props as device_props
pyscf_info = lib.repo_info(pyscf.__file__)
gpu4pyscf_info = lib.repo_info(os.path.join(__file__, '..', '..'))
@@ -112,7 +113,6 @@ def format_sys_info():
cuda_version = f"{cuda_version // 1000}.{(cuda_version % 1000) // 10}"
runtime_info = get_runtime_info()
- device_props = cupy.cuda.runtime.getDeviceProperties(0)
result = [
f'System: {platform.uname()} Threads {lib.num_threads()}',
f'Python {sys.version}',
@@ -134,6 +134,8 @@ def format_sys_info():
'Device info',
f' Device name {device_props["name"]}',
f' Device global memory {device_props["totalGlobalMem"] / 1024**3:.2f} GB',
+ f' CuPy memory fraction {mem_fraction}',
+ f' Num. Devices {num_devices}',
f'GPU4PySCF {gpu4pyscf.__version__}',
f'GPU4PySCF path {gpu4pyscf_info["path"]}'
]
diff --git a/gpu4pyscf/mp/dfmp2.py b/gpu4pyscf/mp/dfmp2.py
index 92652402..da398dcb 100644
--- a/gpu4pyscf/mp/dfmp2.py
+++ b/gpu4pyscf/mp/dfmp2.py
@@ -20,7 +20,7 @@
from gpu4pyscf.mp import mp2
from gpu4pyscf.lib import logger
from gpu4pyscf.lib.cupy_helper import contract, tag_array, reduce_to_device
-from gpu4pyscf.__config__ import _streams, _num_devices
+from gpu4pyscf.__config__ import _streams, num_devices
from pyscf import __config__
WITH_T2 = getattr(__config__, 'mp_dfmp2_with_t2', True)
@@ -45,8 +45,8 @@ def _dfmp2_tasks(mp, mo_coeff, mo_energy, device_id=0):
return Lov
def get_occ_blk(Lov_dist, i, nocc, nvir):
- occ_blk_dist = [None] * _num_devices
- for device_id in range(_num_devices):
+ occ_blk_dist = [None] * num_devices
+ for device_id in range(num_devices):
with cupy.cuda.Device(device_id), _streams[device_id]:
Lov = Lov_dist[device_id]
mat = cupy.dot(Lov[:,i*nvir:(i+1)*nvir].T,
@@ -73,8 +73,8 @@ def kernel(mp, mo_energy=None, mo_coeff=None, eris=None, with_t2=WITH_T2,
# Submit tasks to different devices
futures = []
- with ThreadPoolExecutor(max_workers=_num_devices) as executor:
- for device_id in range(_num_devices):
+ with ThreadPoolExecutor(max_workers=num_devices) as executor:
+ for device_id in range(num_devices):
future = executor.submit(_dfmp2_tasks, mp, mo_coeff, mo_energy,
device_id=device_id)
futures.append(future)
diff --git a/gpu4pyscf/pbc/df/aft.py b/gpu4pyscf/pbc/df/aft.py
index 5f9edc37..4bc4aa50 100644
--- a/gpu4pyscf/pbc/df/aft.py
+++ b/gpu4pyscf/pbc/df/aft.py
@@ -1,4 +1,4 @@
-# Copyright 2021-2024 The PySCF Developers. All Rights Reserved.
+# Copyright 2024-2025 The PySCF Developers. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -27,7 +27,6 @@
from pyscf.pbc.gto.pseudo import pp_int
from pyscf.pbc.lib.kpts_helper import is_zero
from pyscf.pbc.df import ft_ao
-from pyscf.pbc.df.aft import _check_kpts
from pyscf.pbc.tools import k2gamma
from gpu4pyscf.pbc.tools.pbc import get_coulG
from gpu4pyscf.pbc.df import aft_jk
@@ -201,3 +200,19 @@ def get_jk(self, dm, hermi=1, kpts=None, kpts_band=None,
to_gpu = utils.to_gpu
device = utils.device
to_cpu = utils.to_cpu
+
+def _check_kpts(mydf, kpts):
+    '''Check if the argument kpts is a single k-point'''
+    if kpts is None:
+        kpts = mydf.kpts
+        if kpts is None:
+            kpts = np.zeros((1, 3))
+            is_single_kpt = True
+        else:
+            kpts = np.asarray(kpts)
+            is_single_kpt = kpts.ndim == 1 or is_zero(kpts)
+    else:
+        kpts = np.asarray(kpts)
+        is_single_kpt = kpts.ndim == 1
+    kpts = kpts.reshape(-1,3)
+    return kpts, is_single_kpt
diff --git a/gpu4pyscf/pbc/df/aft_jk.py b/gpu4pyscf/pbc/df/aft_jk.py
index 225f97cb..040fc955 100644
--- a/gpu4pyscf/pbc/df/aft_jk.py
+++ b/gpu4pyscf/pbc/df/aft_jk.py
@@ -1,4 +1,4 @@
-# Copyright 2021-2024 The PySCF Developers. All Rights Reserved.
+# Copyright 2024-2025 The PySCF Developers. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
diff --git a/gpu4pyscf/pbc/df/df.py b/gpu4pyscf/pbc/df/df.py
index 756c94df..45d41f22 100644
--- a/gpu4pyscf/pbc/df/df.py
+++ b/gpu4pyscf/pbc/df/df.py
@@ -1,4 +1,4 @@
-# Copyright 2021-2024 The PySCF Developers. All Rights Reserved.
+# Copyright 2024-2025 The PySCF Developers. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -21,6 +21,7 @@
__all__ = ['GDF']
+import warnings
import ctypes
import tempfile
import numpy as np
@@ -28,20 +29,19 @@
from pyscf import lib
from pyscf.pbc.df import aft as aft_cpu
from pyscf.pbc.df import df as df_cpu
-from pyscf.pbc.df.aft import _check_kpts
from pyscf.pbc.df.gdf_builder import libpbc
-from pyscf.pbc.lib.kpts_helper import is_zero, unique
-from pyscf.pbc.df.rsdf_builder import _RSGDFBuilder, _RSNucBuilder
+from pyscf.pbc.lib.kpts_helper import is_zero
from gpu4pyscf.lib import logger
-from gpu4pyscf.pbc.df import df_jk
-from gpu4pyscf.lib.cupy_helper import return_cupy_array, pack_tril, unpack_tril
+from gpu4pyscf.pbc.df import df_jk, rsdf_builder
+from gpu4pyscf.pbc.df.aft import _check_kpts
+from gpu4pyscf.pbc.tools.k2gamma import kpts_to_kmesh
+from gpu4pyscf.lib.cupy_helper import return_cupy_array, pack_tril, get_avail_mem
from gpu4pyscf.lib import utils
class GDF(lib.StreamObject):
'''Gaussian density fitting
'''
blockdim = df_cpu.GDF.blockdim
- _dataname = 'j3c'
_prefer_ccdf = False
force_dm_kbuild = False
@@ -56,51 +56,25 @@ class GDF(lib.StreamObject):
reset = df_cpu.GDF.reset
dump_flags = df_cpu.GDF.dump_flags
- def build(self, j_only=None, with_j3c=True, kpts_band=None):
+ def build(self, j_only=None, kpts_band=None):
+ warnings.warn(
+ 'PBC.df is currently experimental and subject to significant changes.')
if j_only is not None:
self._j_only = j_only
- if self.kpts_band is not None:
- self.kpts_band = np.reshape(self.kpts_band, (-1,3))
- assert kpts_band is None
+ assert kpts_band is None and self.kpts_band is None
self.check_sanity()
self.dump_flags()
+ cell = self.cell
+ auxcell = df_cpu.make_auxcell(cell, self.auxbasis, self.exp_to_discard)
+ self.auxcell = auxcell
- self.auxcell = df_cpu.make_auxcell(self.cell, self.auxbasis,
- self.exp_to_discard)
-
- if with_j3c and self._cderi_to_save is not None:
- if isinstance(self._cderi_to_save, str):
- cderi = self._cderi_to_save
- else:
- cderi = self._cderi_to_save.name
- self._cderi = cderi
- t1 = (logger.process_clock(), logger.perf_counter())
- self._make_j3c(self.cell, self.auxcell, None, cderi)
- t1 = logger.timer_debug1(self, 'j3c', *t1)
+ t1 = (logger.process_clock(), logger.perf_counter())
+ self._cderi, self._cderip = rsdf_builder.build_cderi(
+ cell, auxcell, self.kpts, j_only=j_only)
+ t1 = logger.timer_debug1(self, 'j3c', *t1)
return self
- def _make_j3c(self, cell=None, auxcell=None, kptij_lst=None, cderi_file=None):
- if cell is None: cell = self.cell
- if auxcell is None: auxcell = self.auxcell
- if cderi_file is None: cderi_file = self._cderi_to_save
-
- # Remove duplicated k-points. Duplicated kpts may lead to a buffer
- # located in incore.wrap_int3c larger than necessary. Integral code
- # only fills necessary part of the buffer, leaving some space in the
- # buffer unfilled.
- if self.kpts_band is None:
- kpts_union = self.kpts
- else:
- kpts_union = unique(np.vstack([self.kpts, self.kpts_band]))[0]
-
- dfbuilder = _RSGDFBuilder(cell, auxcell, kpts_union)
- dfbuilder.mesh = self.mesh
- dfbuilder.linear_dep_threshold = self.linear_dep_threshold
- j_only = self._j_only or len(kpts_union) == 1
- dfbuilder.make_j3c(cderi_file, j_only=j_only, dataname=self._dataname,
- kptij_lst=kptij_lst)
-
has_kpts = df_cpu.GDF.has_kpts
weighted_coulG = return_cupy_array(aft_cpu.weighted_coulG)
pw_loop = NotImplemented
@@ -108,48 +82,72 @@ def _make_j3c(self, cell=None, auxcell=None, kptij_lst=None, cderi_file=None):
get_naoaux = df_cpu.GDF.get_naoaux
range_coulomb = aft_cpu.AFTDFMixin.range_coulomb
- def sr_loop(self, kpti_kptj=np.zeros((2,3)), max_memory=2000,
- compact=True, blksize=None, aux_slice=None):
- '''Short range part'''
- assert aux_slice is None
+ def sr_loop(self, ki, kj, compact=True, blksize=None):
+ '''Iterator for the 3-index cderi tensor over the auxiliary dimension'''
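+ # Yields (Lpq, sign) blocks; sign is -1 only for the negative-definite
+ # part of the Coulomb kernel that arises for 2D cells (handled below).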
if self._cderi is None:
self.build()
cell = self.cell
- kpti, kptj = kpti_kptj
- unpack = is_zero(kpti-kptj) and not compact
nao = cell.nao
if blksize is None:
- blksize = max_memory*1e6/16/(nao**2*2)
- blksize /= 2 # For prefetch
- blksize = max(16, min(int(blksize), self.blockdim))
- logger.debug2(self, 'max_memory %d MB, blksize %d', max_memory, blksize)
-
- def load(aux_slice):
- b0, b1 = aux_slice
- naux = b1 - b0
- Lpq = cp.asarray(j3c[b0:b1])
- if compact and Lpq.shape[1] == nao**2:
- Lpq = pack_tril(Lpq.reshape(naux, nao, nao))
- elif unpack and Lpq.shape[1] != nao**2:
- Lpq = unpack_tril(Lpq)
- return Lpq
-
- with df_cpu._load3c(self._cderi, self._dataname, kpti_kptj) as j3c:
- slices = lib.prange(0, j3c.shape[0], blksize)
- for Lpq in lib.map_with_prefetch(load, slices):
- yield Lpq, 1
-
- if cell.dimension == 2 and cell.low_dim_ft_type != 'inf_vacuum':
- # Truncated Coulomb operator is not positive definite. Load the
- # CDERI tensor of negative part.
- with df_cpu._load3c(self._cderi, self._dataname+'-', kpti_kptj,
- ignore_key_error=True) as j3c:
- slices = lib.prange(0, j3c.shape[0], blksize)
- for Lpq in lib.map_with_prefetch(load, slices):
- yield Lpq, -1
-
- get_pp = return_cupy_array(df_cpu.GDF.get_pp)
- get_nuc = return_cupy_array(df_cpu.GDF.get_nuc)
+ avail_mem = get_avail_mem() * .8
+ blksize = avail_mem/16/(nao**2*3)
+ if blksize < 16:
+ raise RuntimeError('Insufficient GPU memory')
+ blksize = min(int(blksize), self.blockdim)
+ logger.debug2(self, 'max_memory %d MB, blksize %d', avail_mem*1e-6, blksize)
+
+ if (ki, kj) in self._cderi:
+ req_conj = False
+ elif (kj, ki) in self._cderi:
+ req_conj = True
+ else:
+ raise RuntimeError(f'CDERI for kpoints {ki},{kj} not generated')
+
+ Lpq_kij = self._cderi[ki,kj]
+ naux = len(Lpq_kij)
+ for b0, b1 in lib.prange(0, naux, blksize):
+ if req_conj:
+ Lpq = Lpq_kij[b0:b1].transpose(0,2,1).conj()
+ else:
+ Lpq = Lpq_kij[b0:b1]
+ assert Lpq[0].size == nao**2
+ if compact:
+ Lpq = pack_tril(Lpq.reshape(-1, nao, nao))
+ yield Lpq, 1
+
+ if cell.dimension == 2:
+ assert cell.low_dim_ft_type != 'inf_vacuum'
+ Lpq_kij = self._cderip[ki,kj]
+ naux = len(Lpq_kij)
+ for b0, b1 in lib.prange(0, naux, blksize):
+ if req_conj:
+ Lpq = Lpq_kij[b0:b1].transpose(0,2,1).conj()
+ else:
+ Lpq = Lpq_kij[b0:b1]
+ assert Lpq[0].size == nao**2
+ if compact:
+ Lpq = pack_tril(Lpq.reshape(-1, nao, nao))
+ yield Lpq, -1
+
+ def get_pp(self, kpts=None):
+ kpts, is_single_kpt = _check_kpts(self, kpts)
+ if is_single_kpt and is_zero(kpts):
+ vpp = rsdf_builder.get_pp(self.cell)
+ else:
+ vpp = rsdf_builder.get_pp(self.cell, kpts)
+ if is_single_kpt:
+ vpp = vpp[0]
+ return vpp
+
+ def get_nuc(self, kpts=None):
+ kpts, is_single_kpt = _check_kpts(self, kpts)
+ if is_single_kpt and is_zero(kpts):
+ nuc = rsdf_builder.get_nuc(self.cell)
+ else:
+ nuc = rsdf_builder.get_nuc(self.cell, kpts)
+ if is_single_kpt:
+ nuc = nuc[0]
+ return nuc
# Note: Special exxdiv by default should not be used for an arbitrary
# input density matrix. When the df object was used with the molecular
diff --git a/gpu4pyscf/pbc/df/df_jk.py b/gpu4pyscf/pbc/df/df_jk.py
index bdaf2427..36ce3acf 100644
--- a/gpu4pyscf/pbc/df/df_jk.py
+++ b/gpu4pyscf/pbc/df/df_jk.py
@@ -1,4 +1,4 @@
-# Copyright 2021-2024 The PySCF Developers. All Rights Reserved.
+# Copyright 2024-2025 The PySCF Developers. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -26,7 +26,7 @@
from gpu4pyscf.lib.cupy_helper import contract, unpack_tril
from gpu4pyscf.pbc.df.fft_jk import _ewald_exxdiv_for_G0, _format_dms, _format_jks
-def density_fit(mf, auxbasis=None, mesh=None, with_df=None):
+def density_fit(mf, auxbasis=None, with_df=None):
'''Generate density-fitting SCF object
Args:
@@ -34,8 +34,6 @@ def density_fit(mf, auxbasis=None, mesh=None, with_df=None):
Same format to the input attribute mol.basis. If auxbasis is
None, auxiliary basis based on AO basis (if possible) or
even-tempered Gaussian basis will be used.
- mesh : tuple
- number of grids in each direction
with_df : DF object
'''
from gpu4pyscf.pbc.df.df import GDF
@@ -45,27 +43,21 @@ def density_fit(mf, auxbasis=None, mesh=None, with_df=None):
else:
kpts = np.reshape(mf.kpt, (1,3))
with_df = GDF(mf.cell, kpts)
- with_df.max_memory = mf.max_memory
with_df.stdout = mf.stdout
with_df.verbose = mf.verbose
with_df.auxbasis = auxbasis
- if mesh is not None:
- with_df.mesh = mesh
- mf = mf.copy()
+ mf = mf.copy().reset()
mf.with_df = with_df
- mf._eri = None
return mf
def get_j_kpts(mydf, dm_kpts, hermi=1, kpts=np.zeros((1,3)), kpts_band=None):
log = logger.new_logger(mydf)
t0 = log.init_timer()
- if mydf._cderi is None or not mydf.has_kpts(kpts_band):
- if mydf._cderi is not None:
- log.warn('DF integrals for band k-points were not found %s. '
- 'DF integrals will be rebuilt to include band k-points.',
- mydf._cderi)
+ assert kpts_band is None or kpts_band is kpts
+ assert mydf.has_kpts(kpts)
+ if mydf._cderi is None:
mydf.build(j_only=True, kpts_band=kpts_band)
t0 = log.timer_debug1('Init get_j_kpts', *t0)
@@ -83,11 +75,9 @@ def get_j_kpts(mydf, dm_kpts, hermi=1, kpts=np.zeros((1,3)), kpts_band=None):
nband = len(kpts_band)
rho = cp.zeros((nset,naux), dtype=np.complex128)
- max_memory = max(2000, (mydf.max_memory - lib.current_memory()[0]))
- for k, kpt in enumerate(kpts):
- kptii = np.asarray((kpt,kpt))
+ for k in range(nkpts):
p1 = 0
- for Lpq, sign in mydf.sr_loop(kptii, max_memory, False):
+ for Lpq, sign in mydf.sr_loop(k, k, False):
Lpq = Lpq.reshape(-1,nao,nao)
p0, p1 = p1, p1+Lpq.shape[0]
rho[:,p0:p1] += sign * contract('Lpq,xqp->xL', Lpq, dms[:,k])
@@ -102,9 +92,8 @@ def get_j_kpts(mydf, dm_kpts, hermi=1, kpts=np.zeros((1,3)), kpts_band=None):
vj = cp.zeros((nset,nband,nao_pair), dtype=np.complex128)
for k, kpt in enumerate(kpts_band):
- kptii = np.asarray((kpt,kpt))
p1 = 0
- for Lpq, sign in mydf.sr_loop(kptii, max_memory, aos2symm):
+ for Lpq, sign in mydf.sr_loop(k, k, aos2symm):
nrow = Lpq.shape[0]
p0, p1 = p1, p1+nrow
Lpq = Lpq.reshape(nrow, -1)
@@ -137,11 +126,9 @@ def get_k_kpts(mydf, dm_kpts, hermi=1, kpts=np.zeros((1,3)), kpts_band=None,
raise RuntimeError('GDF does not support exxdiv %s' % exxdiv)
t0 = (logger.process_clock(), logger.perf_counter())
- if mydf._cderi is None or not mydf.has_kpts(kpts_band):
- if mydf._cderi is not None:
- log.warn('DF integrals for band k-points were not found %s. '
- 'DF integrals will be rebuilt to include band k-points.',
- mydf._cderi)
+ assert kpts_band is None or kpts_band is kpts
+ assert mydf.has_kpts(kpts)
+ if mydf._cderi is None:
mydf.build(kpts_band=kpts_band)
t0 = log.timer_debug1('Init get_k_kpts', *t0)
@@ -186,12 +173,12 @@ def get_k_kpts(mydf, dm_kpts, hermi=1, kpts=np.zeros((1,3)), kpts_band=None,
# K_pq = ( p{k1} i{k2} | i{k2} q{k1} )
# input dm is not Hermitian/PSD --> build K from dm
log.debug2('get_k_kpts: build K from dm')
- max_memory = max(2000, mydf.max_memory-lib.current_memory()[0])
- def make_kpt(ki, kj, swap_2e, inverse_idx=None):
- kpti = kpts[ki]
- kptj = kpts_band[kj]
- #TODO: utilize kk_adapted_iter with time_reversal_symmetry, as that in aft_jk
- for Lpq, sign in mydf.sr_loop((kpti,kptj), max_memory, compact=False):
+ if mydf._cderi is None:
+ mydf.build()
+ def make_kpt(ki, kj, swap_2e):
+ if (ki, kj) not in mydf._cderi:
+ kj, ki = ki, kj
+ for Lpq, sign in mydf.sr_loop(ki, kj, compact=False):
Lpq = Lpq.reshape(-1, nao, nao)
tmp = contract('njk,Lkl->nLjl', dms[:,ki], Lpq)
if sign > 0:
@@ -207,23 +194,23 @@ def make_kpt(ki, kj, swap_2e, inverse_idx=None):
vk[:,ki] -= contract('nLki,Lji->nkj', tmp, Lpq.conj())
t1 = log.init_timer()
- if kpts_band is kpts: # normal k-points HF/DFT
- for ki in range(nkpts):
- for kj in range(ki):
- make_kpt(ki, kj, True)
- make_kpt(ki, ki, False)
- t1 = log.timer_debug1('get_k_kpts: make_kpt ki>=kj (%d,*)'%ki, *t1)
- else:
+ if kpts_band is not kpts: # only normal k-points HF/DFT is supported
raise NotImplementedError
-
- if exxdiv == 'ewald':
- _ewald_exxdiv_for_G0(cell, kpts, dms, vk, kpts_band)
+ #TODO: utilize kk_adapted_iter with time_reversal_symmetry, as that in aft_jk
+ for ki in range(nkpts):
+ for kj in range(ki):
+ make_kpt(ki, kj, True)
+ make_kpt(ki, ki, False)
+ t1 = log.timer_debug1('get_k_kpts: make_kpt ki>=kj (%d,*)'%ki, *t1)
if (is_zero(kpts) and is_zero(kpts_band) and
not np.iscomplexobj(dm_kpts)):
vk = vk.real
vk *= 1./nkpts
+ if exxdiv == 'ewald':
+ _ewald_exxdiv_for_G0(cell, kpts, dms, vk, kpts_band)
+
log.timer('get_k_kpts', *t0)
return _format_jks(vk, dm_kpts, input_band, kpts)
@@ -243,29 +230,17 @@ def get_jk(mydf, dm, hermi=1, kpt=np.zeros(3),
'''JK for given k-point'''
log = logger.new_logger(mydf)
t0 = log.init_timer()
- if mydf._cderi is None or not mydf.has_kpts(kpts_band):
- if mydf._cderi is not None:
- log.warn('DF integrals for band k-points were not found %s. '
- 'DF integrals will be rebuilt to include band k-points.',
- mydf._cderi)
+ assert is_zero(kpt)
+ assert kpts_band is None
+ if mydf._cderi is None:
mydf.build(j_only=not with_k, kpts_band=kpts_band)
t0 = log.timer_debug1('Init get_jk', *t0)
- vj = vk = None
- if kpts_band is not None and abs(kpt-kpts_band).sum() > 1e-9:
- kpt = np.reshape(kpt, (1,3))
- if with_k:
- vk = get_k_kpts(mydf, dm, hermi, kpt, kpts_band, exxdiv)
- if with_j:
- vj = get_j_kpts(mydf, dm, hermi, kpt, kpts_band)
- return vj, vk
-
cell = mydf.cell
- dm = np.asarray(dm, order='C')
+ dm = cp.asarray(dm, order='C')
dms = _format_dms(dm, [kpt])
nset, _, nao = dms.shape[:3]
dms = dms.reshape(nset,nao,nao)
- kptii = np.asarray((kpt,kpt))
if with_j:
vj = cp.zeros((nset,nao,nao), dtype=np.complex128)
if with_k:
@@ -294,9 +269,7 @@ def get_jk(mydf, dm, hermi=1, kpt=np.zeros(3),
'''
vk = cp.zeros((nset,nao,nao), dtype=np.complex128)
- mem_now = lib.current_memory()[0]
- max_memory = max(2000, (mydf.max_memory - mem_now))
- for Lpq, sign in mydf.sr_loop(kptii, max_memory, False):
+ for Lpq, sign in mydf.sr_loop(0, 0, False):
if with_j:
#:rho_coeff = np.einsum('Lpq,xqp->xL', Lpq, dms)
#:vj += np.dot(rho_coeff, Lpq.reshape(-1,nao**2))
diff --git a/gpu4pyscf/pbc/df/fft.py b/gpu4pyscf/pbc/df/fft.py
index 9d54b118..d074d9b3 100644
--- a/gpu4pyscf/pbc/df/fft.py
+++ b/gpu4pyscf/pbc/df/fft.py
@@ -1,4 +1,4 @@
-# Copyright 2021-2024 The PySCF Developers. All Rights Reserved.
+# Copyright 2024-2025 The PySCF Developers. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
diff --git a/gpu4pyscf/pbc/df/fft_jk.py b/gpu4pyscf/pbc/df/fft_jk.py
index dbf64378..1d17ed6d 100644
--- a/gpu4pyscf/pbc/df/fft_jk.py
+++ b/gpu4pyscf/pbc/df/fft_jk.py
@@ -1,4 +1,4 @@
-# Copyright 2021-2024 The PySCF Developers. All Rights Reserved.
+# Copyright 2024-2025 The PySCF Developers. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
diff --git a/gpu4pyscf/pbc/df/ft_ao.py b/gpu4pyscf/pbc/df/ft_ao.py
index d93678d6..cdd59951 100644
--- a/gpu4pyscf/pbc/df/ft_ao.py
+++ b/gpu4pyscf/pbc/df/ft_ao.py
@@ -1,4 +1,4 @@
-# Copyright 2021-2024 The PySCF Developers. All Rights Reserved.
+# Copyright 2024-2025 The PySCF Developers. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -26,7 +26,6 @@
from pyscf.gto.mole import ANG_OF, ATOM_OF, PTR_COORD
from pyscf.scf import _vhf
from pyscf.pbc import tools as pbctools
-from pyscf.pbc.gto.cell import _extract_pgto_params
from pyscf.pbc.tools import k2gamma
from pyscf.pbc.lib.kpts_helper import is_zero
from gpu4pyscf.pbc.tools.k2gamma import kpts_to_kmesh
@@ -36,6 +35,7 @@
from gpu4pyscf.scf.jk import (
g_pair_idx, _nearest_power2, _scale_sp_ctr_coeff, SHM_SIZE)
from gpu4pyscf.pbc.lib.kpts_helper import conj_images_in_bvk_cell
+from gpu4pyscf.pbc.gto.cell import extract_pgto_params
from gpu4pyscf.__config__ import props as gpu_specs
__all__ = [
@@ -43,8 +43,8 @@
]
libpbc = load_library('libpbc')
-libpbc.PBC_build_ft_ao.restype = ctypes.c_int
-libpbc.PBC_FT_init_constant.restype = ctypes.c_int
+libpbc.build_ft_ao.restype = ctypes.c_int
+libpbc.init_constant.restype = ctypes.c_int
LMAX = 4
GOUT_WIDTH = 19
@@ -71,27 +71,25 @@ def ft_ao(cell, Gv, shls_slice=None, b=None,
gxyz=None, Gvbase=None, kpt=np.zeros(3), verbose=None):
from pyscf.pbc.df.ft_ao import ft_ao
out = ft_ao(cell, Gv, shls_slice, b, gxyz, Gvbase, kpt, verbose)
- return cp.asarray(out)
+ if out.flags.c_contiguous:
+ return cp.asarray(out)
+ else:
+ return cp.asarray(out, order='F')
-def _bas_overlap_mask(cell, bvkmesh_Ls, Ls, cutoff=None):
+def _bas_overlap_mask(cell, bvkmesh_Ls, Ls):
'''integral screening mask for basis product between cell and supmol'''
# consider only the most diffused component of a basis
- exps, cs = _extract_pgto_params(cell, 'min')
+ exps, cs = extract_pgto_params(cell, 'diffused')
ls = cell._bas[:,ANG_OF]
bas_coords = cp.asarray(cell.atom_coords()[cell._bas[:,ATOM_OF]])
- vol = cell.vol
- if cutoff is None:
- theta_ij = exps.min() / 2
- lattice_sum_factor = max(2*np.pi*cell.rcut/(vol*theta_ij), 1)
- cutoff = cell.precision/lattice_sum_factor * .1
- logger.debug(cell, 'Set ft_ao cutoff to %g', cutoff)
-
ls = cp.asarray(ls)
exps = cp.asarray(exps)
norm = cp.asarray(cs) * ((2*ls+1)/(4*np.pi))**.5
aij = exps[:,None] + exps
- theta = exps[:,None] * exps / aij
+ fi = exps[:,None] / aij
+ fj = exps[None,:] / aij
+ theta = exps[:,None] * fj
Ls = cp.asarray(Ls)
# rj format: (bvk_cell_id, bas_id, lattice_img_id)
@@ -100,16 +98,18 @@ def _bas_overlap_mask(cell, bvkmesh_Ls, Ls, cutoff=None):
dr = cp.linalg.norm(rirj, axis=4)
- dri = exps[None,None,:,None]/aij[:,None,:,None] * dr
- drj = exps[:,None,None,None]/aij[:,None,:,None] * dr
+ dri = fj[:,None,:,None] * dr
+ drj = fi[:,None,:,None] * dr
li = ls[:,None,None,None]
lj = ls[None,None,:,None]
fac_dri = (li * .5/aij[:,None,:,None] + dri**2) ** (li*.5)
fac_drj = (lj * .5/aij[:,None,:,None] + drj**2) ** (lj*.5)
- fl = 2*np.pi/vol * (dr/theta[:,None,:,None]) + 1.
+ rad = cell.vol**(-1./3) * dr + 1
+ surface = 4*np.pi * rad**2
+ fl = cp.where(surface > 1, surface, 1)
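+ # lattice-sum weight: images at distance dr are counted via the surface
+ # area of the shell they sit on (rad measured in units of the cell length)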
fac_norm = norm[:,None]*norm * (np.pi/aij)**1.5
ovlp = fac_norm[:,None,:,None] * cp.exp(-theta[:,None,:,None]*dr**2) * fac_dri * fac_drj * fl
- return ovlp > cutoff
+ return ovlp > cell.precision
def gen_ft_kernel(cell, kpts=None, verbose=None):
r'''
@@ -132,11 +132,12 @@ def __init__(self, cell, kpts=None, bvk_kmesh=None):
self.l_ctr_offsets = np.append(0, np.cumsum(l_ctr_counts))
self.coeff = cp.asarray(coeff, dtype=np.complex128)
- if kpts is not None and bvk_kmesh is None:
- bvk_kmesh = kpts_to_kmesh(cell, kpts)
-
- # create BVK super-cell
if bvk_kmesh is None:
+ if kpts is None or is_zero(kpts):
+ bvk_kmesh = np.ones(3, dtype=int)
+ else:
+ bvk_kmesh = kpts_to_kmesh(cell, kpts)
+ if np.prod(bvk_kmesh) == 1:
bvkcell = cell
else:
bvkcell = pbctools.super_cell(cell, bvk_kmesh, wrap_around=True)
@@ -169,7 +170,7 @@ def gen_ft_kernel(self, verbose=None):
Ls = Ls[cp.linalg.norm(Ls-.5, axis=1).argsort()]
if bvk_kmesh is None:
- bvkmesh_Ls = cp.zeros(3)
+ bvkmesh_Ls = cp.zeros((1, 3))
else:
bvkmesh_Ls = cp.asarray(
k2gamma.translation_vectors_for_kmesh(cell, bvk_kmesh, True))
@@ -209,7 +210,7 @@ def gen_ft_kernel(self, verbose=None):
conj_mapping = cp.asarray(conj_images_in_bvk_cell(bvk_kmesh), dtype=np.int32)
init_constant(cell)
- kern = libpbc.PBC_build_ft_ao
+ kern = libpbc.build_ft_ao
cp.cuda.Stream.null.synchronize()
log.timer_debug1('initialize ft_kern', *cput0)
@@ -270,7 +271,7 @@ def _ft_sub(Gv, q, kptjs, transform_ao=True):
cell._atm.ctypes, ctypes.c_int(cell.natm),
cell._bas.ctypes, ctypes.c_int(cell.nbas), cell._env.ctypes)
if err != 0:
- raise RuntimeError(f'PBC_build_ft_ao kernel for {ll_pattern} failed')
+ raise RuntimeError(f'build_ft_ao kernel for {ll_pattern} failed')
if log.verbose >= logger.DEBUG1:
t1, t1p = log.timer_debug1(f'processing {ll_pattern}', *t1), t1
if ll_pattern not in timing_collection:
@@ -290,24 +291,25 @@ def _ft_sub(Gv, q, kptjs, transform_ao=True):
#ix, iy = cp.tril_indices(nao, -1)
#for k, ck in enumerate(conj_mapping):
# out[iy,ix,ck] = out[ix,iy,k]
- err = libpbc.PBC_ft_aopair_fill_triu(
+ err = libpbc.ft_aopair_fill_triu(
ctypes.cast(out.data.ptr, ctypes.c_void_p),
ctypes.cast(conj_mapping.data.ptr, ctypes.c_void_p),
ctypes.c_int(nao), ctypes.c_int(bvk_ncells), ctypes.c_int(nGv))
if err != 0:
- raise RuntimeError('PBC_ft_aopair_fill_triu kernel failed')
+ raise RuntimeError('ft_aopair_fill_triu kernel failed')
log.debug1('transform BvK-cell to k-points')
- if kptjs is not None:
+ gamma_point_only = kptjs is None or is_zero(kptjs)
+ if not gamma_point_only:
kptjs = cp.asarray(kptjs, order='C').reshape(-1,3)
expLk = cp.exp(1j*cp.dot(bvkmesh_Ls, kptjs.T))
- out = contract('Lk,LpqG->kGpq', expLk, out)
+ out = contract('Lk,LpqG->kpqG', expLk, out)
if transform_ao:
log.debug1('transform basis')
#:out = einsum('pqLG,pi,qj->LGij', out, coeff, coeff)
- out = contract('kGpq,qj->kGpj', out, coeff)
- out = contract('kGpj,pi->kGij', out, coeff)
+ out = contract('kpqG,pi->kiqG', out, coeff)
+ out = contract('kiqG,qj->kijG', out, coeff)
log.timer('ft_aopair', *cput0)
return out
@@ -323,7 +325,7 @@ def ft_kernel(Gv, q=np.zeros(3), kptjs=kpts, transform_ao=True):
avail_mem = get_avail_mem()
if 2*out_size < avail_mem * .8:
- return _ft_sub(Gv, q, kptjs, transform_ao)
+ return _ft_sub(Gv, q, kptjs, transform_ao).transpose(0,3,1,2)
elif out_size < avail_mem * .8:
if kptjs is None:
@@ -332,16 +334,16 @@ def ft_kernel(Gv, q=np.zeros(3), kptjs=kpts, transform_ao=True):
kptjs = kptjs.reshape(-1, 3)
nkpts = len(kptjs)
if transform_ao:
- out = cp.empty((nkpts, nGv, nao_orig, nao_orig), dtype=np.complex128)
+ out = cp.empty((nkpts, nao_orig, nao_orig, nGv), dtype=np.complex128)
else:
- out = cp.empty((nkpts, nGv, nao, nao), dtype=np.complex128)
+ out = cp.empty((nkpts, nao, nao, nGv), dtype=np.complex128)
Gv_block = int((avail_mem * .95 - out_size) / (2*nao**2*bvk_ncells*16))
Gv_block &= 0xfffffc
if Gv_block >= 4:
logger.debug1(cell, 'Processing ft_kernel in sub-blocks, Gv_block = %d', Gv_block)
for p0, p1 in lib.prange(0, nGv, Gv_block):
- out[:,p0:p1] = _ft_sub(Gv[p0:p1], q, kptjs, transform_ao)
- return out
+ out[:,:,:,p0:p1] = _ft_sub(Gv[p0:p1], q, kptjs, transform_ao)
+ return out.transpose(0,3,1,2)
raise RuntimeError('Not enough GPU memory. '
f'Available: {avail_mem*1e-9:.2f} GB. '
@@ -365,7 +367,7 @@ class AFTIntEnvVars(ctypes.Structure):
def init_constant(cell):
g_idx, offsets = g_pair_idx()
- err = libpbc.PBC_FT_init_constant(
+ err = libpbc.init_constant(
g_idx.ctypes, offsets.ctypes, cell._env.ctypes, ctypes.c_int(cell._env.size),
ctypes.c_int(SHM_SIZE))
if err != 0:
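Taken together, the ft_ao.py hunks above rename the C entry points (dropping the PBC_ prefixes) and keep the G-vector axis innermost during the contractions, transposing back on return so the public layout remains (nkpts, nGv, nao, nao). A minimal usage sketch of the kernel factory; the cell geometry here is an assumption:

    import numpy as np
    import pyscf
    from gpu4pyscf.pbc.df import ft_ao

    cell = pyscf.M(atom='He 1. .5 .5; He .1 1.3 2.1',
                   basis='ccpvdz', a=np.eye(3)*3.0)
    kpts = cell.make_kpts([2, 1, 1])
    Gv = cell.get_Gv(cell.mesh)

    ft_opt = ft_ao.FTOpt(cell, kpts=kpts)
    ft_kern = ft_opt.gen_ft_kernel()
    # FT of AO pair products at q=0 for all kptjs; shape (nkpts, nGv, nao, nao)
    pqG = ft_kern(Gv, np.zeros(3), kpts)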
diff --git a/gpu4pyscf/pbc/df/int3c2e.py b/gpu4pyscf/pbc/df/int3c2e.py
new file mode 100644
index 00000000..f92b6ef6
--- /dev/null
+++ b/gpu4pyscf/pbc/df/int3c2e.py
@@ -0,0 +1,482 @@
+# Copyright 2024-2025 The PySCF Developers. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+'''
+Periodic 3-center 2-electron short-range Coulomb integral helper functions
+'''
+
+import ctypes
+import math
+import numpy as np
+import cupy as cp
+from pyscf import lib
+from pyscf.lib.parameters import ANGULAR
+from pyscf.gto.mole import ANG_OF, ATOM_OF, PTR_COORD, PTR_EXP, conc_env
+from pyscf.pbc import tools as pbctools
+from pyscf.pbc.tools import k2gamma
+from pyscf.pbc.lib.kpts_helper import is_zero
+from gpu4pyscf.pbc.tools.k2gamma import kpts_to_kmesh
+from gpu4pyscf.lib import logger
+from gpu4pyscf.lib.cupy_helper import contract
+from gpu4pyscf.gto.mole import group_basis, PTR_BAS_COORD
+from gpu4pyscf.scf.jk import _nearest_power2, _scale_sp_ctr_coeff, SHM_SIZE
+from gpu4pyscf.pbc.gto.cell import extract_pgto_params
+from gpu4pyscf.pbc.df.ft_ao import libpbc, init_constant
+
+__all__ = [
+ 'sr_aux_e2',
+]
+
+libpbc.fill_int3c2e.restype = ctypes.c_int
+
+LMAX = 4
+L_AUX_MAX = 6
+GOUT_WIDTH = 45
+THREADS = 256
+BVK_CELL_SHELLS = 2400
+
+def sr_aux_e2(cell, auxcell, omega, kpts=None, bvk_kmesh=None, j_only=False):
+ r'''
+ Short-range 3-center integrals (ij|k). The auxiliary basis functions are
+ placed at the second electron.
+ '''
+ if bvk_kmesh is None and kpts is not None:
+ if j_only:
+            # Coulomb integrals need a smaller kmesh to converge finite-size effects
+            bvk_kmesh = kpts_to_kmesh(cell, kpts)
+        else:
+            # Remote images can contribute for certain k-point meshes, adding
+            # to the finite-size error of the exchange matrix.
+ rcut = estimate_rcut(cell, auxcell, omega).max()
+ bvk_kmesh = kpts_to_kmesh(cell, kpts, rcut=rcut)
+ bvk_kmesh, bvk_kmesh_inp = guess_bvk_kmesh(cell, bvk_kmesh), bvk_kmesh
+ logger.debug(cell, 'BvK input %s, set to %s for sr_aux_e2', bvk_kmesh_inp, bvk_kmesh)
+ int3c2e_opt = SRInt3c2eOpt(cell, auxcell, omega, bvk_kmesh)
+ nao, nao_orig = int3c2e_opt.coeff.shape
+ naux = int3c2e_opt.aux_coeff.shape[0]
+
+ gamma_point = kpts is None or (kpts.ndim == 1 and is_zero(kpts))
+ if gamma_point:
+ out = cp.zeros((nao, nao, naux))
+ else:
+ kpts = np.asarray(kpts).reshape(-1, 3)
+ expLk = cp.exp(1j*cp.asarray(int3c2e_opt.bvkmesh_Ls.dot(kpts.T)))
+ nL, nkpts = expLk.shape
+ if j_only:
+ expLLk = contract('Lk,Mk->LMk', expLk.conj(), expLk)
+ expLLk = expLLk.view(np.float64).reshape(nL,nL,nkpts,2)
+ out = cp.zeros((nkpts, nao, nao, naux), dtype=np.complex128)
+ else:
+ out = cp.zeros((nkpts, nkpts, nao, nao, naux), dtype=np.complex128)
+
+ ao_loc = int3c2e_opt.sorted_cell.ao_loc
+ aux_loc = int3c2e_opt.sorted_auxcell.ao_loc
+
+ for shls_slice, eri3c in int3c2e_opt.int3c2e_kernel():
+ i0, i1, j0, j1 = ao_loc[list(shls_slice[:4])]
+ k0, k1 = aux_loc[list(shls_slice[4:])]
+ if gamma_point:
+ out[i0:i1,j0:j1,k0:k1] = tmp = eri3c.sum(axis=(0,2))
+ if i0 != j0:
+ out[j0:j1,i0:i1,k0:k1] = tmp.transpose(1,0,2)
+ elif j_only:
+ tmp = contract('LMkz,LpMqr->kpqrz', expLLk, eri3c)
+ tmp = tmp.view(np.complex128)[...,0]
+ out[:,i0:i1,j0:j1,k0:k1] = tmp
+ if i0 != j0:
+ out[:,j0:j1,i0:i1,k0:k1] = tmp.transpose(0,2,1,3).conj()
+ else:
+ expLkz = expLk.view(np.float64).reshape(nL,nkpts,2)
+ tmp = contract('Lkz,MpLqr->Mkpqrz', expLkz, eri3c)
+ tmp = tmp.view(np.complex128)[...,0]
+ tmp = contract('Mk,Mlpqr->klpqr', expLk.conj(), tmp)
+ out[:,:,i0:i1,j0:j1,k0:k1] = tmp
+ if i0 != j0:
+ out[:,:,j0:j1,i0:i1,k0:k1] = tmp.transpose(1,0,3,2,4).conj()
+ tmp = None
+
+    if gamma_point:
+ out = contract('pqr,rk->pqk', out, int3c2e_opt.aux_coeff)
+ out = contract('pqk,qj->pjk', out, int3c2e_opt.coeff)
+ out = contract('pjk,pi->ijk', out, int3c2e_opt.coeff)
+ elif j_only:
+ #:out = einsum('MpNqr,pi,qj,rk->MiNjk', out, coeff, coeff, auxcoeff)
+ out = contract('Npqr,rk->Npqk', out, int3c2e_opt.aux_coeff)
+ out = contract('Npqk,qj->Npjk', out, int3c2e_opt.coeff)
+ out = contract('Npjk,pi->Nijk', out, int3c2e_opt.coeff)
+ else:
+ #:out = einsum('MpNqr,pi,qj,rk->MiNjk', out, coeff, coeff, auxcoeff)
+ out = contract('MNpqr,rk->MNpqk', out, int3c2e_opt.aux_coeff)
+ out = contract('MNpqk,qj->MNpjk', out, int3c2e_opt.coeff)
+ out = contract('MNpjk,pi->MNijk', out, int3c2e_opt.coeff)
+ return out
+
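A minimal usage sketch of sr_aux_e2, mirroring the unit tests added later in this patch; the toy auxiliary basis is an assumption:

    import numpy as np
    import pyscf
    from gpu4pyscf.pbc.df.int3c2e import sr_aux_e2

    cell = pyscf.M(atom='H 1.3 .2 .3; H .19 .1 1.1',
                   basis='ccpvdz', a=np.diag([2.5, 1.9, 2.2])*4)
    auxcell = cell.copy()
    auxcell.basis = [[0, [1.1, 1.]], [1, [0.7, 1.]]]  # toy auxiliary basis
    auxcell.build()
    kpts = cell.make_kpts([2, 1, 1])
    # omega < 0 selects the short-range part; result is (nkpts, nkpts, nao, nao, naux)
    j3c = sr_aux_e2(cell, auxcell, -0.2, kpts)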
+def create_img_idx(cell, bvkcell, auxcell, Ls, int3c2e_envs):
+ '''integral screening'''
+ # consider only the most diffused component of a basis
+ exps, cs = extract_pgto_params(cell, 'diffused')
+ ls = cell._bas[:,ANG_OF]
+ exps = cp.asarray(exps, dtype=np.float32)
+ log_cs = np.log(np.abs(cs * ((2*ls+1)/(4*np.pi))**.5))
+ log_cs = cp.asarray(log_cs, np.float32)
+ nbas = cell.nbas
+ nk = bvkcell.nbas // nbas
+
+ # Search the most diffused functions on each atom
+ aux_exps, aux_cs = extract_pgto_params(auxcell, 'diffused')
+ aux_ls = auxcell._bas[:,ANG_OF]
+ r2_aux = np.log(aux_cs**2 / cell.precision * 10**aux_ls) / aux_exps
+    atoms = auxcell._bas[:,ATOM_OF]
+    atom_aux_exps = cp.full(cell.natm, 1e8, dtype=np.float32)
+ for ia in range(cell.natm):
+ bas_mask = atoms == ia
+ es = aux_exps[bas_mask]
+ if len(es) > 0:
+ atom_aux_exps[ia] = es[r2_aux[bas_mask].argmax()]
+
+ def gen_img_idx(ish0, ish1, jsh0, jsh1):
+ nish = ish1 - ish0
+ njsh = jsh1 - jsh0
+ #TODO: only tril part when i == j
+ ij_pairs = nk * nish * nk * njsh
+ img_counts = cp.zeros(ij_pairs, dtype=np.int32)
+ err = libpbc.int3c2e_img_counts(
+ ctypes.cast(img_counts.data.ptr, ctypes.c_void_p),
+ ctypes.byref(int3c2e_envs),
+ (ctypes.c_int*4)(ish0, ish1, jsh0, jsh1),
+ ctypes.cast(exps.data.ptr, ctypes.c_void_p),
+ ctypes.cast(log_cs.data.ptr, ctypes.c_void_p),
+ ctypes.cast(atom_aux_exps.data.ptr, ctypes.c_void_p),
+ ctypes.c_int(nk), ctypes.c_int(cell.natm))
+ if err != 0:
+ raise RuntimeError('int3c2e_img_counts failed')
+
+ remaining_idx = np.nonzero(img_counts > 0)[0]
+ remaining_idx = remaining_idx[img_counts[remaining_idx].argsort()[::-1]]
+ remaining_idx = cp.asarray(remaining_idx, dtype=np.int32, order='C')
+ ij_pairs = remaining_idx.size
+ img_offsets = cp.empty(ij_pairs+1, dtype=np.int32)
+ cp.cumsum(img_counts[remaining_idx], out=img_offsets[1:])
+ img_offsets[0] = 0
+
+ img_idx = cp.empty(int(img_offsets[-1]), dtype=np.int32)
+ err = libpbc.int3c2e_img_idx(
+ ctypes.cast(img_idx.data.ptr, ctypes.c_void_p),
+ ctypes.cast(img_offsets.data.ptr, ctypes.c_void_p),
+ ctypes.cast(remaining_idx.data.ptr, ctypes.c_void_p),
+ ctypes.c_int(ij_pairs),
+ ctypes.byref(int3c2e_envs),
+ (ctypes.c_int*4)(ish0, ish1, jsh0, jsh1),
+ ctypes.cast(exps.data.ptr, ctypes.c_void_p),
+ ctypes.cast(log_cs.data.ptr, ctypes.c_void_p),
+ ctypes.cast(atom_aux_exps.data.ptr, ctypes.c_void_p),
+ ctypes.c_int(nk), ctypes.c_int(cell.natm))
+ if err != 0:
+ raise RuntimeError('int3c2e_img_idx failed')
+
+ Ki, i, Kj, j = cp.unravel_index(remaining_idx, (nk, nish, nk, njsh))
+ i += ish0
+ j += jsh0
+ # one-dimensional indices corresponding to [Ki,i,Kj,j]
+ bas_ij = cp.ravel_multi_index((Ki, i, Kj, j), (nk, nbas, nk, nbas))
+ bas_ij = cp.asarray(bas_ij, dtype=np.int32)
+ return img_idx, img_offsets, bas_ij
+ return gen_img_idx
+
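The [Ki,i,Kj,j] packing above flattens a shell-pair grid over BvK images into a single int32 index for the CUDA kernels. A small sketch of the round trip; the shapes are assumptions:

    import cupy as cp

    nk, nish, njsh, nbas = 2, 3, 4, 6
    flat = cp.arange(nk * nish * nk * njsh)               # screening-grid indices
    Ki, i, Kj, j = cp.unravel_index(flat, (nk, nish, nk, njsh))
    # re-pack against the full shell dimension nbas of the cell
    bas_ij = cp.ravel_multi_index((Ki, i, Kj, j), (nk, nbas, nk, nbas))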
+class SRInt3c2eOpt:
+ def __init__(self, cell, auxcell, omega, bvk_kmesh=None):
+ assert omega < 0
+ self.omega = omega
+
+ self.cell = cell
+ cell, coeff, uniq_l_ctr, l_ctr_counts = group_basis(cell, tile=1)
+ self.sorted_cell = cell
+ self.uniq_l_ctr = uniq_l_ctr
+ self.l_ctr_offsets = np.append(0, np.cumsum(l_ctr_counts))
+ self.coeff = cp.asarray(coeff)
+ self.sorted_cell.omega = omega
+
+ self.auxcell = auxcell
+ auxcell, coeff, uniq_l_ctr, l_ctr_counts = group_basis(auxcell, tile=1)
+ self.sorted_auxcell = auxcell
+ self.uniq_l_ctr_aux = uniq_l_ctr
+ self.l_ctr_aux_offsets = np.append(0, np.cumsum(l_ctr_counts))
+ self.aux_coeff = cp.asarray(coeff)
+ self.sorted_auxcell.omega = omega
+
+ if bvk_kmesh is None:
+ bvk_kmesh = np.ones(3, dtype=int)
+ self.bvk_kmesh = bvk_kmesh
+ self.bvkmesh_Ls = k2gamma.translation_vectors_for_kmesh(cell, bvk_kmesh, True)
+
+ if np.prod(bvk_kmesh) == 1:
+ bvkcell = cell
+ else:
+ bvkcell = pbctools.super_cell(cell, bvk_kmesh, wrap_around=True)
+            # PTR_BAS_COORD was not initialized in pbctools.super_cell
+ bvkcell._bas[:,PTR_BAS_COORD] = bvkcell._atm[bvkcell._bas[:,ATOM_OF],PTR_COORD]
+ self.bvkcell = bvkcell
+
+ def int3c2e_kernel(self, cutoff=None, verbose=None):
+ cell = self.sorted_cell
+ auxcell = self.sorted_auxcell
+ uniq_l_ctr = self.uniq_l_ctr
+ l_ctr_offsets = self.l_ctr_offsets
+ l_ctr_aux_offsets = self.l_ctr_aux_offsets
+ bvkcell = self.bvkcell
+
+ log = logger.new_logger(cell, verbose)
+ cput0 = log.init_timer()
+ rcut = estimate_rcut(cell, auxcell, self.omega).max()
+ Ls = cp.asarray(bvkcell.get_lattice_Ls(rcut=rcut))
+ Ls = Ls[cp.linalg.norm(Ls-.5, axis=1).argsort()]
+ nimgs = len(Ls)
+ log.debug('int3c2e_kernel rcut = %g, nimgs = %d', rcut, nimgs)
+
+ if cutoff is None:
+ omega = cell.omega
+ aux_exp, _, aux_l = most_diffused_pgto(auxcell)
+ cell_exp, _, cell_l = most_diffused_pgto(cell)
+ if omega == 0:
+ theta = 1./(1./cell_exp + 1./aux_exp)
+ else:
+ theta = 1./(1./cell_exp + 1./aux_exp + omega**-2)
+ lsum = cell_l * 2 + aux_l + 1
+ rad = cell.vol**(-1./3) * rcut + 1
+ surface = 4*np.pi * rad**2
+ lattice_sum_factor = 2*np.pi*rcut*lsum/(cell.vol*theta) + surface
+ cutoff = cell.precision / lattice_sum_factor
+ log.debug1('int3c_kernel integral omega=%g theta=%g cutoff=%g',
+ omega, theta, cutoff)
+
+ _atm_cpu, _bas_cpu, _env_cpu = conc_env(
+ bvkcell._atm, bvkcell._bas, _scale_sp_ctr_coeff(bvkcell),
+ auxcell._atm, auxcell._bas, _scale_sp_ctr_coeff(auxcell))
+ #NOTE: PTR_BAS_COORD is not updated in conc_env()
+ off = _bas_cpu[bvkcell.nbas,PTR_EXP] - auxcell._bas[0,PTR_EXP]
+ _bas_cpu[bvkcell.nbas:,PTR_BAS_COORD] += off
+
+ bvk_ao_loc = bvkcell.ao_loc
+ aux_loc = auxcell.ao_loc
+
+ _atm = cp.array(_atm_cpu, dtype=np.int32)
+ _bas = cp.array(_bas_cpu, dtype=np.int32)
+ _env = cp.array(_env_cpu, dtype=np.float64)
+ ao_loc = _conc_locs(bvk_ao_loc, aux_loc)
+ bvk_ncells = bvkcell.nbas // cell.nbas
+ int3c2e_envs = Int3c2eEnvVars(
+ cell.natm, cell.nbas, bvk_ncells, nimgs,
+ _atm.data.ptr, _bas.data.ptr, _env.data.ptr, ao_loc.data.ptr,
+ Ls.data.ptr, math.log(cutoff),
+ )
+ # Keep a reference to these arrays, prevent releasing them upon returning the closure
+ int3c2e_envs._env_ref_holder = (_atm, _bas, _env, ao_loc, Ls)
+
+ gen_img_idx = create_img_idx(cell, bvkcell, auxcell, Ls, int3c2e_envs)
+
+ uniq_l = uniq_l_ctr[:,0]
+ n_groups = np.count_nonzero(uniq_l <= LMAX)
+ init_constant(cell)
+ kern = libpbc.fill_int3c2e
+ cp.cuda.Stream.null.synchronize()
+ t1 = log.timer_debug1('initialize int3c2e_kernel', *cput0)
+ timing_collection = {}
+ kern_counts = 0
+
+ cell_ao_loc = cell.ao_loc
+ di = (cell_ao_loc[l_ctr_offsets[1:]] - cell_ao_loc[l_ctr_offsets[:-1]]).max()
+ dk = (aux_loc[l_ctr_aux_offsets[1:]] - aux_loc[l_ctr_aux_offsets[:-1]]).max()
+        buf = cp.empty((bvk_ncells, di, bvk_ncells, di, dk))
+
+ ij_tasks = ((i, j) for i in range(n_groups) for j in range(i+1))
+ for i, j in ij_tasks:
+ li = uniq_l[i]
+ lj = uniq_l[j]
+ ish0, ish1 = l_ctr_offsets[i], l_ctr_offsets[i+1]
+ jsh0, jsh1 = l_ctr_offsets[j], l_ctr_offsets[j+1]
+ nrow = bvk_ao_loc[ish1] - bvk_ao_loc[ish0]
+ ncol = bvk_ao_loc[jsh1] - bvk_ao_loc[jsh0]
+ img_idx, img_offsets, bas_ij_idx = gen_img_idx(ish0, ish1, jsh0, jsh1)
+
+ for k, lk in enumerate(self.uniq_l_ctr_aux[:,0]):
+ ksh0, ksh1 = l_ctr_aux_offsets[k:k+2]
+ naux = aux_loc[ksh1] - aux_loc[ksh0]
+ shls_slice = ish0, ish1, jsh0, jsh1, ksh0, ksh1
+ eri3c = cp.ndarray((bvk_ncells, nrow, bvk_ncells, ncol, naux),
+ dtype=np.float64, memptr=buf.data)
+ eri3c.fill(0.)
+ lll = f'({ANGULAR[li]}{ANGULAR[lj]}|{ANGULAR[lk]})'
+ scheme = int3c2e_scheme(li, lj, lk)
+ log.debug2('int3c2e_scheme for %s: %s', lll, scheme)
+ err = kern(
+ ctypes.cast(eri3c.data.ptr, ctypes.c_void_p),
+ ctypes.byref(int3c2e_envs), (ctypes.c_int*3)(*scheme),
+ (ctypes.c_int*6)(*shls_slice),
+ ctypes.c_int(bvk_ncells), ctypes.c_int(nrow),
+ ctypes.c_int(ncol), ctypes.c_int(naux),
+ ctypes.c_int(bas_ij_idx.size),
+ ctypes.cast(bas_ij_idx.data.ptr, ctypes.c_void_p),
+ ctypes.cast(img_idx.data.ptr, ctypes.c_void_p),
+ ctypes.cast(img_offsets.data.ptr, ctypes.c_void_p),
+ _atm_cpu.ctypes, ctypes.c_int(bvkcell.natm),
+ _bas_cpu.ctypes, ctypes.c_int(bvkcell.nbas), _env_cpu.ctypes)
+ if err != 0:
+ raise RuntimeError(f'fill_int3c2e kernel for {lll} failed')
+ if log.verbose >= logger.DEBUG1:
+ t1, t1p = log.timer_debug1(f'processing {lll}', *t1), t1
+ if lll not in timing_collection:
+ timing_collection[lll] = 0
+ timing_collection[lll] += t1[1] - t1p[1]
+ kern_counts += 1
+ yield shls_slice, eri3c
+
+ if log.verbose >= logger.DEBUG1:
+ log.timer('int3c2e', *cput0)
+ log.debug1('kernel launches %d', kern_counts)
+ for lll, t in timing_collection.items():
+ log.debug1('%s wall time %.2f', lll, t)
+
+class Int3c2eEnvVars(ctypes.Structure):
+ _fields_ = [
+ ('cell0_natm', ctypes.c_uint16),
+ ('cell0_nbas', ctypes.c_uint16),
+ ('bvk_ncells', ctypes.c_uint16),
+ ('nimgs', ctypes.c_uint16),
+ ('atm', ctypes.c_void_p),
+ ('bas', ctypes.c_void_p),
+ ('env', ctypes.c_void_p),
+ ('ao_loc', ctypes.c_void_p),
+ ('img_coords', ctypes.c_void_p),
+ ('log_cutoff', ctypes.c_float),
+ ]
+
+def _conc_locs(ao_loc1, ao_loc2):
+ comp_loc = np.append(ao_loc1[:-1], ao_loc1[-1] + ao_loc2)
+ return cp.array(comp_loc, dtype=np.int32)
+
+def int3c2e_scheme(li, lj, lk, shm_size=SHM_SIZE):
+ order = li + lj + lk
+ nroots = (order//2 + 1) * 2
+
+ g_size = (li+1)*(lj+1)*(lk+1)
+ unit = g_size*3 + nroots*2 + 6
+ nksp_max = shm_size//(unit*8)
+ nksp_max = _nearest_power2(nksp_max)
+
+ nfi = (li + 1) * (li + 2) // 2
+ nfj = (lj + 1) * (lj + 2) // 2
+ nfk = (lk + 1) * (lk + 2) // 2
+ gout_size = nfi * nfj * nfk
+ gout_stride = (gout_size + GOUT_WIDTH-1) // GOUT_WIDTH
+ # Round up to the next 2^n
+ gout_stride = _nearest_power2(gout_stride, return_leq=False)
+
+ # Align nksh*gout_stride to warp size
+ if gout_stride < 32:
+ nksh_per_block = 32 // gout_stride
+ nsp_per_block = min(THREADS // 32, nksp_max // nksh_per_block)
+ else:
+ nksh_per_block = THREADS // gout_stride
+ nsp_per_block = 1
+ if nksp_max < nksh_per_block:
+ raise RuntimeError('GOUT_WIDTH too small or not enough shared memory')
+
+ gout_stride = THREADS // (nksh_per_block*nsp_per_block)
+ return nksh_per_block, gout_stride, nsp_per_block
+
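The three returned factors always tile one THREADS-sized thread block. A quick sanity check for a (pp|d) shell triple, under an assumed 48 KB shared-memory budget (the real budget comes from SHM_SIZE):

    from gpu4pyscf.pbc.df.int3c2e import int3c2e_scheme, THREADS

    nksh, gout_stride, nsp = int3c2e_scheme(1, 1, 2, shm_size=48*1024)
    assert nksh * gout_stride * nsp == THREADS  # the factors tile one block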
+def most_diffused_pgto(cell):
+ exps, cs = extract_pgto_params(cell, 'diffused')
+ ls = cell._bas[:,ANG_OF]
+ r2 = np.log(cs**2 / cell.precision * 10**ls) / exps
+ idx = r2.argmax()
+ return exps[idx], cs[idx], ls[idx]
+
+# This modified rcut estimation function will be available in pyscf-2.8 or newer
+def estimate_rcut(cell, auxcell, omega):
+ '''Estimate rcut for 3c2e SR-integrals'''
+ if cell.nbas == 0 or auxcell.nbas == 0:
+ return np.zeros(1)
+
+ if omega == 0:
+ # No SR integrals in int3c2e if omega=0
+ assert cell.dimension == 0
+ return np.zeros(1)
+
+ precision = cell.precision
+ ak, ck, lk = most_diffused_pgto(auxcell)
+
+ # the most diffused orbital basis
+ cell_exps, cs = extract_pgto_params(cell, 'diffused')
+ ls = cell._bas[:,ANG_OF]
+ r2_cell = np.log(cs**2 / precision * 10**ls) / cell_exps
+ ai_idx = r2_cell.argmax()
+ ai = cell_exps[ai_idx]
+ aj = cell_exps
+ li = ls[ai_idx]
+ lj = ls
+ ci = cs[ai_idx]
+ cj = cs
+
+ aij = ai + aj
+ lij = li + lj
+ l3 = lij + lk
+ theta = 1./(omega**-2 + 1./aij + 1./ak)
+ norm_ang = ((2*li+1)*(2*lj+1))**.5/(4*np.pi)
+ c1 = ci * cj * ck * norm_ang
+ sfac = aij*aj/(aij*aj + ai*theta)
+ fl = 2
+ fac = 2**li*np.pi**2.5*c1 * theta**(l3-.5)
+ rad = cell.vol**(-1./3) * cell.rcut + 1
+ surface = 4*np.pi * rad**2
+ lattice_sum_factor = 2*np.pi*cell.rcut/(cell.vol*theta) + surface
+ fac *= lattice_sum_factor
+ fac /= aij**(li+1.5) * ak**(lk+1.5) * aj**lj
+ fac *= fl / precision
+
+ r0 = cell.rcut # initial guess
+ r0 = (np.log(fac * (sfac*r0)**(l3-1) + 1.) / (sfac*theta))**.5
+ r0 = (np.log(fac * (sfac*r0)**(l3-1) + 1.) / (sfac*theta))**.5
+ rcut = r0
+ return rcut
+
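The two identical r0 updates above are two steps of the fixed-point iteration r <- sqrt(log(fac*(sfac*r)**(l3-1) + 1) / (sfac*theta)); the logarithm damps the dependence on r, so two steps are enough in practice. A sketch of the same solve with an explicit convergence loop, assuming scalar arguments (the production code vectorizes over shells):

    import numpy as np

    def solve_rcut(fac, sfac, theta, l3, r0, maxiter=10, tol=1e-3):
        for _ in range(maxiter):
            r1 = (np.log(fac * (sfac*r0)**(l3-1) + 1.) / (sfac*theta))**.5
            if abs(r1 - r0) < tol:
                break
            r0 = r1
        return r1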
+def guess_bvk_kmesh(cell, bvk_kmesh, target_size=BVK_CELL_SHELLS):
+    '''Generate a sufficiently large BvK cell for the fill_int3c2e kernel to
+    achieve better load balance'''
+ if bvk_kmesh is None:
+ bvk_kmesh = np.ones(3, dtype=int)
+ else:
+ bvk_kmesh = bvk_kmesh.copy()
+ bvk_ncells = np.prod(bvk_kmesh)
+
+    # produce a BvK cell with roughly target_size (~2400) shells
+ replica = target_size / (bvk_ncells * cell.nbas)
+ if replica < 1:
+ return bvk_kmesh
+
+ mesh_max = cell.nimgs * 2 + 1
+ bvk_multiplier = mesh_max / bvk_kmesh
+ if cell.dimension == 2:
+ fac = (replica / np.prod(bvk_multiplier[:2]))**.5
+ fac = min(fac, 1)
+ bvk_kmesh[:2] *= (fac * bvk_multiplier[:2]).astype(int)
+ else:
+ # The replica on each axis should be proportional to the required nimg
+ # along each direction.
+ fac = (replica / np.prod(bvk_multiplier))**(1./3)
+ # The replica is not necessary to be more than the required nimg.
+ fac = min(fac, 1)
+ bvk_kmesh *= (fac * bvk_multiplier).astype(int)
+
+ return bvk_kmesh
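A worked illustration of the 3D branch, with assumed cell parameters (10 shells, nimgs of 4 per axis, input kmesh [2, 2, 2]):

    import numpy as np

    bvk_kmesh = np.array([2, 2, 2])
    replica = 2400 / (np.prod(bvk_kmesh) * 10)       # 30 more replicas allowed
    mesh_max = np.array([4, 4, 4]) * 2 + 1           # [9, 9, 9]
    bvk_multiplier = mesh_max / bvk_kmesh            # [4.5, 4.5, 4.5]
    fac = min((replica / np.prod(bvk_multiplier))**(1./3), 1)
    bvk_kmesh *= (fac * bvk_multiplier).astype(int)  # -> [6, 6, 6]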
diff --git a/gpu4pyscf/pbc/df/rsdf_builder.py b/gpu4pyscf/pbc/df/rsdf_builder.py
new file mode 100644
index 00000000..9f892504
--- /dev/null
+++ b/gpu4pyscf/pbc/df/rsdf_builder.py
@@ -0,0 +1,427 @@
+# Copyright 2024-2025 The PySCF Developers. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+'''
+Build GDF tensor using the range-separation integral algorithm.
+'''
+
+import os
+import ctypes
+import warnings
+import numpy as np
+import cupy as cp
+from cupyx.scipy.linalg import solve_triangular
+from pyscf import lib
+#from pyscf.pbc import gto as pbcgto
+#from pyscf.pbc.gto import pseudo
+from pyscf.pbc.tools import pbc as pbctools
+from pyscf.pbc.lib.kpts_helper import is_zero
+from pyscf.pbc.df.rsdf_builder import (
+ RCUT_THRESHOLD, estimate_ke_cutoff_for_omega)
+from pyscf.pbc.df import aft as aft_cpu
+from gpu4pyscf.lib import logger
+from gpu4pyscf.lib.cupy_helper import contract, get_avail_mem
+from gpu4pyscf.pbc.df import ft_ao
+from gpu4pyscf.pbc.lib.kpts_helper import kk_adapted_iter
+from gpu4pyscf.pbc.tools.k2gamma import kpts_to_kmesh
+from gpu4pyscf.pbc.gto.cell import extract_pgto_params
+from gpu4pyscf.pbc.df.int3c2e import sr_aux_e2, estimate_rcut
+
+OMEGA_MIN = 0.3
+
+# In the ED of the j2c2e metric, the default LINEAR_DEP_THR setting in pyscf-2.8
+# is too loose. The linear dependency truncation often leads to serious errors.
+# PBC GDF differs considerably from the molecular GDF approximation, where
+# diffuse functions typically make insignificant contributions. The diffuse
+# auxiliary crystal orbitals have a large impact on the accuracy of Coulomb
+# integrals. A tight linear dependency threshold has to be applied to control
+# the error, even though this may cause more numerical stability issues.
+LINEAR_DEP_THR = 1e-11
+# Use eigenvalue decomposition in decompose_j2c
+PREFER_ED = False
+
+def build_cderi(cell, auxcell, kpts=None, j_only=False,
+ omega=None, linear_dep_threshold=LINEAR_DEP_THR):
+ assert cell.low_dim_ft_type != 'inf_vacuum'
+ assert cell.dimension >= 2
+ if cell.omega != 0:
+ assert cell.omega < 0
+ omega = abs(cell.omega)
+ with_long_range = False
+ else:
+ if omega is None:
+ cell_exps, cs = extract_pgto_params(cell, 'diffused')
+ omega = cell_exps.min()**.5
+ logger.debug(cell, 'omega guess in rsdf_builder = %g', omega)
+ omega = abs(omega)
+ with_long_range = True
+
+ if kpts is None or is_zero(kpts):
+ return build_cderi_gamma_point(
+ cell, auxcell, omega, with_long_range, linear_dep_threshold)
+ elif j_only:
+ return build_cderi_j_only(
+ cell, auxcell, kpts, omega, with_long_range, linear_dep_threshold)
+ else:
+ return build_cderi_kk(
+ cell, auxcell, kpts, omega, with_long_range, linear_dep_threshold)
+
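A minimal usage sketch of the dispatcher, mirroring the unit tests added later in this patch; the toy auxiliary basis is an assumption:

    import numpy as np
    import pyscf
    from gpu4pyscf.pbc.df.rsdf_builder import build_cderi

    cell = pyscf.M(atom='He 1. .5 .5; He .1 1.3 2.1',
                   basis='ccpvdz', a=np.eye(3)*3.0)
    auxcell = cell.copy()
    auxcell.basis = [[0, [2., 1.]], [1, [0.8, 1.]]]  # toy auxiliary basis
    auxcell.build()
    kpts = cell.make_kpts([2, 1, 1])
    cderi, cderip = build_cderi(cell, auxcell, kpts)  # dicts keyed by (ki, kj)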
+def build_cderi_kk(cell, auxcell, kpts, omega=OMEGA_MIN, with_long_range=True,
+ linear_dep_threshold=LINEAR_DEP_THR):
+ log = logger.new_logger(cell)
+ t0 = log.init_timer()
+ if kpts is None:
+ kpts = np.zeros((1, 3))
+ bvk_kmesh = kmesh = np.ones(3, dtype=int)
+ else:
+        # Remote images can contribute for certain k-point meshes, adding to
+        # the finite-size error in HFX. For a sufficiently large number of
+        # kpts, the truncation radius cell.rcut may cause finite-size errors.
+ kpts = kpts.reshape(-1, 3)
+ rcut = estimate_rcut(cell, auxcell, omega).max()
+ bvk_kmesh = kmesh = kpts_to_kmesh(cell, kpts, rcut=rcut)
+ if len(kpts) != np.prod(kmesh):
+            # When targeting many kpts, num-kpts can exceed num-bvk-images.
+            # Regenerate the MP kmesh using a larger radius so that the new
+            # mesh covers all kpts.
+ kmesh = kpts_to_kmesh(cell, kpts, rcut=rcut*20)
+ j3c = sr_aux_e2(cell, auxcell, -omega, kpts, bvk_kmesh)
+ t1 = log.timer('pass1: int3c2e', *t0)
+
+ kpt_iters = list(kk_adapted_iter(kmesh))
+ uniq_kpts = kpts[[x[0] for x in kpt_iters]]
+ log.debug('Generate auxcell 2c2e integrals')
+ j2c = _get_2c2e(auxcell, uniq_kpts, omega, with_long_range) # on CPU
+ t1 = log.timer('int2c2e', *t1)
+
+ if with_long_range:
+ ft_ao_iter = _ft_ao_iter_generator(cell, auxcell, bvk_kmesh, omega, log)
+
+ prefer_ed = PREFER_ED
+ if cell.dimension == 2:
+ prefer_ed = True
+ cderi = {}
+ cderip = {}
+ for j2c_idx, (kp, kp_conj, ki_idx, kj_idx) in enumerate(kpt_iters):
+ log.debug1('make_cderi for k-point %d %s', kp, kpts[kp])
+ log.debug1('ki_idx = %s', ki_idx)
+ log.debug1('kj_idx = %s', kj_idx)
+
+ if with_long_range:
+            # exp(-i*(G + k) dot r) * Coulomb_kernel
+ for pqG, auxG_conj in ft_ao_iter(kpts[kp], kpts[kj_idx]):
+ # \sum_G coulG * ints(ij * exp(-i G * r)) * ints(P * exp(i G * r))
+ # = \sum_G FT(ij, G) conj(FT(aux, G)) , where aux
+ # functions |P> are assumed to be real
+ j3c[ki_idx,kj_idx] += contract('kpqG,Gr->kpqr', pqG, auxG_conj)
+
+ j2c_k = j2c[j2c_idx]
+ if kp == kp_conj: # self conjugated
+ # DF metric for self-conjugated k-point should be real
+ j2c_k = j2c_k.real
+ cd_j2c, cd_j2c_negative, j2ctag = decompose_j2c(
+ j2c_k, prefer_ed, linear_dep_threshold)
+ if cd_j2c.dtype != j3c.dtype:
+ cd_j2c = cd_j2c.astype(j3c.dtype)
+
+ for ki, kj in zip(ki_idx, kj_idx):
+ j3c_k = j3c[ki,kj]
+ cderi[ki,kj] = _solve_cderi(cd_j2c, j3c_k, j2ctag)
+ if cd_j2c_negative is not None:
+ assert cell.dimension == 2
+ cderip[ki,kj] = _solve_cderi(cd_j2c_negative, j3c_k, j2ctag)
+ t1 = log.timer('pass2: solve cderi', *t1)
+ return cderi, cderip
+
+def build_cderi_gamma_point(cell, auxcell, omega=OMEGA_MIN, with_long_range=True,
+ linear_dep_threshold=LINEAR_DEP_THR):
+ log = logger.new_logger(cell)
+ t0 = log.init_timer()
+ kmesh = None
+ kpts = None
+
+ j3c = sr_aux_e2(cell, auxcell, -omega)
+ t1 = log.timer('pass1: int3c2e', *t0)
+
+ log.debug('Generate auxcell 2c2e integrals')
+ j2c = _get_2c2e(auxcell, kpts, omega, with_long_range) # on CPU
+ j2c = j2c[0].real
+ t1 = log.timer('int2c2e', *t1)
+
+ cderi = {}
+ cderip = {}
+ if with_long_range:
+ ft_ao_iter = _ft_ao_iter_generator(cell, auxcell, kmesh, omega, log)
+ for pqG, auxG_conj in ft_ao_iter():
+ # \sum_G coulG * ints(ij * exp(-i G * r)) * ints(P * exp(i G * r))
+ # = \sum_G FT(ij, G) conj(FT(aux, G)) , where aux
+ # functions |P> are assumed to be real
+ j3c += contract('pqG,Gr->pqr', pqG[0], auxG_conj).real
+
+ prefer_ed = PREFER_ED
+ if cell.dimension == 2:
+ prefer_ed = True
+ cd_j2c, cd_j2c_negative, j2ctag = decompose_j2c(
+ j2c, prefer_ed, linear_dep_threshold)
+
+ cderi[0,0] = _solve_cderi(cd_j2c, j3c, j2ctag)
+ if cd_j2c_negative is not None:
+ assert cell.dimension == 2
+ cderip[0,0] = _solve_cderi(cd_j2c_negative, j3c, j2ctag)
+ t1 = log.timer('pass2: solve cderi', *t1)
+ return cderi, cderip
+
+def build_cderi_j_only(cell, auxcell, kpts, omega=OMEGA_MIN, with_long_range=True,
+ linear_dep_threshold=LINEAR_DEP_THR):
+ log = logger.new_logger(cell)
+ t0 = log.init_timer()
+ if kpts is None:
+ kpts = np.zeros((1, 3))
+ bvk_kmesh = np.ones(3, dtype=int)
+ else:
+        # Coulomb integrals need a smaller kmesh to converge finite-size effects.
+ # A relatively small bvk_kmesh can be used for Coulomb integrals.
+ kpts = kpts.reshape(-1, 3)
+ bvk_kmesh = kpts_to_kmesh(cell, kpts)
+ # TODO: time-reversal symmetry in j3c, j2c
+ j3c = sr_aux_e2(cell, auxcell, -omega, kpts, bvk_kmesh, j_only=True)
+ t1 = log.timer('pass1: int3c2e', *t0)
+
+ log.debug('Generate auxcell 2c2e integrals')
+ j2c = _get_2c2e(auxcell, None, omega, with_long_range) # on CPU
+ j2c = j2c[0].real
+ t1 = log.timer('int2c2e', *t1)
+
+ # TODO: consider time-reversal symmetry
+ cderi = {}
+ cderip = {}
+ if with_long_range:
+ ft_ao_iter = _ft_ao_iter_generator(cell, auxcell, bvk_kmesh, omega, log)
+ kpt = np.zeros(3)
+ for pqG, auxG_conj in ft_ao_iter(kpt, kpts):
+ # \sum_G coulG * ints(ij * exp(-i G * r)) * ints(P * exp(i G * r))
+ # = \sum_G FT(ij, G) conj(FT(aux, G)) , where aux
+ # functions |P> are assumed to be real
+ j3c += contract('kpqG,Gr->kpqr', pqG, auxG_conj)
+
+ prefer_ed = PREFER_ED
+ if cell.dimension == 2:
+ prefer_ed = True
+ cd_j2c, cd_j2c_negative, j2ctag = decompose_j2c(
+ j2c, prefer_ed, linear_dep_threshold)
+ if cd_j2c.dtype != j3c.dtype:
+ cd_j2c = cd_j2c.astype(j3c.dtype)
+
+ nkpts = len(kpts)
+ for k in range(nkpts):
+ cderi[k, k] = _solve_cderi(cd_j2c, j3c[k], j2ctag)
+ if cd_j2c_negative is not None:
+ assert cell.dimension == 2
+ cderip[k, k] = _solve_cderi(cd_j2c_negative, j3c[k], j2ctag)
+ t1 = log.timer('pass2: solve cderi', *t1)
+ return cderi, cderip
+
+def _weighted_coulG_LR(cell, Gv, omega, kws, kpt=np.zeros(3)):
+ coulG = pbctools.get_coulG(cell, kpt, exx=False, Gv=Gv, omega=abs(omega))
+ coulG *= kws
+ if is_zero(kpt):
+ assert Gv[0].dot(Gv[0]) == 0
+ coulG[0] -= np.pi / omega**2 / cell.vol
+ return cp.asarray(coulG)
+
+def _ft_ao_iter_generator(cell, auxcell, bvk_kmesh, omega, verbose=None):
+ ke_cutoff = estimate_ke_cutoff_for_omega(cell, omega)
+ mesh = cell.cutoff_to_mesh(ke_cutoff)
+ mesh = cell.symmetrize_mesh(mesh)
+ Gv, Gvbase, kws = cell.get_Gv_weights(mesh)
+ ngrids = len(Gv)
+ nao = cell.nao
+
+ ft_opt = ft_ao.FTOpt(cell, bvk_kmesh=bvk_kmesh)
+ ft_kern = ft_opt.gen_ft_kernel(verbose=verbose)
+ if bvk_kmesh is None:
+ bvk_ncells = 1
+ else:
+ bvk_ncells = np.prod(bvk_kmesh)
+ avail_mem = get_avail_mem() * .8
+ Gblksize = max(16, int(avail_mem/(2*16*nao**2*bvk_ncells))//8*8)
+ Gblksize = min(Gblksize, ngrids, 16384)
+ #logger.debug1(cell, 'Gblksize = %d', Gblksize)
+ def ft_ao_iter(kpt=np.zeros(3), kpts=None):
+ coulG = _weighted_coulG_LR(auxcell, Gv, omega, kws, kpt)
+ auxG_conj = cp.asarray(ft_ao.ft_ao(auxcell, Gv, kpt=kpt).conj(), order='C')
+ auxG_conj *= cp.asarray(coulG[:,None])
+ for p0, p1 in lib.prange(0, ngrids, Gblksize):
+ pqG = ft_kern(Gv[p0:p1], kpt, kpts).transpose(0,2,3,1)
+ yield pqG, auxG_conj[p0:p1]
+ return ft_ao_iter
+
+def decompose_j2c(j2c, prefer_ed=PREFER_ED, linear_dep_threshold=LINEAR_DEP_THR):
+ if prefer_ed:
+ return eigenvalue_decomposed_metric(j2c, linear_dep_threshold)
+ else:
+ return cholesky_decomposed_metric(j2c)
+
+def cholesky_decomposed_metric(j2c):
+ '''Return L for j2c = L L^T'''
+ j2c_negative = None
+ j2ctag = 'CD'
+    # CuPy's cholesky does not check for positive definiteness; it seems to
+    # silently return nan in the resulting CD matrix.
+ j2c = cp.asarray(j2c)
+ j2c = cp.linalg.cholesky(j2c)
+ if cp.isnan(j2c[-1,-1]):
+ raise RuntimeError('j2c is not positive definite')
+ return j2c, j2c_negative, j2ctag
+
+def eigenvalue_decomposed_metric(j2c, linear_dep_threshold=LINEAR_DEP_THR):
+ j2c = cp.asarray(j2c)
+ w, v = cp.linalg.eigh(j2c)
+ mask = w > linear_dep_threshold
+ v1 = v[:,mask].conj().T
+ v1 *= w[mask, None]**-.5
+ j2c = v1
+ idx = cp.where(w < -linear_dep_threshold)[0]
+ j2c_negative = None
+ if len(idx) > 0:
+ j2c_negative = (v[:,idx] * (-w[idx])**-.5).conj().T
+ j2ctag = 'ED'
+ return j2c, j2c_negative, j2ctag
+
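The retained rows of the ED factor whiten the metric: cd_j2c . j2c . cd_j2c^H = I on the kept eigenspace. A quick sanity check on a random well-conditioned SPD matrix; this is a sketch, not part of the production path:

    import cupy as cp
    from gpu4pyscf.pbc.df.rsdf_builder import eigenvalue_decomposed_metric

    n = 6
    a = cp.random.rand(n, n)
    j2c = a @ a.T + n * cp.eye(n)          # symmetric positive definite
    cd, cd_neg, tag = eigenvalue_decomposed_metric(j2c)
    assert tag == 'ED' and cd_neg is None  # no negative eigenvalues here
    assert cp.allclose(cd @ j2c @ cd.T, cp.eye(cd.shape[0]), atol=1e-8)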
+# Create 2c2e, store on CPU
+def _get_2c2e(auxcell, uniq_kpts, omega, with_long_range=True):
+ # j2c ~ (-kpt_ji | kpt_ji) => hermi=1
+ precision = auxcell.precision ** 1.5
+ aux_exps, aux_cs = extract_pgto_params(auxcell, 'diffused')
+ aux_exp = aux_exps.min()
+ theta = 1./(2./aux_exp + omega**-2)
+ rad = auxcell.vol**(-1./3) * auxcell.rcut + 1
+ surface = 4*np.pi * rad**2
+ lattice_sum_factor = 2*np.pi*auxcell.rcut/(auxcell.vol*theta) + surface
+ rcut_sr = (np.log(lattice_sum_factor / precision + 1.) / theta)**.5
+ logger.debug1(auxcell, 'auxcell rcut_sr = %g', rcut_sr)
+ auxcell_sr = auxcell.copy()
+ auxcell_sr.rcut = rcut_sr
+ with auxcell_sr.with_short_range_coulomb(omega):
+ j2c = auxcell_sr.pbc_intor('int2c2e', hermi=1, kpts=uniq_kpts)
+
+ if not with_long_range:
+ return j2c
+
+ ke = estimate_ke_cutoff_for_omega(auxcell, omega, precision)
+ mesh = auxcell.cutoff_to_mesh(ke)
+ mesh = auxcell.symmetrize_mesh(mesh)
+ logger.debug(auxcell, 'Set 2c2e integrals precision %g, mesh %s', precision, mesh)
+
+ Gv, Gvbase, kws = auxcell.get_Gv_weights(mesh)
+ b = auxcell.reciprocal_vectors()
+ gxyz = lib.cartesian_prod([np.arange(len(x)) for x in Gvbase])
+ ngrids = Gv.shape[0]
+ naux = auxcell.nao
+ max_memory = max(1000, auxcell.max_memory - lib.current_memory()[0])
+ blksize = min(ngrids, int(max_memory*.4e6/16/naux), 200000)
+ logger.debug2(auxcell, 'max_memory %s (MB) blocksize %s', max_memory, blksize)
+
+ if uniq_kpts is None:
+ j2c = cp.asarray(j2c)
+ coulG_LR = _weighted_coulG_LR(auxcell, Gv, omega, kws)
+ for p0, p1 in lib.prange(0, ngrids, blksize):
+ auxG = ft_ao.ft_ao(auxcell, Gv[p0:p1], None, b, gxyz[p0:p1], Gvbase).T
+ j2c += (auxG.conj() * coulG_LR[p0:p1]).dot(auxG.T).real
+ auxG = None
+ j2c = [j2c.real.get()]
+ else:
+ for k, kpt in enumerate(uniq_kpts):
+ j2c_k = cp.asarray(j2c[k])
+ coulG_LR = _weighted_coulG_LR(auxcell, Gv, omega, kws, kpt)
+ gamma_point = is_zero(kpt)
+
+ for p0, p1 in lib.prange(0, ngrids, blksize):
+ auxG = ft_ao.ft_ao(auxcell, Gv[p0:p1], None, b, gxyz[p0:p1], Gvbase, kpt).T
+ if gamma_point:
+ j2c_k += (auxG.conj() * coulG_LR[p0:p1]).dot(auxG.T).real
+ else:
+ j2c_k += (auxG.conj() * coulG_LR[p0:p1]).dot(auxG.T)
+ auxG = None
+ j2c[k] = j2c_k.get()
+ return j2c
+
+def _solve_cderi(cd_j2c, j3c, j2ctag):
+ if j2ctag == 'ED':
+ return contract('Lr,pqr->Lpq', cd_j2c, j3c)
+ else:
+ nao, naux = j3c.shape[1:3]
+ j3c = solve_triangular(cd_j2c, j3c.reshape(-1,naux).T, lower=True)
+ return j3c.reshape(naux,nao,nao)
+
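In the CD branch, solve_triangular applies L^-1 to the flattened j3c, so the result agrees with a generic linear solve on the same system. A small consistency check; the shapes are assumptions:

    import cupy as cp
    from gpu4pyscf.pbc.df.rsdf_builder import (
        cholesky_decomposed_metric, _solve_cderi)

    naux, nao = 4, 3
    a = cp.random.rand(naux, naux)
    j2c = a @ a.T + naux * cp.eye(naux)              # SPD metric
    j3c = cp.random.rand(nao, nao, naux)
    L, _, tag = cholesky_decomposed_metric(j2c)      # j2c = L L^T
    x = _solve_cderi(L, j3c, tag)                    # (naux, nao, nao)
    ref = cp.linalg.solve(L, j3c.reshape(-1, naux).T).reshape(naux, nao, nao)
    assert cp.allclose(x, ref)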
+def get_pp_loc_part1(cell, kpts=None, with_pseudo=True, verbose=None):
+ fakenuc = aft_cpu._fake_nuc(cell, with_pseudo=with_pseudo)
+ cell_exps, cs = extract_pgto_params(cell, 'diffused')
+ omega = (2*cell_exps.min())**.5
+ logger.debug(cell, 'omega guess in get_pp_loc_part1 = %g', omega)
+
+ if kpts is None or is_zero(kpts):
+ kpts = None
+ bvk_kmesh = np.ones(3, dtype=int)
+ else:
+ bvk_kmesh = kpts_to_kmesh(cell, kpts)
+ nuc = sr_aux_e2(cell, fakenuc, -omega, kpts, bvk_kmesh, j_only=True)
+ charges = -cp.asarray(cell.atom_charges())
+ if kpts is None:
+ nuc = contract('pqr,r->pq', nuc, charges)
+ else:
+ nuc = contract('kpqr,r->kpq', nuc, charges)
+
+ # TODO: consider time-reversal symmetry
+ ft_ao_iter = _ft_ao_iter_generator(cell, fakenuc, bvk_kmesh, omega, verbose)
+ kpt = np.zeros(3)
+ for i, (pqG, auxG_conj) in enumerate(ft_ao_iter(kpt, kpts)):
+ ZG = auxG_conj.dot(charges)
+ # contributions due to pseudo.pp_int.get_gth_vlocG_part1
+ if (with_pseudo and i == 0 and
+ (cell.dimension == 3 or
+ (cell.dimension == 2 and cell.low_dim_ft_type != 'inf_vacuum'))):
+ exps = cp.asarray(np.hstack(fakenuc.bas_exps()))
+ ZG[0] -= charges.dot(np.pi/exps) / cell.vol
+ if kpts is None:
+ nuc += contract('pqG,G->pq', pqG[0], ZG).real
+ else:
+ nuc += contract('kpqG,G->kpq', pqG, ZG)
+ return nuc
+
+def get_nuc(cell, kpts=None):
+ '''Get the periodic nuc-el AO matrix, with G=0 removed.
+ '''
+ log = logger.new_logger(cell)
+ t0 = log.init_timer()
+ nuc = get_pp_loc_part1(cell, kpts, with_pseudo=False, verbose=log)
+ log.timer('get_nuc', *t0)
+ return nuc
+
+def get_pp(cell, kpts=None):
+ '''Get the periodic pseudopotential nuc-el ao matrix, with G=0 removed.
+ '''
+ from pyscf.pbc.gto import pseudo
+ log = logger.new_logger(cell)
+ t0 = log.init_timer()
+ pp2builder = aft_cpu._IntPPBuilder(cell, kpts)
+ vpp = cp.asarray(pp2builder.get_pp_loc_part2())
+ t1 = log.timer_debug1('get_pp_loc_part2', *t0)
+ vpp += cp.asarray(pseudo.pp_int.get_pp_nl(cell, kpts))
+ t1 = log.timer_debug1('get_pp_nl', *t1)
+
+ vpp += get_pp_loc_part1(cell, kpts, with_pseudo=True, verbose=log)
+ t1 = log.timer_debug1('get_pp_loc_part1', *t1)
+ log.timer('get_pp', *t0)
+ return vpp
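Both entry points take an optional k-point argument; a usage sketch with an assumed toy cell:

    import numpy as np
    import pyscf
    from gpu4pyscf.pbc.df import rsdf_builder

    cell = pyscf.M(atom='He 1. .5 .5; He .1 1.3 2.1',
                   basis='ccpvdz', a=np.eye(3)*3.0)
    vnuc = rsdf_builder.get_nuc(cell)                # gamma point
    vnuc_k = rsdf_builder.get_nuc(cell, cell.make_kpts([2, 1, 1]))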
diff --git a/gpu4pyscf/pbc/df/tests/test_pbc_aft.py b/gpu4pyscf/pbc/df/tests/test_pbc_aft.py
index 6ca1d627..98ddad61 100644
--- a/gpu4pyscf/pbc/df/tests/test_pbc_aft.py
+++ b/gpu4pyscf/pbc/df/tests/test_pbc_aft.py
@@ -1,4 +1,4 @@
-# Copyright 2021-2024 The PySCF Developers. All Rights Reserved.
+# Copyright 2024-2025 The PySCF Developers. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -29,10 +29,9 @@ def setUpModule():
'C' :[[0, [1., 1]]],}
cell.pseudo = {'C':'gth-pade'}
cell.a = np.eye(3) * 2.5
+ cell.precision = 1e-8
cell.build()
- np.random.seed(1)
- kpts = np.random.random((4,3))
- kpts[3] = kpts[0]-kpts[1]+kpts[2]
+ kpts = cell.make_kpts([13,1,1])[4:8]
cell1 = pgto.Cell()
cell1.atom = 'He 1. .5 .5; He .1 1.3 2.1'
@@ -49,22 +48,22 @@ class KnownValues(unittest.TestCase):
def test_aft_get_pp(self):
ref = aft_cpu.AFTDF(cell, kpts[0]).get_pp()
v1 = aft.AFTDF(cell, kpts[0]).get_pp().get()
- assert abs(v1 - ref).max() < 1e-12
+ assert abs(v1 - ref).max() < 1e-9
kpts4 = cell.make_kpts([4,1,1])
ref = aft_cpu.AFTDF(cell, kpts4).get_pp()
v1 = aft.AFTDF(cell, kpts4).get_pp().get()
- assert abs(v1 - ref).max() < 1e-12
+ assert abs(v1 - ref).max() < 1e-9
def test_aft_get_nuc(self):
ref = aft_cpu.AFTDF(cell, kpts[0]).get_nuc()
v1 = aft.AFTDF(cell, kpts[0]).get_nuc().get()
- assert abs(v1 - ref).max() < 1e-12
+ assert abs(v1 - ref).max() < 1e-9
kpts4 = cell.make_kpts([4,1,1])
ref = aft_cpu.AFTDF(cell, kpts4).get_nuc()
v1 = aft.AFTDF(cell, kpts4).get_nuc().get()
- assert abs(v1 - ref).max() < 1e-12
+ assert abs(v1 - ref).max() < 1e-9
def test_jk(self):
mesh = [11]*3
@@ -76,15 +75,15 @@ def test_jk(self):
dm = np.random.random((nao,nao))
jref, kref = mydf0.get_jk(dm, hermi=0, exxdiv='ewald')
vj, vk = mydf.get_jk(dm, hermi=0, exxdiv='ewald')
- assert abs(vj.get() - jref).max() < 1e-12
- assert abs(vk.get() - kref).max() < 1e-12
+ assert abs(vj.get() - jref).max() < 1e-9
+ assert abs(vk.get() - kref).max() < 1e-9
dm = dm + np.random.random((nao,nao)) * 1j
dm = dm + dm.conj().T
jref, kref = mydf0.get_jk(dm, hermi=1, exxdiv='ewald')
vj, vk = mydf.get_jk(dm, hermi=1, exxdiv='ewald')
- assert abs(vj.get() - jref).max() < 1e-12
- assert abs(vk.get() - kref).max() < 1e-12
+ assert abs(vj.get() - jref).max() < 1e-9
+ assert abs(vk.get() - kref).max() < 1e-9
def test_jk_complex_dm(self):
scaled_center = [0.3728,0.5524,0.7672]
@@ -98,14 +97,14 @@ def test_jk_complex_dm(self):
dm = np.random.random((nao,nao)) + np.random.random((nao,nao)) * 1j
jref, kref = mydf0.get_jk(dm, hermi=0, kpts=kpt, exxdiv='ewald')
vj, vk = mydf.get_jk(dm, hermi=0, kpts=kpt, exxdiv='ewald')
- assert abs(vj.get() - jref).max() < 1e-12
- assert abs(vk.get() - kref).max() < 1e-12
+ assert abs(vj.get() - jref).max() < 1e-9
+ assert abs(vk.get() - kref).max() < 1e-9
dm = dm + dm.conj().T
jref, kref = mydf0.get_jk(dm, hermi=1, kpts=kpt, exxdiv='ewald')
vj, vk = mydf.get_jk(dm, hermi=1, kpts=kpt, exxdiv='ewald')
- assert abs(vj.get() - jref).max() < 1e-12
- assert abs(vk.get() - kref).max() < 1e-12
+ assert abs(vj.get() - jref).max() < 1e-9
+ assert abs(vk.get() - kref).max() < 1e-9
def test_aft_j(self):
kpts = np.random.random((4,3))
@@ -120,7 +119,7 @@ def test_aft_j(self):
dm = dm + dm.transpose(0,2,1)
jref = mydf0.get_jk(dm, with_k=False)[0]
vj = mydf.get_jk(dm, with_k=False)[0]
- assert abs(vj.get() - jref).max() < 1e-12
+ assert abs(vj.get() - jref).max() < 1e-9
def test_aft_k(self):
kpts = cell.get_abs_kpts([[-.25,-.25,-.25],
@@ -141,7 +140,7 @@ def test_aft_k(self):
dm = np.random.random((nkpts,nao,nao))
kref = mydf0.get_jk(dm, hermi=0, with_j=False)[1]
vk = mydf.get_jk(dm, hermi=0, with_j=False)[1]
- assert abs(vk.get() - kref).max() < 1e-12
+ assert abs(vk.get() - kref).max() < 1e-9
def test_aft_k1(self):
kpts = cell.get_abs_kpts([[-.25,-.25,-.25],
@@ -163,7 +162,7 @@ def test_aft_k1(self):
dm = dm + dm.transpose(0,2,1)
kref = mydf0.get_jk(dm, hermi=1, with_j=False)[1]
vk = mydf.get_jk(dm, hermi=1, with_j=False)[1]
- assert abs(vk.get() - kref).max() < 1e-12
+ assert abs(vk.get() - kref).max() < 1e-9
def test_aft_k2(self):
kpts = cell.make_kpts([2,1,1])
@@ -183,7 +182,7 @@ def test_aft_k2(self):
kref = mydf0.get_jk(dm, hermi=1, with_j=False)[1]
vk = mydf.get_jk(dm, hermi=1, with_j=False)[1]
- assert abs(vk.get() - kref).max() < 1e-12
+ assert abs(vk.get() - kref).max() < 1e-9
def test_aft_k3(self):
kpts = cell.make_kpts([6,1,1])
@@ -205,7 +204,7 @@ def test_aft_k3(self):
kref = mydf0.get_jk(dm, hermi=1, with_j=False)[1]
vk = mydf.get_jk(dm, hermi=1, with_j=False)[1]
- assert abs(vk.get() - kref).max() < 1e-12
+ assert abs(vk.get() - kref).max() < 1e-9
if __name__ == '__main__':
print("Full Tests for aft")
diff --git a/gpu4pyscf/pbc/df/tests/test_pbc_df.py b/gpu4pyscf/pbc/df/tests/test_pbc_df.py
index e89cc8a0..fcc22837 100644
--- a/gpu4pyscf/pbc/df/tests/test_pbc_df.py
+++ b/gpu4pyscf/pbc/df/tests/test_pbc_df.py
@@ -1,4 +1,4 @@
-# Copyright 2021-2024 The PySCF Developers. All Rights Reserved.
+# Copyright 2024-2025 The PySCF Developers. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -28,6 +28,7 @@ def setUpModule():
'C' :[[0, [1., 1]]],}
cell.pseudo = {'C':'gth-pade'}
cell.a = np.eye(3) * 2.5
+ cell.precision = 1e-8
cell.build()
def tearDownModule():
@@ -37,15 +38,19 @@ def tearDownModule():
class KnownValues(unittest.TestCase):
def test_get_pp(self):
- kpt = cell.make_kpts([9,6,5])[107]
- ref = df_cpu.GDF(cell, kpt).get_pp()
- v1 = GDF(cell, kpt).get_pp().get()
- assert abs(v1 - ref).max() < 1e-12
+ #kpt = cell.make_kpts([9,6,5])[107]
+ #ref = df_cpu.GDF(cell, kpt).get_pp()
+ #v1 = GDF(cell, kpt).get_pp().get()
+ #assert abs(v1 - ref).max() < 1e-8
+
+ ref = df_cpu.GDF(cell).get_pp()
+ v1 = GDF(cell).get_pp().get()
+ assert abs(v1 - ref).max() < 1e-8
kpts4 = cell.make_kpts([4,1,1])
ref = df_cpu.GDF(cell, kpts4).get_pp()
v1 = GDF(cell, kpts4).get_pp().get()
- assert abs(v1 - ref).max() < 1e-12
+ assert abs(v1 - ref).max() < 1e-8
def test_get_nuc(self):
L = 5.
@@ -56,18 +61,18 @@ def test_get_nuc(self):
cell1.atom = '''He 3. 2. 3.
He 1. 1. 1.'''
cell1.basis = 'ccpvdz'
- cell1.precision=1e-12
+ cell1.precision=1e-8
cell1.verbose = 0
cell1.max_memory = 1000
cell1.build(0,0)
ref = df_cpu.GDF(cell1).get_nuc()
v1 = GDF(cell1).get_nuc().get()
- assert abs(v1 - ref).max() < 1e-12
+ assert abs(v1 - ref).max() < 1e-8
kpts4 = cell1.make_kpts([4,1,1])
ref = df_cpu.GDF(cell1, kpts4).get_nuc()
v1 = GDF(cell1, kpts4).get_nuc().get()
- assert abs(v1 - ref).max() < 1e-12
+ assert abs(v1 - ref).max() < 1e-8
def test_jk(self):
mydf0 = df_cpu.GDF(cell)
@@ -78,16 +83,38 @@ def test_jk(self):
dm = np.random.random((nao,nao))
jref, kref = mydf0.get_jk(dm, hermi=0, exxdiv='ewald')
vj, vk = mydf.get_jk(dm, hermi=0, exxdiv='ewald')
- assert abs(vj.get() - jref).max() < 1e-12
- assert abs(vk.get() - kref).max() < 1e-12
+ assert abs(vj.get() - jref).max() < 1e-8
+ assert abs(vk.get() - kref).max() < 1e-8
dm = dm + np.random.random((nao,nao)) * 1j
dm = dm + dm.conj().T
jref, kref = mydf0.get_jk(dm, hermi=1, exxdiv='ewald')
vj, vk = mydf.get_jk(dm, hermi=1, exxdiv='ewald')
- assert abs(vj.get() - jref).max() < 1e-12
- assert abs(vk.get() - kref).max() < 1e-12
+ assert abs(vj.get() - jref).max() < 1e-8
+ assert abs(vk.get() - kref).max() < 1e-8
+
+ def test_jk1(self):
+ kpts = cell.make_kpts([1,6,1])
+ nkpts = len(kpts)
+ mydf0 = df_cpu.GDF(cell, kpts)
+ mydf = GDF(cell, kpts)
+
+ nao = cell.nao
+ np.random.seed(12)
+ dm = (np.random.random((nkpts, nao, nao)) +
+ np.random.random((nkpts, nao, nao))*1j)
+ jref, kref = mydf0.get_jk(dm, hermi=0, exxdiv='ewald')
+ vj, vk = mydf.get_jk(dm, hermi=0, exxdiv='ewald')
+ assert abs(vj.get() - jref).max() < 1e-8
+ assert abs(vk.get() - kref).max() < 1e-8
+
+ dm = dm + dm.conj().transpose(0,2,1)
+ jref, kref = mydf0.get_jk(dm, hermi=1, exxdiv='ewald')
+ vj, vk = mydf.get_jk(dm, hermi=1, exxdiv='ewald')
+ assert abs(vj.get() - jref).max() < 1e-8
+ assert abs(vk.get() - kref).max() < 1e-8
+ @unittest.skip('pbc-gdf only supports Monkhorst-Pack k-mesh')
def test_jk_complex_dm(self):
scaled_center = [0.3728,0.5524,0.7672]
kpt = cell.make_kpts([1,1,1], scaled_center=scaled_center)[0]
@@ -99,15 +126,16 @@ def test_jk_complex_dm(self):
dm = np.random.random((nao,nao)) + np.random.random((nao,nao)) * 1j
jref, kref = mydf0.get_jk(dm, hermi=0, kpts=kpt, exxdiv='ewald')
vj, vk = mydf.get_jk(dm, hermi=0, kpts=kpt, exxdiv='ewald')
- assert abs(vj.get() - jref).max() < 1e-12
- assert abs(vk.get() - kref).max() < 1e-12
+ assert abs(vj.get() - jref).max() < 1e-8
+ assert abs(vk.get() - kref).max() < 1e-8
dm = dm + dm.conj().T
jref, kref = mydf0.get_jk(dm, hermi=1, kpts=kpt, exxdiv='ewald')
vj, vk = mydf.get_jk(dm, hermi=1, kpts=kpt, exxdiv='ewald')
- assert abs(vj.get() - jref).max() < 1e-12
- assert abs(vk.get() - kref).max() < 1e-12
+ assert abs(vj.get() - jref).max() < 1e-8
+ assert abs(vk.get() - kref).max() < 1e-8
+ @unittest.skip('pbc-gdf only supports Monkhorst-Pack k-mesh')
def test_get_j(self):
kpts = np.random.random((4,3))
nkpts = len(kpts)
@@ -120,8 +148,9 @@ def test_get_j(self):
dm = dm + dm.transpose(0,2,1)
jref = mydf0.get_jk(dm, with_k=False)[0]
vj = mydf.get_jk(dm, with_k=False)[0]
- assert abs(vj.get() - jref).max() < 1e-12
+ assert abs(vj.get() - jref).max() < 1e-8
+ @unittest.skip('pbc-gdf only supports Monkhorst-Pack k-mesh')
def test_get_k(self):
kpts = cell.get_abs_kpts([[-.25,-.25,-.25],
[-.25,-.25, .25],
@@ -140,8 +169,9 @@ def test_get_k(self):
dm = np.random.random((nkpts,nao,nao))
kref = mydf0.get_jk(dm, hermi=0, with_j=False)[1]
vk = mydf.get_jk(dm, hermi=0, with_j=False)[1]
- assert abs(vk.get() - kref).max() < 1e-12
+ assert abs(vk.get() - kref).max() < 1e-8
+ @unittest.skip('pbc-gdf only supports Monkhorst-Pack k-mesh')
def test_get_k1(self):
kpts = cell.get_abs_kpts([[-.25,-.25,-.25],
[-.25,-.25, .25],
@@ -161,11 +191,10 @@ def test_get_k1(self):
dm = dm + dm.transpose(0,2,1)
kref = mydf0.get_jk(dm, hermi=1, with_j=False)[1]
vk = mydf.get_jk(dm, hermi=1, with_j=False)[1]
- assert abs(vk.get() - kref).max() < 1e-12
+ assert abs(vk.get() - kref).max() < 1e-8
- @unittest.skip('build_k from MO coefficients')
def test_get_k2(self):
- kpts = cell.make_kpts([2,1,1])
+ kpts = cell.make_kpts([3,1,1])
nkpts = len(kpts)
mydf0 = df_cpu.GDF(cell, kpts=kpts)
mydf = GDF(cell, kpts=kpts)
@@ -176,14 +205,13 @@ def test_get_k2(self):
mo = (np.random.random((nkpts,nao,nocc)) +
np.random.random((nkpts,nao,nocc))*1j)
mo_occ = np.ones((nkpts,nocc))
- dm = np.random.rand(nkpts, nao, nao)
+ dm = np.einsum('kpi,kqi->kpq', mo, mo.conj())
dm = lib.tag_array(dm, mo_coeff=mo, mo_occ=mo_occ)
kref = mydf0.get_jk(dm, hermi=1, with_j=False)[1]
vk = mydf.get_jk(dm, hermi=1, with_j=False)[1]
- assert abs(vk.get() - kref).max() < 1e-12
+ assert abs(vk.get() - kref).max() < 1e-8
- @unittest.skip('build_k from MO coefficients')
def test_get_k3(self):
kpts = cell.make_kpts([6,1,1])
nkpts = len(kpts)
@@ -197,12 +225,12 @@ def test_get_k3(self):
mo = (np.random.random((nkpts,nao,nocc)) +
np.random.random((nkpts,nao,nocc))*1j)
mo_occ = np.ones((nkpts,nocc))
- dm = np.random.rand(nkpts, nao, nao)
+ dm = np.einsum('kpi,kqi->kpq', mo, mo.conj())
dm = lib.tag_array(dm, mo_coeff=mo, mo_occ=mo_occ)
kref = mydf0.get_jk(dm, hermi=1, with_j=False)[1]
vk = mydf.get_jk(dm, hermi=1, with_j=False)[1]
- assert abs(vk.get() - kref).max() < 1e-12
+ assert abs(vk.get() - kref).max() < 1e-8
if __name__ == '__main__':
print("Full Tests for PBC DF")
diff --git a/gpu4pyscf/pbc/df/tests/test_pbc_ft_ao.py b/gpu4pyscf/pbc/df/tests/test_pbc_ft_ao.py
index 55646945..ee77c401 100644
--- a/gpu4pyscf/pbc/df/tests/test_pbc_ft_ao.py
+++ b/gpu4pyscf/pbc/df/tests/test_pbc_ft_ao.py
@@ -1,4 +1,4 @@
-# Copyright 2021-2024 The PySCF Developers. All Rights Reserved.
+# Copyright 2024-2025 The PySCF Developers. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
diff --git a/gpu4pyscf/pbc/df/tests/test_pbc_int3c2e.py b/gpu4pyscf/pbc/df/tests/test_pbc_int3c2e.py
new file mode 100644
index 00000000..3238806a
--- /dev/null
+++ b/gpu4pyscf/pbc/df/tests/test_pbc_int3c2e.py
@@ -0,0 +1,153 @@
+# Copyright 2024-2025 The PySCF Developers. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+import pyscf
+from pyscf import lib
+from pyscf.pbc.df import rsdf_builder
+from gpu4pyscf.pbc.df.int3c2e import sr_aux_e2
+
+
+def test_int3c2e_gamma_point():
+ cell = pyscf.M(
+ atom='''C1 1.3 .2 .3
+ C2 .19 .1 1.1
+ ''',
+ basis={'C1': [[3, [1.1, 1.]],
+ [4, [2., 1.]]],
+ 'C2': 'ccpvdz'},
+ precision = 1e-8,
+ a=np.diag([2.5, 1.9, 2.2])*3)
+
+ auxcell = cell.copy()
+ auxcell.basis = {
+ 'C1':'''
+C P
+ 102.9917624900 1.0000000000
+C P
+ 28.1325940100 1.0000000000
+C P
+ 9.8364318200 1.0000000000
+C P
+ 3.3490545000 1.0000000000
+C P
+ 1.4947618600 1.0000000000
+C P
+ 0.5769010900 1.0000000000
+C D
+ 0.1995412500 1.0000000000 ''',
+ 'C2':[[0, [.5, 1.]]],
+ }
+ auxcell.build()
+ omega = -0.2
+ dat = sr_aux_e2(cell, auxcell, omega).get()
+
+ cell.precision=1e-10
+ cell.build()
+ df = rsdf_builder._RSGDFBuilder(cell, auxcell).build(omega=abs(omega))
+ int3c = df.gen_int3c_kernel('int3c2e', aosym='s1', return_complex=True)
+ ref = int3c().reshape(dat.shape)
+ assert abs(dat - ref).max() < 1e-8
+
+def test_int3c2e_kpoints():
+ cell = pyscf.M(
+ atom='''H1 1.3 .2 .3
+ H2 .19 .1 1.1
+ ''',
+ basis='ccpvdz',
+ precision = 1e-8,
+ a=np.diag([2.5, 1.9, 2.2])*4)
+ auxcell = cell.copy()
+ auxcell.basis = [[0, [3.5, 1.]],
+ [0, [1.1, 1.]],
+ [1, [0.7, 1.]],
+ [2, [1.5, 1.]]]
+ auxcell.build()
+ kpts = cell.make_kpts([5,1,1])
+ omega = -0.2
+ dat = sr_aux_e2(cell, auxcell, omega, kpts).get()
+
+ cell.precision=1e-10
+ cell.build()
+ df = rsdf_builder._RSGDFBuilder(cell, auxcell, kpts).build(omega=abs(omega))
+ int3c = df.gen_int3c_kernel('int3c2e', aosym='s1', return_complex=True)
+ ref = int3c().reshape(dat.shape)
+ assert abs(dat - ref).max() < 1e-8
+
+def test_minor_diffused_basis():
+ cell = pyscf.M(
+ atom='''H 1.3 .2 .3
+ H .19 .1 1.1
+ ''',
+ basis='''
+C S
+ 7.5 0.40
+ 2.6 0.90
+ 0.5 0.08''',
+ precision = 1e-8,
+ a=np.diag([2.5, 1.9, 2.2])*3)
+ auxcell = cell.copy()
+ auxcell.basis = '''
+C P
+ 1.4947618600 1.0000000000
+C P
+ 0.5769010900 1.0000000000
+C D
+ 0.1995412500 1.0000000000 '''
+ auxcell.build()
+ omega = -0.2
+ dat = sr_aux_e2(cell, auxcell, omega).get()
+
+ cell.precision=1e-12
+ cell.build()
+ df = rsdf_builder._RSGDFBuilder(cell, auxcell).build(omega=abs(omega))
+ int3c = df.gen_int3c_kernel('int3c2e', aosym='s1', return_complex=True)
+ ref = int3c().reshape(dat.shape)
+ assert abs(dat - ref).max() < 1e-8
+
+def test_ignorable_diffused_basis():
+ cell = pyscf.M(
+ atom='''H 1.3 .2 .3
+ H .19 .1 1.1
+ ''',
+ basis='''
+C S
+ 7.5 0.4000000
+ 2.6 0.9000000
+ 0.5 0.0000002''',
+ precision = 1e-8,
+ a=np.diag([2.5, 1.9, 2.2])*3)
+ auxcell = cell.copy()
+ auxcell.basis = '''
+C P
+ 1.4947618600 1.0000000000
+C P
+ 0.5769010900 1.0000000000
+C D
+ 0.1995412500 1.0000000000 '''
+ auxcell.build()
+ omega = -0.2
+ cell.verbose = 6
+ dat = sr_aux_e2(cell, auxcell, omega).get()
+
+ cell.basis='''
+C S
+ 7.5 0.4000000
+ 2.6 0.9000000'''
+ cell.build()
+ df = rsdf_builder._RSGDFBuilder(cell, auxcell).build(omega=abs(omega))
+ int3c = df.gen_int3c_kernel('int3c2e', aosym='s1', return_complex=True)
+ ref = int3c().reshape(dat.shape)
+ assert abs(dat - ref).max() < 1e-6
diff --git a/gpu4pyscf/pbc/df/tests/test_rsdf_builder.py b/gpu4pyscf/pbc/df/tests/test_rsdf_builder.py
new file mode 100644
index 00000000..0d77cfb0
--- /dev/null
+++ b/gpu4pyscf/pbc/df/tests/test_rsdf_builder.py
@@ -0,0 +1,177 @@
+# Copyright 2024-2025 The PySCF Developers. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import tempfile
+import numpy as np
+import pyscf
+from pyscf.pbc.df.rsdf_builder import _RSGDFBuilder
+from pyscf.pbc.df.df import _load3c
+from gpu4pyscf.pbc.df.rsdf_builder import build_cderi
+
+def test_gamma_point():
+ cell = pyscf.M(
+ atom='''C1 1.3 .2 .3
+ C2 .19 .1 1.1
+ ''',
+ basis={'C1': [[0, [1.1, 1.]],
+ [1, [2., 1.]]],
+ 'C2': 'ccpvdz'},
+ a=np.diag([2.5, 1.9, 2.2])*3)
+
+ auxcell = cell.copy()
+ auxcell.basis = {
+ 'C1':'''
+C S
+ 12.9917624900 1.0000000000
+C S
+ 2.1325940100 1.0000000000
+C P
+ 9.8364318200 1.0000000000
+C P
+ 3.3490545000 1.0000000000
+C P
+ 1.4947618600 1.0000000000
+C P
+ 0.5769010900 1.0000000000
+C D
+ 0.1995412500 1.0000000000 ''',
+ 'C2':[[0, [.5, 1.]]],
+ }
+ auxcell.build()
+ omega = 0.3
+ gpu_dat, dat_neg = build_cderi(cell, auxcell, kpts=None, omega=omega)
+
+ cell.precision = 1e-10
+ auxcell.precision = 1e-10
+ kpts = cell.make_kpts([1,1,1])
+ dfbuilder = _RSGDFBuilder(cell, auxcell, kpts)
+ dfbuilder.omega = omega
+ dfbuilder.j2c_eig_always = False
+ dfbuilder.fft_dd_block = True
+ dfbuilder.exclude_d_aux = True
+ naux = auxcell.nao
+ nao = cell.nao
+ with tempfile.NamedTemporaryFile() as tmpf:
+ dfbuilder.make_j3c(tmpf.name, aosym='s1')
+ with _load3c(tmpf.name, 'j3c', kpts[[0,0]]) as cderi:
+ ref = abs(cderi[:].reshape(naux,nao,nao))
+ dat = abs(gpu_dat[0,0].get())
+ assert abs(dat - ref).max() < 1e-8
+
+def test_kpts():
+ cell = pyscf.M(
+ atom='''C1 1.3 .2 .3
+ C2 .19 .1 1.1
+ ''',
+ basis={'C1': [[0, [1.1, 1.]],
+ [1, [2., 1.]]],
+ 'C2': 'ccpvdz'},
+ a=np.diag([2.5, 1.9, 2.2])*3)
+
+ auxcell = cell.copy()
+ auxcell.basis = {
+ 'C1':'''
+C S
+ 12.9917624900 1.0000000000
+C S
+ 2.1325940100 1.0000000000
+C P
+ 9.8364318200 1.0000000000
+C P
+ 3.3490545000 1.0000000000
+C P
+ 1.4947618600 1.0000000000
+C P
+ 0.5769010900 1.0000000000
+C D
+ 0.1995412500 1.0000000000 ''',
+ 'C2':[[0, [.5, 1.]]],
+ }
+ auxcell.build()
+ omega = 0.3
+ kmesh = [6,1,1]
+ kpts = cell.make_kpts(kmesh)
+ gpu_dat, dat_neg = build_cderi(cell, auxcell, kpts, omega=omega)
+
+ cell.precision = 1e-10
+ auxcell.precision = 1e-10
+ dfbuilder = _RSGDFBuilder(cell, auxcell, kpts)
+ dfbuilder.omega = omega
+ dfbuilder.j2c_eig_always = False
+ dfbuilder.fft_dd_block = True
+ dfbuilder.exclude_d_aux = True
+ naux = auxcell.nao
+ nao = cell.nao
+ with tempfile.NamedTemporaryFile() as tmpf:
+ dfbuilder.make_j3c(tmpf.name, aosym='s1')
+ for ki, kj in gpu_dat:
+ with _load3c(tmpf.name, 'j3c', kpts[[ki,kj]]) as cderi:
+ ref = abs(cderi[:].reshape(naux,nao,nao))
+ dat = abs(gpu_dat[ki,kj].get())
+ print(ki,kj)
+ assert abs(dat - ref).max() < 1e-8
+
+def test_kpts_j_only():
+ cell = pyscf.M(
+ atom='''C1 1.3 .2 .3
+ C2 .19 .1 1.1
+ ''',
+ basis={'C1': [[0, [1.1, 1.]],
+ [1, [2., 1.]]],
+ 'C2': 'ccpvdz'},
+ a=np.diag([2.5, 1.9, 2.2])*3)
+
+ auxcell = cell.copy()
+ auxcell.basis = {
+ 'C1':'''
+C S
+ 12.9917624900 1.0000000000
+C S
+ 2.1325940100 1.0000000000
+C P
+ 9.8364318200 1.0000000000
+C P
+ 3.3490545000 1.0000000000
+C P
+ 1.4947618600 1.0000000000
+C P
+ 0.5769010900 1.0000000000
+C D
+ 0.1995412500 1.0000000000 ''',
+ 'C2':[[0, [.5, 1.]]],
+ }
+ auxcell.build()
+ omega = 0.3
+ kmesh = [1,3,4]
+ kpts = cell.make_kpts(kmesh)
+ gpu_dat, dat_neg = build_cderi(cell, auxcell, kpts, omega=omega, j_only=True)
+
+ cell.precision = 1e-10
+ auxcell.precision = 1e-10
+ dfbuilder = _RSGDFBuilder(cell, auxcell, kpts)
+ dfbuilder.j_only = True
+ dfbuilder.omega = omega
+ dfbuilder.j2c_eig_always = False
+ dfbuilder.fft_dd_block = True
+ dfbuilder.exclude_d_aux = True
+ naux = auxcell.nao
+ nao = cell.nao
+ with tempfile.NamedTemporaryFile() as tmpf:
+ dfbuilder.make_j3c(tmpf.name, aosym='s1', j_only=True)
+ for ki, kj in gpu_dat:
+ with _load3c(tmpf.name, 'j3c', kpts[[ki,kj]]) as cderi:
+ ref = abs(cderi[:].reshape(naux,nao,nao))
+ dat = abs(gpu_dat[ki,kj].get())
+ print(ki,kj)
+ assert abs(dat - ref).max() < 1e-8
diff --git a/gpu4pyscf/pbc/dft/gen_grid.py b/gpu4pyscf/pbc/dft/gen_grid.py
index 8cac0d01..66b362d2 100644
--- a/gpu4pyscf/pbc/dft/gen_grid.py
+++ b/gpu4pyscf/pbc/dft/gen_grid.py
@@ -16,10 +16,14 @@
import numpy as np
import cupy as cp
from pyscf import lib
-from pyscf.lib import logger
from pyscf.pbc.dft import gen_grid as gen_grid_cpu
from pyscf.pbc.gto.cell import get_uniform_grids
-from gpu4pyscf.lib import utils
+from gpu4pyscf.dft import Grids
+from gpu4pyscf.lib import utils, logger
+
+__all__ = [
+ 'UniformGrids', 'BeckeGrids', 'AtomicGrids'
+]
class UniformGrids(lib.StreamObject):
'''Uniform Grid class.'''
@@ -66,8 +70,31 @@ def size(self):
kernel = gen_grid_cpu.UniformGrids.kernel
to_gpu = utils.to_gpu
- device = utils.device
to_cpu = utils.to_cpu
-class BeckeGrids:
- pass
+
+class BeckeGrids(Grids):
+ '''Atomic grids for all-electron calculation.'''
+ def __init__(self, cell):
+ self.cell = cell
+ Grids.__init__(self, cell)
+
+ def build(self, cell=None, with_non0tab=False):
+ if cell is None: cell = self.cell
+ coords, weights = gen_grid_cpu.get_becke_grids(
+ self.cell, self.atom_grid, radi_method=self.radi_method,
+ level=self.level, prune=self.prune)
+ self.coords = cp.asarray(coords)
+ self.weights = cp.asarray(weights)
+ if with_non0tab:
+ raise NotImplementedError
+ self.non0tab = None
+ logger.info(self, 'tot grids = %d', len(self.weights))
+ logger.info(self, 'cell vol = %.9g sum(weights) = %.9g',
+ cell.vol, self.weights.sum())
+ return self
+
+ to_gpu = utils.to_gpu
+ to_cpu = utils.to_cpu
+
+AtomicGrids = BeckeGrids
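+
+# A minimal usage sketch (illustrative; assumes a built pyscf.pbc Cell named
+# `cell`):
+#
+#   grids = BeckeGrids(cell)
+#   grids.level = 1        # optional: coarser mesh
+#   grids.build()          # coords/weights are cupy arrays on the GPU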
diff --git a/gpu4pyscf/pbc/dft/krks.py b/gpu4pyscf/pbc/dft/krks.py
index c5fefb7f..c4fa0245 100644
--- a/gpu4pyscf/pbc/dft/krks.py
+++ b/gpu4pyscf/pbc/dft/krks.py
@@ -47,7 +47,7 @@ def get_veff(ks, cell=None, dm=None, dm_last=0, vhf_last=0, hermi=1,
n, exc, vxc = multigrid.nr_rks(ks.with_df, ks.xc, dm, hermi,
kpts, kpts_band,
with_j=True, return_j=False)
- log.info('nelec by numeric integration = %s', n)
+ log.debug('nelec by numeric integration = %s', n)
t0 = log.timer('vxc', *t0)
return vxc
@@ -61,7 +61,7 @@ def get_veff(ks, cell=None, dm=None, dm_last=0, vhf_last=0, hermi=1,
max_memory = ks.max_memory - lib.current_memory()[0]
n, exc, vxc = ni.nr_rks(cell, ks.grids, ks.xc, dm, 0, hermi,
kpts, kpts_band, max_memory=max_memory)
- log.info('nelec by numeric integration = %s', n)
+ log.debug('nelec by numeric integration = %s', n)
if ks.do_nlc():
if ni.libxc.is_nlc(ks.xc):
xc = ks.xc
@@ -72,7 +72,7 @@ def get_veff(ks, cell=None, dm=None, dm_last=0, vhf_last=0, hermi=1,
max_memory=max_memory)
exc += enlc
vxc += vnlc
- log.info('nelec with nlc grids = %s', n)
+ log.debug('nelec with nlc grids = %s', n)
t0 = log.timer('vxc', *t0)
nkpts = len(kpts)
@@ -140,6 +140,14 @@ def energy_elec(mf, dm_kpts=None, h1e_kpts=None, vhf=None):
ecoul.imag)
return tot_e.real, ecoul.real + exc.real
+def get_rho(mf, dm=None, grids=None, kpts=None):
+ if dm is None: dm = mf.make_rdm1()
+ if grids is None: grids = mf.grids
+ if kpts is None: kpts = mf.kpts
+ assert dm.ndim == 3
+ assert kpts.ndim == 2
+ return mf._numint.get_rho(mf.cell, dm, grids, kpts)
+
class KRKS(rks.KohnShamDFT, khf.KRHF):
'''RKS class adapted for PBCs with k-point sampling.
'''
@@ -151,7 +159,7 @@ def __init__(self, cell, kpts=np.zeros((1,3)), xc='LDA,VWN', exxdiv='ewald'):
dump_flags = krks_cpu.KRKS.dump_flags
get_veff = get_veff
energy_elec = energy_elec
- get_rho = return_cupy_array(krks_cpu.get_rho)
+ get_rho = get_rho
nuc_grad_method = NotImplemented
to_hf = NotImplemented
diff --git a/gpu4pyscf/pbc/dft/kuks.py b/gpu4pyscf/pbc/dft/kuks.py
index 363bfefd..fad45cbd 100644
--- a/gpu4pyscf/pbc/dft/kuks.py
+++ b/gpu4pyscf/pbc/dft/kuks.py
@@ -28,7 +28,7 @@
from gpu4pyscf.lib import logger, utils
from gpu4pyscf.lib.cupy_helper import return_cupy_array, tag_array
from gpu4pyscf.pbc.scf import khf, kuhf
-from gpu4pyscf.pbc.dft import rks
+from gpu4pyscf.pbc.dft import rks, krks
def get_veff(ks, cell=None, dm=None, dm_last=0, vhf_last=0, hermi=1,
kpts=None, kpts_band=None):
@@ -47,7 +47,7 @@ def get_veff(ks, cell=None, dm=None, dm_last=0, vhf_last=0, hermi=1,
n, exc, vxc = multigrid.nr_uks(ks.with_df, ks.xc, dm, hermi,
kpts, kpts_band,
with_j=True, return_j=False)
- log.info('nelec by numeric integration = %s', n)
+ log.debug('nelec by numeric integration = %s', n)
t0 = log.timer('vxc', *t0)
return vxc
@@ -71,7 +71,7 @@ def get_veff(ks, cell=None, dm=None, dm_last=0, vhf_last=0, hermi=1,
0, hermi, kpts, max_memory=max_memory)
exc += enlc
vxc += vnlc
- log.info('nelec by numeric integration = %s', n)
+ log.debug('nelec by numeric integration = %s', n)
t0 = log.timer('vxc', *t0)
nkpts = len(kpts)
@@ -150,7 +150,10 @@ def __init__(self, cell, kpts=np.zeros((1,3)), xc='LDA,VWN', exxdiv='ewald'):
get_veff = get_veff
energy_elec = energy_elec
- get_rho = return_cupy_array(kuks_cpu.get_rho)
+
+ def get_rho(self, dm=None, grids=None, kpts=None):
+ if dm is None: dm = self.make_rdm1()
+ return krks.get_rho(self, dm[0]+dm[1], grids, kpts)
nuc_grad_method = NotImplemented
to_hf = NotImplemented
diff --git a/gpu4pyscf/pbc/dft/numint.py b/gpu4pyscf/pbc/dft/numint.py
index ea9e83cd..f064f664 100644
--- a/gpu4pyscf/pbc/dft/numint.py
+++ b/gpu4pyscf/pbc/dft/numint.py
@@ -90,17 +90,17 @@ def eval_rho(cell, ao, dm, non0tab=None, xctype='LDA', hermi=0, with_lapl=False,
pyscf.dft.numint.eval_rho
'''
- if np.iscomplexobj(ao) or np.iscomplexobj(dm):
+ if cp.iscomplexobj(ao) or cp.iscomplexobj(dm):
ngrids, nao = ao.shape[-2:]
ao_loc = cell.ao_loc_nr()
assert nao == ao_loc[-1]
dm = cp.asarray(dm, dtype=np.complex128)
+ ao = cp.asarray(ao, dtype=np.complex128)
if hermi == 1:
def dot_bra(bra, aodm):
- rho = contract('pi,pi->p', bra.real, aodm.real)
- rho += contract('pi,pi->p', bra.imag, aodm.imag)
- return rho
+ rho = contract('pi,pi->p', bra.conj(), aodm).real
+ return cp.asarray(rho, order='C')
dtype = np.float64
else:
def dot_bra(bra, aodm):
@@ -147,6 +147,7 @@ def dot_bra(bra, aodm):
ngrids, nao = ao.shape[-2:]
ao_loc = cell.ao_loc_nr()
assert nao == ao_loc[-1]
+ assert ao.dtype == dm.dtype
def dot_bra(bra, aodm):
return contract('pi,pi->p', bra, aodm)
@@ -378,13 +379,12 @@ def _tau_dot(bra, ket, wv):
return mat
-#TODO: put NumInt and KNumInt into one
class KNumInt(lib.StreamObject, numint.LibXCMixin):
eval_ao = staticmethod(eval_ao_kpts)
make_mask = NotImplemented
- def get_rho(self, cell, dm, grids, kpts=np.zeros((1,3)), max_memory=2000):
+ def get_rho(self, cell, dm, grids, kpts=np.zeros((1,3))):
'''Density in real space
'''
kpts = kpts.reshape(-1, 3)
@@ -445,7 +445,7 @@ def block_loop(self, cell, grids, deriv=0, kpts=None):
for ip0, ip1 in lib.prange(0, ngrids, blksize):
coords = grids_coords[ip0:ip1]
weight = grids_weights[ip0:ip1]
- ao_ks = eval_ao_kpts(cell, coords, kpts, deriv=deriv)
+ ao_ks = self.eval_ao(cell, coords, kpts, deriv=deriv)
yield ao_ks, weight, coords
ao_ks = None
diff --git a/gpu4pyscf/pbc/dft/rks.py b/gpu4pyscf/pbc/dft/rks.py
index c6c93b24..fbc35f51 100644
--- a/gpu4pyscf/pbc/dft/rks.py
+++ b/gpu4pyscf/pbc/dft/rks.py
@@ -73,7 +73,7 @@ def get_veff(ks, cell=None, dm=None, dm_last=0, vhf_last=0, hermi=1,
else:
n, exc, vxc = ni.nr_rks(cell, ks.grids, ks.xc, dm, 0, hermi,
kpt, kpts_band)
- log.info('nelec by numeric integration = %s', n)
+ log.debug('nelec by numeric integration = %s', n)
if ks.do_nlc():
if ni.libxc.is_nlc(ks.xc):
xc = ks.xc
@@ -83,7 +83,7 @@ def get_veff(ks, cell=None, dm=None, dm_last=0, vhf_last=0, hermi=1,
n, enlc, vnlc = ni.nr_nlc_vxc(cell, ks.nlcgrids, xc, dm, 0, hermi, kpt)
exc += enlc
vxc += vnlc
- log.info('nelec with nlc grids = %s', n)
+ log.debug('nelec with nlc grids = %s', n)
t0 = log.timer('vxc', *t0)
if not hybrid:
@@ -122,8 +122,18 @@ def get_veff(ks, cell=None, dm=None, dm_last=0, vhf_last=0, hermi=1,
vxc = tag_array(vxc, ecoul=ecoul, exc=exc, vj=None, vk=None)
return vxc
-def prune_small_rho_grids_(ks, cell, dm, grids, kpts):
- raise NotImplementedError
+NELEC_ERROR_TOL = getattr(__config__, 'pbc_dft_rks_prune_error_tol', 0.02)
+def prune_small_rho_grids_(mf, cell, dm, grids, kpts):
+ rho = mf.get_rho(dm, grids, kpts)
+ n = rho.dot(grids.weights)
+ if abs(n-cell.nelectron) < NELEC_ERROR_TOL*n:
+ rho *= grids.weights
+ size0 = grids.weights.size
+ idx = abs(rho) > mf.small_rho_cutoff / size0
+ grids.coords = grids.coords [idx]
+ grids.weights = grids.weights[idx]
+ logger.debug(mf, 'Drop grids %d', size0 - grids.weights.size)
+ return grids
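+
+# Note: points are dropped only when the integrated electron count already
+# agrees with cell.nelectron to within NELEC_ERROR_TOL, so pruning the
+# low-density grid points cannot mask an under-converged grid.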
class KohnShamDFT(mol_ks.KohnShamDFT):
'''PBC-KS'''
@@ -148,9 +158,21 @@ def __init__(self, xc='LDA,VWN'):
dump_flags = rks_cpu.KohnShamDFT.dump_flags
get_veff = NotImplemented
- get_rho = return_cupy_array(rks_cpu.get_rho)
+ get_rho = NotImplemented
+
+ def density_fit(self, auxbasis=None, with_df=None):
+ from gpu4pyscf.pbc.df.df_jk import density_fit
+ cell = self.cell
+ mf = density_fit(self, auxbasis, with_df)
+ mf.with_df._j_only = not self._numint.libxc.is_hybrid_xc(self.xc)
+ mf.grids = gen_grid.BeckeGrids(cell)
+ mf.grids.level = getattr(
+ __config__, 'dft_rks_RKS_grids_level', mf.grids.level)
+ mf.nlcgrids = gen_grid.BeckeGrids(cell)
+ mf.nlcgrids.level = getattr(
+ __config__, 'dft_rks_RKS_nlcgrids_level', mf.nlcgrids.level)
+ return mf
- density_fit = NotImplemented
rs_density_fit = NotImplemented
jk_method = NotImplemented
@@ -164,7 +186,7 @@ def initialize_grids(self, cell, dm, kpts, ground_state=True):
'''Initialize self.grids the first time call get_veff'''
if self.grids.coords is None:
t0 = (logger.process_clock(), logger.perf_counter())
- self.grids.build(with_non0tab=True)
+ self.grids.build()
if (isinstance(self.grids, gen_grid.BeckeGrids) and
self.small_rho_cutoff > 1e-20 and ground_state):
self.grids = prune_small_rho_grids_(
@@ -173,7 +195,7 @@ def initialize_grids(self, cell, dm, kpts, ground_state=True):
is_nlc = self.do_nlc()
if is_nlc and self.nlcgrids.coords is None:
t0 = (logger.process_clock(), logger.perf_counter())
- self.nlcgrids.build(with_non0tab=True)
+ self.nlcgrids.build()
if (isinstance(self.grids, gen_grid.BeckeGrids) and
self.small_rho_cutoff > 1e-20 and ground_state):
self.nlcgrids = prune_small_rho_grids_(
@@ -185,6 +207,14 @@ def initialize_grids(self, cell, dm, kpts, ground_state=True):
pbchf.KohnShamDFT = KohnShamDFT
+def get_rho(mf, dm=None, grids=None, kpt=None):
+ if dm is None: dm = mf.make_rdm1()
+ if grids is None: grids = mf.grids
+ if kpt is None: kpt = mf.kpt
+ assert dm.ndim == 2
+ assert kpt.ndim == 1
+ return mf._numint.get_rho(mf.cell, dm[None], grids, kpt[None])
+
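+# Usage sketch (illustrative): after SCF the real-space density on the DFT
+# grid can be inspected directly, e.g.
+#
+#   rho = mf.get_rho()                        # cupy array, one value per point
+#   nelec = float(rho.dot(mf.grids.weights))  # approximates cell.nelectron
+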
class RKS(KohnShamDFT, pbchf.RHF):
'''RKS class adapted for PBCs.
@@ -203,6 +233,7 @@ def dump_flags(self, verbose=None):
get_veff = get_veff
energy_elec = mol_ks.energy_elec
+ get_rho = get_rho
to_gpu = utils.to_gpu
device = utils.device
diff --git a/gpu4pyscf/pbc/dft/tests/test_pbc_rks.py b/gpu4pyscf/pbc/dft/tests/test_pbc_rks.py
index cc60be8f..17e1451f 100644
--- a/gpu4pyscf/pbc/dft/tests/test_pbc_rks.py
+++ b/gpu4pyscf/pbc/dft/tests/test_pbc_rks.py
@@ -154,6 +154,31 @@ def test_kpts_rsh_fft(self):
mf_ref = kmf.to_cpu().run()
self.assertAlmostEqual(kmf.e_tot, mf_ref.e_tot, 7)
+ def test_kpts_gga_gdf(self):
+ from gpu4pyscf.pbc.df.df import GDF
+ L = 4.
+ cell = pbcgto.Cell()
+ cell.a = np.eye(3)*L
+ cell.atom =[['H' , ( L/2+0., L/2+0. , L/2+1.)],
+ ['H' , ( L/2+1., L/2+0. , L/2+1.)]]
+ cell.basis = [[0, (4.0, 1.0)], [0, (1.0, 1.0)]]
+ cell.build()
+
+ mf = cell.RKS(xc='pbe0').to_gpu().density_fit().run()
+ self.assertTrue(isinstance(mf.with_df, GDF))
+ self.assertAlmostEqual(mf.e_tot, -0.44834992009430463, 7)
+ mf_ref = mf.to_cpu().run()
+ self.assertAlmostEqual(mf.e_tot, mf_ref.e_tot, 7)
+
+ nk = [2, 1, 1]
+ kpts = cell.make_kpts(nk)
+ kmf = pbcdft.KRKS(cell, xc='pbe0', kpts=kpts).density_fit().run()
+ self.assertTrue(isinstance(kmf.with_df, GDF))
+ self.assertAlmostEqual(kmf.e_tot, -0.44429306, 7)
+ mf_ref = kmf.to_cpu()
+ mf_ref.run()
+ self.assertAlmostEqual(kmf.e_tot, mf_ref.e_tot, 7)
+
if __name__ == '__main__':
print("Full Tests for pbc.dft.rks")
unittest.main()
diff --git a/gpu4pyscf/pbc/dft/tests/test_pbc_uks.py b/gpu4pyscf/pbc/dft/tests/test_pbc_uks.py
index 5848038c..2b73dfb2 100644
--- a/gpu4pyscf/pbc/dft/tests/test_pbc_uks.py
+++ b/gpu4pyscf/pbc/dft/tests/test_pbc_uks.py
@@ -68,6 +68,7 @@ def test_gga_fft(self):
def test_rsh_fft(self):
mf = pbcdft.UKS(cell, xc='camb3lyp').run(conv_tol=1e-9)
+ self.assertAlmostEqual(mf.e_tot, -4.350842690091271, 7)
mf_ref = mf.to_cpu().run()
self.assertAlmostEqual(mf.e_tot, mf_ref.e_tot, 7)
@@ -153,6 +154,32 @@ def test_kpts_rsh_fft(self):
mf_ref = kmf.to_cpu().run()
self.assertAlmostEqual(kmf.e_tot, mf_ref.e_tot, 7)
+ def test_kpts_gga_gdf(self):
+ from gpu4pyscf.pbc.df.df import GDF
+ L = 4.
+ cell = pbcgto.Cell()
+ cell.a = np.eye(3)*L
+ cell.atom =[['H' , ( L/2+0., L/2+0. , L/2+1.)],
+ ['H' , ( L/2+1., L/2+0. , L/2+1.)]]
+ cell.basis = [[0, (4.0, 1.0)], [0, (1.0, 1.0)]]
+ cell.spin = 2
+ cell.build()
+
+ mf = cell.UKS(xc='pbe0').to_gpu().density_fit().run()
+ self.assertTrue(isinstance(mf.with_df, GDF))
+ self.assertAlmostEqual(mf.e_tot, -0.10443638, 7)
+ mf_ref = mf.to_cpu().run()
+ self.assertAlmostEqual(mf.e_tot, mf_ref.e_tot, 7)
+
+ nk = [2, 1, 1]
+ kpts = cell.make_kpts(nk)
+ kmf = pbcdft.KUKS(cell, xc='pbe0', kpts=kpts).density_fit().run()
+ self.assertTrue(isinstance(kmf.with_df, GDF))
+ self.assertAlmostEqual(kmf.e_tot, -0.19581151, 7)
+ mf_ref = kmf.to_cpu()
+ mf_ref.run()
+ self.assertAlmostEqual(kmf.e_tot, mf_ref.e_tot, 7)
+
if __name__ == '__main__':
print("Full Tests for pbc.dft.uks")
unittest.main()
diff --git a/gpu4pyscf/pbc/dft/uks.py b/gpu4pyscf/pbc/dft/uks.py
index 8ce4466e..1cd2f976 100644
--- a/gpu4pyscf/pbc/dft/uks.py
+++ b/gpu4pyscf/pbc/dft/uks.py
@@ -52,7 +52,7 @@ def get_veff(ks, cell=None, dm=None, dm_last=0, vhf_last=0, hermi=1,
n, exc, vxc = multigrid.nr_uks(ks.with_df, ks.xc, dm, hermi,
kpt.reshape(1,3), kpts_band,
with_j=True, return_j=False)
- log.info('nelec by numeric integration = %s', n)
+ log.debug('nelec by numeric integration = %s', n)
t0 = log.timer('vxc', *t0)
return vxc
@@ -79,7 +79,7 @@ def get_veff(ks, cell=None, dm=None, dm_last=0, vhf_last=0, hermi=1,
0, hermi, kpt, max_memory=max_memory)
exc += enlc
vxc += vnlc
- log.info('nelec by numeric integration = %s', n)
+ log.debug('nelec by numeric integration = %s', n)
t0 = log.timer('vxc', *t0)
if not hybrid:
@@ -134,10 +134,13 @@ def __init__(self, cell, kpt=np.zeros(3), xc='LDA,VWN', exxdiv='ewald'):
dump_flags = uks_cpu.UKS.dump_flags
- get_rho = return_cupy_array(uks_cpu.get_rho)
get_veff = get_veff
energy_elec = mol_uks.energy_elec
+ def get_rho(self, dm=None, grids=None, kpt=None):
+ if dm is None: dm = self.make_rdm1()
+ return rks.get_rho(self, dm[0]+dm[1], grids, kpt)
+
nuc_grad_method = NotImplemented
to_hf = NotImplemented
diff --git a/gpu4pyscf/pbc/gto/__init__.py b/gpu4pyscf/pbc/gto/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/gpu4pyscf/pbc/gto/cell.py b/gpu4pyscf/pbc/gto/cell.py
new file mode 100644
index 00000000..14df0ff9
--- /dev/null
+++ b/gpu4pyscf/pbc/gto/cell.py
@@ -0,0 +1,49 @@
+# Copyright 2025 The PySCF Developers. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+
+# This function is only available in pyscf-2.8 or later
+def extract_pgto_params(cell, op='diffused'):
+    '''A helper function to extract the exponents and contraction coefficients
+    used by the estimate_xxx functions.
+    '''
+ es = []
+ cs = []
+ if op == 'diffused':
+ precision = cell.precision
+ for i in range(cell.nbas):
+ e = cell.bas_exp(i)
+ c = abs(cell._libcint_ctr_coeff(i)).max(axis=1)
+ l = cell.bas_angular(i)
+ # A quick estimation for the radius that each primitive GTO vanishes
+ r2 = np.log(c**2 / precision * 10**l) / e
+ idx = r2.argmax()
+ es.append(e[idx])
+ cs.append(c[idx].max())
+ elif op == 'compact':
+ precision = cell.precision
+ for i in range(cell.nbas):
+ e = cell.bas_exp(i)
+ c = abs(cell._libcint_ctr_coeff(i)).max(axis=1)
+ l = cell.bas_angular(i)
+        # A quick estimate of the planewave resolution that each primitive
+        # GTO requires
+ ke = np.log(c**2 / precision * 50**l) * e
+ idx = ke.argmax()
+ es.append(e[idx])
+ cs.append(c[idx].max())
+ else:
+ raise RuntimeError(f'Unsupported operation {op}')
+ return np.array(es), np.array(cs)
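+
+# Usage sketch (illustrative; assumes a built Cell `cell`). For op='diffused'
+# the returned exponents bound the most slowly decaying primitives, giving a
+# rough real-space cutoff (the l=0 case of the estimate above):
+#
+#   es, cs = extract_pgto_params(cell, op='diffused')
+#   rcut = float(np.sqrt((np.log(cs**2 / cell.precision) / es).max()))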
diff --git a/gpu4pyscf/pbc/lib/kpts_helper.py b/gpu4pyscf/pbc/lib/kpts_helper.py
index 9b85184b..6a3d0334 100644
--- a/gpu4pyscf/pbc/lib/kpts_helper.py
+++ b/gpu4pyscf/pbc/lib/kpts_helper.py
@@ -13,6 +13,7 @@
# limitations under the License.
import numpy as np
+from pyscf import lib
def conj_images_in_bvk_cell(kmesh, return_pair=False):
'''
@@ -42,3 +43,40 @@ def conj_images_in_bvk_cell(kmesh, return_pair=False):
mask = Ls_idx <= Ls_idx_conj
return np.column_stack((Ls_idx[mask], Ls_idx_conj[mask]))
+def kk_adapted_iter(kmesh):
+    '''Generates the k-points adapted to kpt_p in (ij|p).
+
+    This function provides similar functionality to
+    pyscf.pbc.lib.kpts_helper.kk_adapted_iter.
+    '''
+ kmesh = np.asarray(kmesh)
+ nkpts = np.prod(kmesh)
+ nx, ny, nz = kmesh
+ kx = np.fft.fftfreq(nx, 1./nx).astype(int)
+ ky = np.fft.fftfreq(ny, 1./ny).astype(int)
+ kz = np.fft.fftfreq(nz, 1./nz).astype(int)
+
+ kxyz = lib.cartesian_prod([kx, ky, kz])
+ dk = (kxyz[None,:,:] - kxyz[:,None,:]).reshape(-1, 3)
+
+ dk %= kmesh
+ wrap_around_mask = dk >= (kmesh+1)//2
+ dk[wrap_around_mask[:,0],0] -= nx
+ dk[wrap_around_mask[:,1],1] -= ny
+ dk[wrap_around_mask[:,2],2] -= nz
+ uniq_ks, uniq_index, uniq_inverse = np.unique(
+ dk, axis=0, return_index=True, return_inverse=True)
+
+ ks_conj = -uniq_ks
+ strides = np.array((ny*nz, nz, 1))
+ ks_idx = (uniq_ks % kmesh).dot(strides)
+ ks_idx_conj = (ks_conj % kmesh).dot(strides)
+
+ independent_idx = np.sort(np.nonzero(ks_idx <= ks_idx_conj)[0])
+ for x in independent_idx:
+ kp = ks_idx[x]
+ kp_conj = ks_idx_conj[x]
+ kpt_ij_idx = np.where(uniq_inverse == x)[0]
+ kpti_idx = kpt_ij_idx // nkpts
+ kptj_idx = kpt_ij_idx % nkpts
+ yield kp, kp_conj, kpti_idx, kptj_idx
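+
+# Usage sketch (illustrative): iterate over the symmetry-reduced auxiliary
+# k-points and the (ki, kj) pairs each one couples:
+#
+#   for kp, kp_conj, kpti_idx, kptj_idx in kk_adapted_iter([2, 2, 2]):
+#       pass  # build (ij|p) once per kp; kp_conj follows by complex conjugation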
diff --git a/gpu4pyscf/pbc/scf/hf.py b/gpu4pyscf/pbc/scf/hf.py
index 740f76a5..3aec403d 100644
--- a/gpu4pyscf/pbc/scf/hf.py
+++ b/gpu4pyscf/pbc/scf/hf.py
@@ -240,11 +240,16 @@ class RHF(SCF):
to_gpu = utils.to_gpu
device = utils.device
+ def density_fit(self, auxbasis=None, with_df=None):
+ from gpu4pyscf.pbc.df.df_jk import density_fit
+ return density_fit(self, auxbasis, with_df)
+
def to_cpu(self):
mf = hf_cpu.RHF(self.cell)
utils.to_cpu(self, out=mf)
return mf
+
def _format_jks(vj, dm, kpts_band):
if kpts_band is None:
vj = vj.reshape(dm.shape)
diff --git a/gpu4pyscf/pbc/scf/khf.py b/gpu4pyscf/pbc/scf/khf.py
index d4c7855e..4ec72d98 100644
--- a/gpu4pyscf/pbc/scf/khf.py
+++ b/gpu4pyscf/pbc/scf/khf.py
@@ -399,6 +399,8 @@ def get_init_guess(self, cell=None, key='minao', s1e=None):
dm_kpts *= (nelectron / ne).reshape(-1,1,1)
return dm_kpts
+ density_fit = pbchf.RHF.density_fit
+
to_gpu = utils.to_gpu
device = utils.device
diff --git a/gpu4pyscf/pbc/scf/kuhf.py b/gpu4pyscf/pbc/scf/kuhf.py
index 7e82d932..d63396c7 100644
--- a/gpu4pyscf/pbc/scf/kuhf.py
+++ b/gpu4pyscf/pbc/scf/kuhf.py
@@ -38,8 +38,9 @@ def make_rdm1(mo_coeff_kpts, mo_occ_kpts, **kwargs):
Returns:
dm_kpts : (2, nkpts, nao, nao) ndarray
'''
- assert isinstance(mo_occ_kpts, cp.ndarray)
- assert isinstance(mo_coeff_kpts, cp.ndarray)
+ mo_occ_kpts = cp.asarray(mo_occ_kpts)
+ mo_coeff_kpts = cp.asarray(mo_coeff_kpts)
+ assert mo_occ_kpts.dtype == np.float64
c = mo_coeff_kpts * mo_occ_kpts[:,:,None,:]
dm = contract('nkpi,nkqi->nkpq', mo_coeff_kpts, c.conj())
return tag_array(dm, mo_coeff=mo_coeff_kpts, mo_occ=mo_occ_kpts)
@@ -312,6 +313,8 @@ def get_bands(self, kpts_band, cell=None, dm_kpts=None, kpts=None):
to_ks = NotImplemented
convert_from_ = NotImplemented
+ density_fit = khf.KRHF.density_fit
+
to_gpu = utils.to_gpu
device = utils.device
diff --git a/gpu4pyscf/pbc/scf/tests/test_pbc_scf_hf.py b/gpu4pyscf/pbc/scf/tests/test_pbc_scf_hf.py
index ca0810c5..71ae0ef1 100644
--- a/gpu4pyscf/pbc/scf/tests/test_pbc_scf_hf.py
+++ b/gpu4pyscf/pbc/scf/tests/test_pbc_scf_hf.py
@@ -132,6 +132,28 @@ def test_krhf_bands(self):
e_ref = kmf_cpu.get_bands(kpts_bands)[0]
self.assertAlmostEqual(abs(e.get()-e_ref).max(), 0, 7)
+ def test_density_fit(self):
+ from gpu4pyscf.pbc.df.df import GDF
+ L = 4.
+ cell = pbcgto.Cell()
+ cell.a = np.eye(3)*L
+ cell.atom =[['H' , ( L/2+0., L/2+0. , L/2+1.)],
+ ['H' , ( L/2+1., L/2+0. , L/2+1.)]]
+ cell.basis = [[0, (4.0, 1.0)], [0, (1.0, 1.0)]]
+ cell.build()
+
+ ref = cell.RHF().density_fit().run()
+ mf = ref.to_gpu().run(conv_tol=1e-8)
+ self.assertTrue(isinstance(mf.with_df, GDF))
+ self.assertAlmostEqual(ref.e_tot, -0.3740002917376214, 8)
+ self.assertAlmostEqual(mf.e_tot, ref.e_tot, 8)
+
+ ref = cell.KRHF().density_fit().run()
+ mf = ref.to_gpu().run(conv_tol=1e-8)
+ self.assertTrue(isinstance(mf.with_df, GDF))
+ self.assertAlmostEqual(ref.e_tot, -0.3740002917376214, 8)
+ self.assertAlmostEqual(mf.e_tot, ref.e_tot, 8)
+
if __name__ == '__main__':
print("Full Tests for pbc.scf.hf")
unittest.main()
diff --git a/gpu4pyscf/pbc/scf/tests/test_pbc_scf_uhf.py b/gpu4pyscf/pbc/scf/tests/test_pbc_scf_uhf.py
index 2f888bdb..b9665f06 100644
--- a/gpu4pyscf/pbc/scf/tests/test_pbc_scf_uhf.py
+++ b/gpu4pyscf/pbc/scf/tests/test_pbc_scf_uhf.py
@@ -90,6 +90,28 @@ def test_small_system(self):
mf = pscf.KUHF(mol,kpts=[[0., 0., 0.]]).run()
self.assertAlmostEqual(mf.e_tot, -2.2719576422665635, 8)
+ def test_density_fit(self):
+ from gpu4pyscf.pbc.df.df import GDF
+ L = 4.
+ cell = pbcgto.Cell()
+ cell.a = np.eye(3)*L
+ cell.atom =[['H' , ( L/2+0., L/2+0. , L/2+1.)],
+ ['H' , ( L/2+1., L/2+0. , L/2+1.)]]
+ cell.basis = [[0, (4.0, 1.0)], [0, (1.0, 1.0)]]
+ cell.spin = 2
+ cell.build()
+
+ ref = cell.UHF().density_fit().run()
+ mf = ref.to_gpu().run(conv_tol=1e-8)
+ self.assertTrue(isinstance(mf.with_df, GDF))
+ self.assertAlmostEqual(ref.e_tot, -0.11995733902879813, 8)
+ self.assertAlmostEqual(mf.e_tot, ref.e_tot, 8)
+
+    ref = cell.KUHF().density_fit().run()
+ mf = ref.to_gpu().run(conv_tol=1e-8)
+ self.assertTrue(isinstance(mf.with_df, GDF))
+ self.assertAlmostEqual(ref.e_tot, -0.11995733902879813, 8)
+ self.assertAlmostEqual(mf.e_tot, ref.e_tot, 8)
if __name__ == '__main__':
print("Tests for PBC UHF and PBC KUHF")
diff --git a/gpu4pyscf/pbc/scf/uhf.py b/gpu4pyscf/pbc/scf/uhf.py
index 5abe6398..65e02ef2 100644
--- a/gpu4pyscf/pbc/scf/uhf.py
+++ b/gpu4pyscf/pbc/scf/uhf.py
@@ -124,6 +124,8 @@ def get_init_guess(self, cell=None, key='minao', s1e=None):
to_ks = NotImplemented
convert_from_ = NotImplemented
+ density_fit = pbchf.RHF.density_fit
+
to_gpu = utils.to_gpu
device = utils.device
diff --git a/gpu4pyscf/pbc/tools/k2gamma.py b/gpu4pyscf/pbc/tools/k2gamma.py
index 5e0041cf..2de30399 100644
--- a/gpu4pyscf/pbc/tools/k2gamma.py
+++ b/gpu4pyscf/pbc/tools/k2gamma.py
@@ -18,20 +18,23 @@
import numpy as np
from pyscf.lib import logger
-# This version of kpts_to_kmesh will be available in PySCF-2.8
-def kpts_to_kmesh(cell, kpts, precision=None, max_images=10000):
- '''Find the minimal k-points mesh to include all input kpts'''
+# This version of kpts_to_kmesh may become available in PySCF-2.9
+def kpts_to_kmesh(cell, kpts, precision=None, rcut=None):
+    '''Search for the minimal BvK mesh or Monkhorst-Pack k-point mesh'''
+ assert kpts.ndim == 2
scaled_kpts = cell.get_scaled_kpts(kpts)
logger.debug3(cell, ' scaled_kpts kpts %s', scaled_kpts)
- # cell.nimgs are the upper limits for kmesh
- kmesh = np.asarray(cell.nimgs) * 2 + 1
+ if rcut is None:
+ kmesh = np.asarray(cell.nimgs) * 2 + 1
+ else:
+ nimgs = cell.get_bounding_sphere(rcut)
+ kmesh = nimgs * 2 + 1
if precision is None:
precision = cell.precision * 1e2
for i in range(3):
floats = scaled_kpts[:,i]
uniq_floats_idx = np.unique(floats.round(6), return_index=True)[1]
uniq_floats = floats[uniq_floats_idx]
- # Limit the number of images to 30 in each direction
fracs = [Fraction(x).limit_denominator(int(kmesh[i])) for x in uniq_floats]
denominators = np.unique([x.denominator for x in fracs])
common_denominator = reduce(np.lcm, denominators)
@@ -43,14 +46,4 @@ def kpts_to_kmesh(cell, kpts, precision=None, max_images=10000):
i, common_denominator, abs(fs - np.rint(fs)).max())
logger.debug3(cell, ' unique kpts %s', uniq_floats)
logger.debug3(cell, ' frac kpts %s', fracs)
-
- assert max_images > 0
- if np.prod(kmesh) > max_images:
- kmesh_raw = kmesh.copy()
- for i in itertools.cycle(np.argsort(kmesh)[::-1]):
- kmesh[i] = int(kmesh[i] * .8)
- if np.prod(kmesh) < max_images:
- break
- logger.warn(cell, 'kmesh (%s) exceeds max_images (%d); reduced to %s',
- kmesh_raw, max_images, kmesh)
return kmesh
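+
+# Usage sketch (illustrative): recover the Monkhorst-Pack mesh from a flat
+# k-point list, e.g.
+#
+#   kpts = cell.make_kpts([2, 2, 2])
+#   assert (kpts_to_kmesh(cell, kpts) == [2, 2, 2]).all()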
diff --git a/gpu4pyscf/pop/esp.py b/gpu4pyscf/pop/esp.py
index 8406ac06..e6d41e5f 100644
--- a/gpu4pyscf/pop/esp.py
+++ b/gpu4pyscf/pop/esp.py
@@ -88,7 +88,7 @@ def vdw_surface(mol, scales=[1.0], density=1.0*radii.BOHR**2, rad=R_VDW):
Generate vdw surface of molecules, in Bohr
'''
coords = mol.atom_coords(unit='B')
- charges = mol.atom_charges()
+ charges = [gto.charge(sym) for sym in mol.elements]
atom_radii = rad[charges]
surface_points = []
@@ -196,7 +196,7 @@ def resp_solve(mol, dm, grid_density=1.0*radii.BOHR**2,
q[u] = q[v] = q[w]
'''
- charges = mol.atom_charges()
+ charges = np.asarray([gto.charge(sym) for sym in mol.elements])
natm = mol.natm
is_restraint = charges > 1
is_restraint[charges == 1] = not hfree
diff --git a/gpu4pyscf/properties/polarizability.py b/gpu4pyscf/properties/polarizability.py
index 8face371..7949b4f5 100644
--- a/gpu4pyscf/properties/polarizability.py
+++ b/gpu4pyscf/properties/polarizability.py
@@ -13,11 +13,10 @@
# limitations under the License.
import numpy as np
-from gpu4pyscf.scf import cphf
import cupy
+from gpu4pyscf.scf import hf, cphf, _response_functions
from gpu4pyscf.lib.cupy_helper import contract
-
def gen_vind(mf, mo_coeff, mo_occ):
"""get the induced potential. This is the same as contract the mo1 with the kernel.
@@ -59,6 +58,7 @@ def eval_polarizability(mf):
Returns:
polarizability (numpy.array): polarizability in au
"""
+ assert isinstance(mf, hf.RHF), "Unrestricted mf object is not supported."
polarizability = np.empty((3, 3))
diff --git a/gpu4pyscf/properties/tests/test_polarizability.py b/gpu4pyscf/properties/tests/test_polarizability.py
index e9aebe48..7c02c718 100644
--- a/gpu4pyscf/properties/tests/test_polarizability.py
+++ b/gpu4pyscf/properties/tests/test_polarizability.py
@@ -17,6 +17,7 @@
import pyscf
from pyscf import lib
from pyscf.dft import rks as rks_cpu
+from pyscf.dft import uks as uks_cpu
from gpu4pyscf.dft import rks, uks
from gpu4pyscf.properties import polarizability
@@ -62,7 +63,7 @@ def run_dft_df_polarizability(xc):
polar = polarizability.eval_polarizability(mf)
return e_dft, polar
-def _vs_cpu(xc):
+def _vs_cpu_rks(xc):
mf = rks.RKS(mol, xc=xc)
mf.grids.level = grids_level
e_gpu = mf.kernel()
@@ -76,6 +77,20 @@ def _vs_cpu(xc):
assert np.abs(e_gpu - e_cpu) < 1e-5
assert np.linalg.norm(polar_cpu - polar_gpu) < 1e-3
+def _vs_cpu_uks(xc):
+ mf = uks.UKS(mol, xc=xc)
+ mf.grids.level = grids_level
+ e_gpu = mf.kernel()
+ polar_gpu = polarizability.eval_polarizability(mf)
+
+ mf_cpu = uks_cpu.UKS(mol, xc=xc)
+ mf_cpu.conv_tol = 1e-12
+ e_cpu = mf_cpu.kernel()
+ polar_cpu = polar.rhf.Polarizability(mf_cpu).polarizability()
+
+ assert np.abs(e_gpu - e_cpu) < 1e-5
+ assert np.linalg.norm(polar_cpu - polar_gpu) < 1e-3
+
class KnownValues(unittest.TestCase):
'''
known values are obtained by Q-Chem
@@ -140,9 +155,16 @@ def test_rks_b3lyp_df(self):
assert np.allclose(polar, qchem_polar)
@unittest.skipIf(polar is None, "Skipping test if pyscf.properties is not installed")
- def test_cpu(self):
- _vs_cpu('b3lyp')
+ def test_cpu_rks(self):
+ _vs_cpu_rks('b3lyp')
+ """
+ # UKS is not supported yet
+ @unittest.skipIf(polar is None, "Skipping test if pyscf.properties is not installed")
+ def test_cpu_uks(self):
+ _vs_cpu_uks('b3lyp')
+ """
+
if __name__ == "__main__":
print("Full Tests for polarizabillity")
unittest.main()
diff --git a/gpu4pyscf/scf/hf.py b/gpu4pyscf/scf/hf.py
index 3a0497ff..09523d4a 100644
--- a/gpu4pyscf/scf/hf.py
+++ b/gpu4pyscf/scf/hf.py
@@ -51,15 +51,13 @@ def _get_jk(mf, mol=None, dm=None, hermi=1, with_j=True, with_k=True,
vj, vk = get_jk(mol, dm, hermi, vhfopt, with_j, with_k, omega)
return vj, vk
-def make_rdm1(mf, mo_coeff=None, mo_occ=None, **kwargs):
- if mo_occ is None: mo_occ = mf.mo_occ
- if mo_coeff is None: mo_coeff = mf.mo_coeff
+def make_rdm1(mo_coeff, mo_occ):
mo_coeff = cupy.asarray(mo_coeff)
mo_occ = cupy.asarray(mo_occ)
is_occ = mo_occ > 0
mocc = mo_coeff[:, is_occ]
dm = cupy.dot(mocc*mo_occ[is_occ], mocc.conj().T)
- occ_coeff = mo_coeff[:, mo_occ>1.0]
+ occ_coeff = mo_coeff[:, is_occ]
return tag_array(dm, occ_coeff=occ_coeff, mo_occ=mo_occ, mo_coeff=mo_coeff)
def get_occ(mf, mo_energy=None, mo_coeff=None):
@@ -422,7 +420,6 @@ def check_sanity(self):
init_guess_by_chkfile = hf_cpu.SCF.init_guess_by_chkfile
from_chk = hf_cpu.SCF.from_chk
get_init_guess = return_cupy_array(hf_cpu.SCF.get_init_guess)
- make_rdm1 = make_rdm1
make_rdm2 = NotImplemented
energy_elec = energy_elec
energy_tot = energy_tot
@@ -461,6 +458,11 @@ def check_sanity(self):
mulliken_pop = NotImplemented
mulliken_meta = NotImplemented
+ def make_rdm1(self, mo_coeff=None, mo_occ=None, **kwargs):
+ if mo_occ is None: mo_occ = self.mo_occ
+ if mo_coeff is None: mo_coeff = self.mo_coeff
+ return make_rdm1(mo_coeff, mo_occ)
+
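+    # Keeping make_rdm1 as a bound method lets it default to self.mo_coeff /
+    # self.mo_occ, while the module-level make_rdm1 above takes the orbitals
+    # explicitly, matching the CPU PySCF signature.
+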
def dip_moment(self, mol=None, dm=None, unit='Debye', origin=None,
verbose=logger.NOTE):
if mol is None: mol = self.mol
diff --git a/gpu4pyscf/scf/j_engine.py b/gpu4pyscf/scf/j_engine.py
index 3d98ae5f..715eef45 100644
--- a/gpu4pyscf/scf/j_engine.py
+++ b/gpu4pyscf/scf/j_engine.py
@@ -26,7 +26,7 @@
from pyscf import __config__
from gpu4pyscf.lib.cupy_helper import load_library, condense, sandwich_dot, transpose_sum
from gpu4pyscf.__config__ import props as gpu_specs
-from gpu4pyscf.__config__ import _num_devices
+from gpu4pyscf.__config__ import num_devices
from gpu4pyscf.lib import logger
from gpu4pyscf.scf import jk
from gpu4pyscf.scf.jk import _make_j_engine_pair_locs, RysIntEnvVars, _scale_sp_ctr_coeff
@@ -53,7 +53,7 @@ def get_j(mol, dm, hermi=1, vhfopt=None, omega=None, verbose=None):
if vhfopt is None:
with mol.with_range_coulomb(omega):
groupsize = None
- if _num_devices > 1:
+ if num_devices > 1:
groupsize = jk.GROUP_SIZE
vhfopt = _VHFOpt(mol).build(group_size=groupsize)
if omega is None:
diff --git a/gpu4pyscf/scf/jk.py b/gpu4pyscf/scf/jk.py
index 0e328204..a0048bf5 100644
--- a/gpu4pyscf/scf/jk.py
+++ b/gpu4pyscf/scf/jk.py
@@ -26,12 +26,12 @@
from pyscf.gto import ANG_OF, ATOM_OF, NPRIM_OF, NCTR_OF, PTR_COORD, PTR_COEFF
from pyscf import lib
from pyscf.scf import _vhf
-from pyscf import __config__
from gpu4pyscf.lib.cupy_helper import (load_library, condense, sandwich_dot, transpose_sum,
reduce_to_device)
+from gpu4pyscf.__config__ import _streams, num_devices, shm_size
from gpu4pyscf.__config__ import props as gpu_specs
-from gpu4pyscf.__config__ import _streams, _num_devices
from gpu4pyscf.lib import logger
+from gpu4pyscf.lib import multi_gpu
from gpu4pyscf.gto.mole import group_basis
__all__ = [
@@ -54,34 +54,68 @@
UNROLL_NFMAX = ctypes.c_int.in_dll(libvhf_rys, 'rys_jk_unrolled_max_nf').value
UNROLL_J_LMAX = ctypes.c_int.in_dll(libvhf_rys, 'rys_j_unrolled_lmax').value
UNROLL_J_MAX_ORDER = ctypes.c_int.in_dll(libvhf_rys, 'rys_j_unrolled_max_order').value
+SHM_SIZE = shm_size - 1024
+del shm_size
GOUT_WIDTH = 42
-SHM_SIZE = getattr(__config__, 'GPU_SHM_SIZE',
- int(gpu_specs['sharedMemPerBlockOptin']//9)*8)
THREADS = 256
GROUP_SIZE = 256
-def _jk_task(mol, dms, vhfopt, task_list, hermi=0,
- device_id=0, with_j=True, with_k=True, verbose=None):
- n_dm = dms.shape[0]
- nao, _ = vhfopt.coeff.shape
+def get_jk(mol, dm, hermi=0, vhfopt=None, with_j=True, with_k=True, verbose=None):
+ '''Compute J, K matrices
+ '''
+ assert with_j or with_k
+ log = logger.new_logger(mol, verbose)
+ cput0 = log.init_timer()
+
+ if vhfopt is None:
+ vhfopt = _VHFOpt(mol).build()
+
+ mol = vhfopt.sorted_mol
+ nao, nao_orig = vhfopt.coeff.shape
+
+ dm = cp.asarray(dm, order='C')
+ dms = dm.reshape(-1,nao_orig,nao_orig)
+ #:dms = cp.einsum('pi,nij,qj->npq', vhfopt.coeff, dms, vhfopt.coeff)
+ dms = sandwich_dot(dms, vhfopt.coeff.T)
+ dms = cp.asarray(dms, order='C')
+
+ ao_loc = mol.ao_loc
+ nao = ao_loc[-1]
uniq_l_ctr = vhfopt.uniq_l_ctr
uniq_l = uniq_l_ctr[:,0]
l_ctr_bas_loc = vhfopt.l_ctr_offsets
l_symb = [lib.param.ANGULAR[i] for i in uniq_l]
- kern = libvhf_rys.RYS_build_jk
+ n_groups = np.count_nonzero(uniq_l <= LMAX)
- timing_counter = Counter()
- kern_counts = 0
- with cp.cuda.Device(device_id), _streams[device_id]:
+ dm_cond = condense('absmax', dms, ao_loc)
+ if hermi == 0:
+ # Wrap the triu contribution to tril
+ dm_cond = dm_cond + dm_cond.T
+ dm_cond = cp.log(dm_cond + 1e-300).astype(np.float32)
+ log_max_dm = float(dm_cond.max())
+ log_cutoff = math.log(vhfopt.direct_scf_tol)
+
+ tasks = [(i,j,k,l)
+ for i in range(n_groups)
+ for j in range(i+1)
+ for k in range(i+1)
+ for l in range(k+1)]
+ schemes = {t: quartets_scheme(mol, uniq_l_ctr[list(t)]) for t in tasks}
+
+ def proc(dms, dm_cond):
+ device_id = cp.cuda.device.get_device_id()
+ stream = cp.cuda.stream.get_current_stream()
log = logger.new_logger(mol, verbose)
- cput0 = log.init_timer()
- dms = cp.asarray(dms)
+ t0 = log.init_timer()
+ dms = cp.asarray(dms) # transfer to current device
+ dm_cond = cp.asarray(dm_cond)
if hermi == 0:
# Contract the tril and triu parts separately
dms = cp.vstack([dms, dms.transpose(0,2,1)])
n_dm = dms.shape[0]
- tile_q_ptr = ctypes.cast(vhfopt.tile_q_cond.data.ptr, ctypes.c_void_p)
+ tile_q_cond = vhfopt.tile_q_cond
+ tile_q_ptr = ctypes.cast(tile_q_cond.data.ptr, ctypes.c_void_p)
q_ptr = ctypes.cast(vhfopt.q_cond.data.ptr, ctypes.c_void_p)
s_ptr = lib.c_null_ptr()
if mol.omega < 0:
@@ -97,31 +131,34 @@ def _jk_task(mol, dms, vhfopt, task_list, hermi=0,
vj = cp.zeros(dms.shape)
vj_ptr = ctypes.cast(vj.data.ptr, ctypes.c_void_p)
- ao_loc = mol.ao_loc
- dm_cond = cp.log(condense('absmax', dms, ao_loc) + 1e-300).astype(np.float32)
- log_max_dm = dm_cond.max()
- log_cutoff = math.log(vhfopt.direct_scf_tol)
- tile_mappings = _make_tril_tile_mappings(l_ctr_bas_loc, vhfopt.tile_q_cond,
+ tile_mappings = _make_tril_tile_mappings(l_ctr_bas_loc, tile_q_cond,
log_cutoff-log_max_dm)
workers = gpu_specs['multiProcessorCount']
pool = cp.empty((workers, QUEUE_DEPTH*4), dtype=np.uint16)
info = cp.empty(2, dtype=np.uint32)
- t1 = log.timer_debug1(f'q_cond and dm_cond on Device {device_id}', *cput0)
+ t1 = log.timer_debug1(f'q_cond and dm_cond on Device {device_id}', *t0)
- for i, j, k, l in task_list:
- ij_shls = (l_ctr_bas_loc[i], l_ctr_bas_loc[i+1],
- l_ctr_bas_loc[j], l_ctr_bas_loc[j+1])
+ init_constant(mol)
+ timing_counter = Counter()
+ kern_counts = 0
+ kern = libvhf_rys.RYS_build_jk
+
+ while tasks:
+ try:
+ task = tasks.pop()
+ except IndexError:
+ break
+
+ i, j, k, l = task
+ shls_slice = l_ctr_bas_loc[[i, i+1, j, j+1, k, k+1, l, l+1]]
tile_ij_mapping = tile_mappings[i,j]
- llll = f'({l_symb[i]}{l_symb[j]}|{l_symb[k]}{l_symb[l]})'
- kl_shls = (l_ctr_bas_loc[k], l_ctr_bas_loc[k+1],
- l_ctr_bas_loc[l], l_ctr_bas_loc[l+1])
tile_kl_mapping = tile_mappings[k,l]
- scheme = quartets_scheme(mol, uniq_l_ctr[[i, j, k, l]])
+ scheme = schemes[task]
err = kern(
vj_ptr, vk_ptr, ctypes.cast(dms.data.ptr, ctypes.c_void_p),
ctypes.c_int(n_dm), ctypes.c_int(nao),
vhfopt.rys_envs, (ctypes.c_int*2)(*scheme),
- (ctypes.c_int*8)(*ij_shls, *kl_shls),
+ (ctypes.c_int*8)(*shls_slice),
ctypes.c_int(tile_ij_mapping.size),
ctypes.c_int(tile_kl_mapping.size),
ctypes.cast(tile_ij_mapping.data.ptr, ctypes.c_void_p),
@@ -135,12 +172,17 @@ def _jk_task(mol, dms, vhfopt, task_list, hermi=0,
mol._atm.ctypes, ctypes.c_int(mol.natm),
mol._bas.ctypes, ctypes.c_int(mol.nbas), mol._env.ctypes)
if err != 0:
+ llll = f'({l_symb[i]}{l_symb[j]}|{l_symb[k]}{l_symb[l]})'
raise RuntimeError(f'RYS_build_jk kernel for {llll} failed')
if log.verbose >= logger.DEBUG1:
+ llll = f'({l_symb[i]}{l_symb[j]}|{l_symb[k]}{l_symb[l]})'
msg = f'processing {llll}, tasks = {info[1].get()} on Device {device_id}'
t1, t1p = log.timer_debug1(msg, *t1), t1
timing_counter[llll] += t1[1] - t1p[1]
kern_counts += 1
+ if num_devices > 1:
+ stream.synchronize()
+
if with_j:
if hermi == 1:
vj *= 2.
@@ -153,67 +195,16 @@ def _jk_task(mol, dms, vhfopt, task_list, hermi=0,
else:
vk, vkT = vk[:n_dm//2], vk[n_dm//2:]
vk += vkT.transpose(0,2,1)
- return vj, vk, kern_counts, timing_counter
-
-def get_jk(mol, dm, hermi=0, vhfopt=None, with_j=True, with_k=True, verbose=None):
- '''Compute J, K matrices
- '''
- log = logger.new_logger(mol, verbose)
- cput0 = log.init_timer()
-
- if vhfopt is None:
- vhfopt = _VHFOpt(mol).build()
-
- mol = vhfopt.sorted_mol
- nao, nao_orig = vhfopt.coeff.shape
-
- dm = cp.asarray(dm, order='C')
- dms = dm.reshape(-1,nao_orig,nao_orig)
- #:dms = cp.einsum('pi,nij,qj->npq', vhfopt.coeff, dms, vhfopt.coeff)
- dms = sandwich_dot(dms, vhfopt.coeff.T)
- dms = cp.asarray(dms, order='C')
-
- n_dm = dms.shape[0]
-
- assert with_j or with_k
-
- init_constant(mol)
-
- uniq_l_ctr = vhfopt.uniq_l_ctr
- uniq_l = uniq_l_ctr[:,0]
- l_symb = [lib.param.ANGULAR[i] for i in uniq_l]
- n_groups = np.count_nonzero(uniq_l <= LMAX)
-
- tasks = []
- for i in range(n_groups):
- for j in range(i+1):
- for k in range(i+1):
- for l in range(k+1):
- tasks.append((i,j,k,l))
- tasks = np.array(tasks)
- task_list = []
- for device_id in range(_num_devices):
- task_list.append(tasks[device_id::_num_devices])
-
- cp.cuda.get_current_stream().synchronize()
- futures = []
- with ThreadPoolExecutor(max_workers=_num_devices) as executor:
- for device_id in range(_num_devices):
- future = executor.submit(
- _jk_task,
- mol, dms, vhfopt, task_list[device_id], hermi=hermi,
- with_j=with_j, with_k=with_k, verbose=verbose,
- device_id=device_id)
- futures.append(future)
+ return vj, vk, kern_counts, timing_counter
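+
+    # multi_gpu.run executes proc once per visible device, each under its own
+    # CUDA device and stream; the workers pop from the shared `tasks` list, so
+    # quartet batches are balanced dynamically across GPUs.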
+ results = multi_gpu.run(proc, args=(dms, dm_cond), non_blocking=True)
kern_counts = 0
timing_collection = Counter()
vj_dist = []
vk_dist = []
- for future in futures:
- vj, vk, counts, counter = future.result()
+ for vj, vk, counts, t_counter in results:
kern_counts += counts
- timing_collection += counter
+ timing_collection += t_counter
vj_dist.append(vj)
vk_dist.append(vk)
@@ -222,17 +213,14 @@ def get_jk(mol, dm, hermi=0, vhfopt=None, with_j=True, with_k=True, verbose=None
for llll, t in timing_collection.items():
log.debug1('%s wall time %.2f', llll, t)
- for s in _streams:
- s.synchronize()
- cp.cuda.get_current_stream().synchronize()
vj = vk = None
if with_k:
- vk = reduce_to_device(vk_dist, inplace=True)
+ vk = multi_gpu.array_reduce(vk_dist, inplace=True)
#:vk = cp.einsum('pi,npq,qj->nij', vhfopt.coeff, vk, vhfopt.coeff)
vk = sandwich_dot(vk, vhfopt.coeff)
-
+
if with_j:
- vj = reduce_to_device(vj_dist, inplace=True)
+ vj = multi_gpu.array_reduce(vj_dist, inplace=True)
vj = transpose_sum(vj)
#:vj = cp.einsum('pi,npq,qj->nij', vhfopt.coeff, vj, vhfopt.coeff)
vj = sandwich_dot(vj, vhfopt.coeff)
@@ -251,10 +239,7 @@ def get_jk(mol, dm, hermi=0, vhfopt=None, with_j=True, with_k=True, verbose=None
else:
scripts.append('jk->s1il')
shls_excludes = [0, h_shls[0]] * 4
- if hermi == 1:
- dms = dms.get()
- else:
- dms = dms[:n_dm//2].get()
+ dms = dms.get()
vs_h = _vhf.direct_mapdm('int2e_cart', 's8', scripts,
dms, 1, mol._atm, mol._bas, mol._env,
shls_excludes=shls_excludes)
@@ -310,121 +295,148 @@ def get_j(mol, dm, hermi=0, vhfopt=None, verbose=None):
ao_loc = mol.ao_loc
dm_cond = cp.log(condense('absmax', dms, ao_loc) + 1e-300).astype(np.float32)
- log_max_dm = dm_cond.max()
+ log_max_dm = float(dm_cond.max())
log_cutoff = math.log(vhfopt.direct_scf_tol)
+ uniq_l_ctr = vhfopt.uniq_l_ctr
+ uniq_l = uniq_l_ctr[:,0]
+ l_ctr_bas_loc = vhfopt.l_ctr_offsets
+ l_symb = [lib.param.ANGULAR[i] for i in uniq_l]
+ n_groups = np.count_nonzero(uniq_l <= LMAX)
+ ntiles = mol.nbas // TILE
+
dms = dms.get()
pair_loc = _make_j_engine_pair_locs(mol)
dm_xyz = np.empty(pair_loc[-1])
libvhf_rys.transform_cart_to_xyz(
dm_xyz.ctypes, dms.ctypes, ao_loc.ctypes, pair_loc.ctypes,
mol._bas.ctypes, ctypes.c_int(mol.nbas), mol._env.ctypes)
- dm_xyz = cp.asarray(dm_xyz)
- vj_xyz = cp.zeros_like(dm_xyz)
-
- pair_loc_on_gpu = cp.asarray(pair_loc)
- rys_envs = RysIntEnvVars(
- mol.natm, mol.nbas,
- vhfopt.rys_envs.atm, vhfopt.rys_envs.bas, vhfopt.rys_envs.env,
- pair_loc_on_gpu.data.ptr,
- )
- err = libvhf_rys.RYS_init_rysj_constant(ctypes.c_int(SHM_SIZE))
- if err != 0:
- raise RuntimeError('CUDA kernel initialization')
+ tasks = [(i,j,k,l)
+ for i in range(n_groups)
+ for j in range(i+1)
+ for k in range(i+1)
+ for l in range(k+1)]
+ schemes = {t: _j_engine_quartets_scheme(mol, uniq_l_ctr[list(t)]) for t in tasks}
- uniq_l_ctr = vhfopt.uniq_l_ctr
- uniq_l = uniq_l_ctr[:,0]
- l_ctr_bas_loc = vhfopt.l_ctr_offsets
- l_symb = [lib.param.ANGULAR[i] for i in uniq_l]
- n_groups = np.count_nonzero(uniq_l <= LMAX)
- ntiles = mol.nbas // TILE
- tile_mappings = {}
- workers = gpu_specs['multiProcessorCount']
- pool = cp.empty((workers, QUEUE_DEPTH*4), dtype=np.uint16)
- info = cp.empty(2, dtype=np.uint32)
+ def proc(dm_xyz, dm_cond):
+ device_id = cp.cuda.device.get_device_id()
+ stream = cp.cuda.stream.get_current_stream()
+ log = logger.new_logger(mol, verbose)
+ t0 = log.init_timer()
+ dm_xyz = cp.asarray(dm_xyz) # transfer to current device
+ dm_cond = cp.asarray(dm_cond)
+ vj_xyz = cp.zeros_like(dm_xyz)
+ pair_loc_on_gpu = cp.asarray(pair_loc)
+ _atm, _bas, _env, _ = vhfopt.rys_envs._env_ref_holder
+ rys_envs = RysIntEnvVars(
+ mol.natm, mol.nbas,
+ _atm.data.ptr, _bas.data.ptr, _env.data.ptr,
+ pair_loc_on_gpu.data.ptr,
+ )
+ tile_q_cond = vhfopt.tile_q_cond
+ q_cond = vhfopt.q_cond
+
+ err = libvhf_rys.RYS_init_rysj_constant(ctypes.c_int(SHM_SIZE))
+ if err != 0:
+ raise RuntimeError('CUDA kernel initialization')
+
+ tile_mappings = {}
+ workers = gpu_specs['multiProcessorCount']
+ pool = cp.empty((workers, QUEUE_DEPTH*4), dtype=np.uint16)
+ info = cp.empty(2, dtype=np.uint32)
- for i in range(n_groups):
- for j in range(i+1):
- ish0, ish1 = l_ctr_bas_loc[i], l_ctr_bas_loc[i+1]
- jsh0, jsh1 = l_ctr_bas_loc[j], l_ctr_bas_loc[j+1]
- ij_shls = (ish0, ish1, jsh0, jsh1)
- i0 = ish0 // TILE
- i1 = ish1 // TILE
- j0 = jsh0 // TILE
- j1 = jsh1 // TILE
- sub_tile_q = vhfopt.tile_q_cond[i0:i1,j0:j1]
- mask = sub_tile_q > log_cutoff - log_max_dm
- if i == j:
- mask = cp.tril(mask)
- t_ij = (cp.arange(i0, i1, dtype=np.int32)[:,None] * ntiles +
- cp.arange(j0, j1, dtype=np.int32))
- idx = cp.argsort(sub_tile_q[mask])[::-1]
- tile_mappings[i,j] = t_ij[mask][idx]
- t1 = t2 = log.timer_debug1('q_cond and dm_cond', *cput0)
+ for i in range(n_groups):
+ for j in range(i+1):
+ ish0, ish1 = l_ctr_bas_loc[i], l_ctr_bas_loc[i+1]
+ jsh0, jsh1 = l_ctr_bas_loc[j], l_ctr_bas_loc[j+1]
+ i0 = ish0 // TILE
+ i1 = ish1 // TILE
+ j0 = jsh0 // TILE
+ j1 = jsh1 // TILE
+ sub_tile_q = tile_q_cond[i0:i1,j0:j1]
+ mask = sub_tile_q > log_cutoff - log_max_dm
+ if i == j:
+ mask = cp.tril(mask)
+ t_ij = (cp.arange(i0, i1, dtype=np.int32)[:,None] * ntiles +
+ cp.arange(j0, j1, dtype=np.int32))
+ idx = cp.argsort(sub_tile_q[mask])[::-1]
+ tile_mappings[i,j] = t_ij[mask][idx]
+ t1 = log.timer_debug1(f'q_cond and dm_cond on Device {device_id}', *t0)
+
+ timing_collection = {}
+ kern_counts = 0
+ kern = libvhf_rys.RYS_build_j
+
+ while tasks:
+ try:
+ task = tasks.pop()
+ except IndexError:
+ break
+
+ i, j, k, l = task
+ shls_slice = l_ctr_bas_loc[[i, i+1, j, j+1, k, k+1, l, l+1]]
+ tile_ij_mapping = tile_mappings[i,j]
+ tile_kl_mapping = tile_mappings[k,l]
+ scheme = schemes[task]
+ err = kern(
+ ctypes.cast(vj_xyz.data.ptr, ctypes.c_void_p),
+ ctypes.cast(dm_xyz.data.ptr, ctypes.c_void_p),
+ ctypes.c_int(n_dm), ctypes.c_int(nao),
+ rys_envs, (ctypes.c_int*3)(*scheme),
+ (ctypes.c_int*8)(*shls_slice),
+ ctypes.c_int(tile_ij_mapping.size),
+ ctypes.c_int(tile_kl_mapping.size),
+ ctypes.cast(tile_ij_mapping.data.ptr, ctypes.c_void_p),
+ ctypes.cast(tile_kl_mapping.data.ptr, ctypes.c_void_p),
+ ctypes.cast(tile_q_cond.data.ptr, ctypes.c_void_p),
+ ctypes.cast(q_cond.data.ptr, ctypes.c_void_p),
+ lib.c_null_ptr(),
+ ctypes.cast(dm_cond.data.ptr, ctypes.c_void_p),
+ ctypes.c_float(log_cutoff),
+ ctypes.cast(pool.data.ptr, ctypes.c_void_p),
+ ctypes.cast(info.data.ptr, ctypes.c_void_p),
+ ctypes.c_int(workers),
+ mol._atm.ctypes, ctypes.c_int(mol.natm),
+ mol._bas.ctypes, ctypes.c_int(mol.nbas), mol._env.ctypes)
+ if err != 0:
+ llll = f'({l_symb[i]}{l_symb[j]}|{l_symb[k]}{l_symb[l]})'
+                raise RuntimeError(f'RYS_build_j kernel for {llll} failed')
+ if log.verbose >= logger.DEBUG1:
+ llll = f'({l_symb[i]}{l_symb[j]}|{l_symb[k]}{l_symb[l]})'
+ t1, t1p = log.timer_debug1(f'processing {llll}, tasks = {info[1]}', *t1), t1
+ if llll not in timing_collection:
+ timing_collection[llll] = 0
+ timing_collection[llll] += t1[1] - t1p[1]
+ kern_counts += 1
+ if num_devices > 1:
+ stream.synchronize()
+ return vj_xyz, kern_counts, timing_collection
- timing_collection = {}
+ results = multi_gpu.run(proc, args=(dm_xyz, dm_cond), non_blocking=True)
kern_counts = 0
- kern = libvhf_rys.RYS_build_j
-
- for i in range(n_groups):
- for j in range(i+1):
- ij_shls = (l_ctr_bas_loc[i], l_ctr_bas_loc[i+1],
- l_ctr_bas_loc[j], l_ctr_bas_loc[j+1])
- tile_ij_mapping = tile_mappings[i,j]
- for k in range(i+1):
- for l in range(k+1):
- llll = f'({l_symb[i]}{l_symb[j]}|{l_symb[k]}{l_symb[l]})'
- kl_shls = (l_ctr_bas_loc[k], l_ctr_bas_loc[k+1],
- l_ctr_bas_loc[l], l_ctr_bas_loc[l+1])
- tile_kl_mapping = tile_mappings[k,l]
- scheme = _j_engine_quartets_scheme(mol, uniq_l_ctr[[i, j, k, l]])
- err = kern(
- ctypes.cast(vj_xyz.data.ptr, ctypes.c_void_p),
- ctypes.cast(dm_xyz.data.ptr, ctypes.c_void_p),
- ctypes.c_int(n_dm), ctypes.c_int(nao),
- rys_envs, (ctypes.c_int*3)(*scheme),
- (ctypes.c_int*8)(*ij_shls, *kl_shls),
- ctypes.c_int(tile_ij_mapping.size),
- ctypes.c_int(tile_kl_mapping.size),
- ctypes.cast(tile_ij_mapping.data.ptr, ctypes.c_void_p),
- ctypes.cast(tile_kl_mapping.data.ptr, ctypes.c_void_p),
- ctypes.cast(vhfopt.tile_q_cond.data.ptr, ctypes.c_void_p),
- ctypes.cast(vhfopt.q_cond.data.ptr, ctypes.c_void_p),
- lib.c_null_ptr(),
- ctypes.cast(dm_cond.data.ptr, ctypes.c_void_p),
- ctypes.c_float(log_cutoff),
- ctypes.cast(pool.data.ptr, ctypes.c_void_p),
- ctypes.cast(info.data.ptr, ctypes.c_void_p),
- ctypes.c_int(workers),
- mol._atm.ctypes, ctypes.c_int(mol.natm),
- mol._bas.ctypes, ctypes.c_int(mol.nbas), mol._env.ctypes)
- if err != 0:
- raise RuntimeError(f'RYS_build_jk kernel for {llll} failed')
- if log.verbose >= logger.DEBUG1:
- t1, t1p = log.timer_debug1(f'processing {llll}, tasks = {info[1]}', *t1), t1
- if llll not in timing_collection:
- timing_collection[llll] = 0
- timing_collection[llll] += t1[1] - t1p[1]
- kern_counts += 1
+ timing_collection = Counter()
+ vj_dist = []
+ for vj, counts, t_counter in results:
+ kern_counts += counts
+ timing_collection += t_counter
+ vj_dist.append(vj)
if log.verbose >= logger.DEBUG1:
log.debug1('kernel launches %d', kern_counts)
for llll, t in timing_collection.items():
log.debug1('%s wall time %.2f', llll, t)
- cp.cuda.Stream.null.synchronize()
- log.timer_debug1('cuda kernel', *t2)
+ vj_xyz = multi_gpu.array_reduce(vj_dist, inplace=True)
vj_xyz = vj_xyz.get()
vj = np.empty_like(dms)
libvhf_rys.transform_xyz_to_cart(
vj.ctypes, vj_xyz.ctypes, ao_loc.ctypes, pair_loc.ctypes,
mol._bas.ctypes, ctypes.c_int(mol.nbas), mol._env.ctypes)
#:vj = cp.einsum('pi,npq,qj->nij', vhfopt.coeff, cp.asarray(vj), vhfopt.coeff)
- vj = sandwich_dot(vj, vhfopt.coeff)
+ vj = sandwich_dot(cp.asarray(vj), vhfopt.coeff)
vj = transpose_sum(vj)
vj *= 2.
- vj = vj.reshape(dm.shape)
h_shls = vhfopt.h_shls
if h_shls:
@@ -433,7 +445,7 @@ def get_j(mol, dm, hermi=0, vhfopt=None, verbose=None):
scripts = ['ji->s2kl']
shls_excludes = [0, h_shls[0]] * 4
vs_h = _vhf.direct_mapdm('int2e_cart', 's8', scripts,
- dms.get(), 1, mol._atm, mol._bas, mol._env,
+ dms, 1, mol._atm, mol._bas, mol._env,
shls_excludes=shls_excludes)
vj1 = vs_h[0].reshape(n_dm,nao,nao)
coeff = vhfopt.coeff
@@ -443,6 +455,7 @@ def get_j(mol, dm, hermi=0, vhfopt=None, verbose=None):
vj[i] += coeff.T.dot(cp.asarray(v)).dot(coeff)
log.timer_debug1('get_j pass 2 for h functions on cpu', *cput1)
+ vj = vj.reshape(dm.shape)
log.timer('vj', *cput0)
return vj
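+
+# Usage sketch (illustrative): the J-engine path can be driven standalone for
+# Coulomb-only builds, e.g.
+#
+#   vj = get_j(mol, dm, hermi=1)   # returns vj with the same shape as dm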
@@ -457,7 +470,6 @@ def __init__(self, mol, cutoff=1e-13):
# Hold cache on GPU devices
self._rys_envs = {}
- self._mol_gpu = {}
self._q_cond = {}
self._tile_q_cond = {}
self._s_estimator = {}
@@ -550,11 +562,11 @@ def rys_envs(self):
_bas = cp.array(mol._bas)
_env = cp.array(_scale_sp_ctr_coeff(mol))
ao_loc = cp.array(mol.ao_loc)
- self._mol_gpu[device_id] = (_atm, _bas, _env, ao_loc)
- self._rys_envs[device_id] = RysIntEnvVars(
+ self._rys_envs[device_id] = rys_envs = RysIntEnvVars(
mol.natm, mol.nbas,
_atm.data.ptr, _bas.data.ptr, _env.data.ptr,
ao_loc.data.ptr)
+ rys_envs._env_ref_holder = (_atm, _bas, _env, ao_loc)
return self._rys_envs[device_id]
class RysIntEnvVars(ctypes.Structure):
@@ -600,13 +612,12 @@ def g_pair_idx(ij_inc=None):
def init_constant(mol):
g_idx, offsets = g_pair_idx()
- for device_id in range(_num_devices):
- with cp.cuda.Device(device_id), _streams[device_id]:
- err = libvhf_rys.RYS_init_constant(
- g_idx.ctypes, offsets.ctypes, mol._env.ctypes,
- ctypes.c_int(mol._env.size), ctypes.c_int(SHM_SIZE))
- if err != 0:
- raise RuntimeError(f'CUDA kernel initialization on device {device_id}')
+ err = libvhf_rys.RYS_init_constant(
+ g_idx.ctypes, offsets.ctypes, mol._env.ctypes,
+ ctypes.c_int(mol._env.size), ctypes.c_int(SHM_SIZE))
+ if err != 0:
+ device_id = cp.cuda.device.get_device_id()
+ raise RuntimeError(f'CUDA kernel initialization on device {device_id}')
def _make_tril_tile_mappings(l_ctr_bas_loc, tile_q_cond, cutoff, tile=TILE):
n_groups = len(l_ctr_bas_loc) - 1
diff --git a/gpu4pyscf/scf/tests/test_scf_jk.py b/gpu4pyscf/scf/tests/test_scf_jk.py
index 78ae68eb..e311482f 100644
--- a/gpu4pyscf/scf/tests/test_scf_jk.py
+++ b/gpu4pyscf/scf/tests/test_scf_jk.py
@@ -125,3 +125,32 @@ def test_jk_hermi0():
assert abs(vj2+vj3 - vj1).max() < 1e-9
assert abs(vk2+vk3 - vk1).max() < 1e-9
+
+def test_jk_hermi0_l5():
+ mol = pyscf.M(
+ atom = '''
+ O 0.000 -0. 0.1174
+ H -0.757 4. -0.4696
+ H 0.757 4. -0.4696
+ C 1. 1. 0.
+ H 4. 0. 3.
+ H 0. 1. .6
+ ''',
+ basis={'default': 'def2-tzvp', 'O': [[5, [1., 1.]]]},
+ unit='B',)
+
+ np.random.seed(9)
+ nao = mol.nao
+ dm = np.random.rand(nao, nao)
+ vj, vk = jk.get_jk(mol, dm, hermi=0)
+ vj = vj.get()
+ vk = vk.get()
+ ref = get_jk(mol, dm, hermi=0)
+ assert abs(vj - ref[0]).max() < 1e-9
+ assert abs(vk - ref[1]).max() < 1e-9
+ assert abs(lib.fp(vj) - -61.28856847097108) < 1e-9
+ assert abs(lib.fp(vk) - -76.38373664249241) < 1e-9
+
+ vj = jk.get_j(mol, dm, hermi=0).get()
+ assert abs(vj - ref[0]).max() < 1e-9
+ assert abs(lib.fp(vj) - -61.28856847097108) < 1e-9
diff --git a/gpu4pyscf/scf/uhf.py b/gpu4pyscf/scf/uhf.py
index 12a01d57..1107cbd2 100644
--- a/gpu4pyscf/scf/uhf.py
+++ b/gpu4pyscf/scf/uhf.py
@@ -38,10 +38,6 @@ def make_rdm1(mo_coeff, mo_occ, **kwargs):
mo_b = mo_coeff[1]
dm_a = cupy.dot(mo_a*mo_occ[0], mo_a.conj().T)
dm_b = cupy.dot(mo_b*mo_occ[1], mo_b.conj().T)
-# DO NOT make tag_array for DM here because the DM arrays may be modified and
-# passed to functions like get_jk, get_vxc. These functions may take the tags
-# (mo_coeff, mo_occ) to compute the potential if tags were found in the DM
-# arrays and modifications to DM arrays may be ignored.
return tag_array((dm_a, dm_b), mo_coeff=mo_coeff, mo_occ=mo_occ)
diff --git a/gpu4pyscf/solvent/grad/pcm.py b/gpu4pyscf/solvent/grad/pcm.py
index 3fe7cb6c..28711f77 100644
--- a/gpu4pyscf/solvent/grad/pcm.py
+++ b/gpu4pyscf/solvent/grad/pcm.py
@@ -40,13 +40,6 @@ def grad_switch_h(x):
dy[x>1] = 0.0
return dy
-def gradgrad_switch_h(x):
- ''' 2nd derivative of h(x) '''
- ddy = 60.0*x - 180.0*x**2 + 120*x**3
- ddy[x<0] = 0.0
- ddy[x>1] = 0.0
- return ddy
-
def get_dF_dA(surface):
'''
J. Chem. Phys. 133, 244111 (2010), Appendix C
@@ -63,10 +56,9 @@ def get_dF_dA(surface):
dF = cupy.zeros([ngrids, natom, 3])
dA = cupy.zeros([ngrids, natom, 3])
- for ia in range(atom_coords.shape[0]):
+ for ia in range(natom):
p0,p1 = surface['gslice_by_atom'][ia]
coords = grid_coords[p0:p1]
- p1 = p0 + coords.shape[0]
ri_rJ = cupy.expand_dims(coords, axis=1) - atom_coords
riJ = cupy.linalg.norm(ri_rJ, axis=-1)
diJ = (riJ - R_in_J) / R_sw_J
@@ -145,9 +137,7 @@ def get_dD_dS(surface, with_S=True, with_D=False, stream=None):
'''
charge_exp = surface['charge_exp']
grid_coords = surface['grid_coords']
- switch_fun = surface['switch_fun']
norm_vec = surface['norm_vec']
- R_vdw = surface['R_vdw']
n = charge_exp.shape[0]
dS = cupy.empty([3,n,n])
dD = None
@@ -163,9 +153,7 @@ def get_dD_dS(surface, with_S=True, with_D=False, stream=None):
dD_ptr, dS_ptr,
ctypes.cast(grid_coords.data.ptr, ctypes.c_void_p),
ctypes.cast(norm_vec.data.ptr, ctypes.c_void_p),
- ctypes.cast(R_vdw.data.ptr, ctypes.c_void_p),
ctypes.cast(charge_exp.data.ptr, ctypes.c_void_p),
- ctypes.cast(switch_fun.data.ptr, ctypes.c_void_p),
ctypes.c_int(n)
)
if err != 0:
@@ -181,7 +169,7 @@ def get_dSii(surface, dF):
dSii = dSii_dF[:,None] * dF
return dSii
-def grad_nuc(pcmobj, dm):
+def grad_nuc(pcmobj, dm, q_sym = None):
mol = pcmobj.mol
log = logger.new_logger(mol, mol.verbose)
t1 = log.init_timer()
@@ -194,7 +182,8 @@ def grad_nuc(pcmobj, dm):
pcmobj._get_vind(dm)
mol = pcmobj.mol
- q_sym = pcmobj._intermediates['q_sym'].get()
+ if q_sym is None:
+ q_sym = pcmobj._intermediates['q_sym'].get()
gridslice = pcmobj.surface['gslice_by_atom']
grid_coords = pcmobj.surface['grid_coords'].get()
exponents = pcmobj.surface['charge_exp'].get()
@@ -220,7 +209,7 @@ def grad_nuc(pcmobj, dm):
t1 = log.timer_debug1('grad nuc', *t1)
return de
-def grad_qv(pcmobj, dm):
+def grad_qv(pcmobj, dm, q_sym = None):
'''
contributions due to integrals
'''
@@ -237,7 +226,8 @@ def grad_qv(pcmobj, dm):
gridslice = pcmobj.surface['gslice_by_atom']
charge_exp = pcmobj.surface['charge_exp']
grid_coords = pcmobj.surface['grid_coords']
- q_sym = pcmobj._intermediates['q_sym']
+ if q_sym is None:
+ q_sym = pcmobj._intermediates['q_sym']
intopt = int3c1e.VHFOpt(mol)
intopt.build(1e-14, aosym=False)
@@ -282,12 +272,23 @@ def grad_solver(pcmobj, dm):
vK_1 = cupy.linalg.solve(K.T, v_grids)
epsilon = pcmobj.eps
+ def contract_bra(a, B, c):
+ ''' i,xij,j->jx '''
+ tmp = a.dot(B)
+ return (tmp*c).T
+
+ def contract_ket(a, B, c):
+ ''' i,xij,j->ix '''
+ tmp = B.dot(c)
+ return (a*tmp).T
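+
+    # Both helpers are einsum shortcuts evaluated without forming the 3-index
+    # intermediate: contract_bra(a, B, c) == cupy.einsum('i,xij,j->jx', a, B, c)
+    # and contract_ket(a, B, c) == cupy.einsum('i,xij,j->ix', a, B, c).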
+
de = cupy.zeros([pcmobj.mol.natm,3])
if pcmobj.method.upper() in ['C-PCM', 'CPCM', 'COSMO']:
dD, dS = get_dD_dS(pcmobj.surface, with_D=False, with_S=True)
# dR = 0, dK = dS
- de_dS = (vK_1 * dS.dot(q)).T # cupy.einsum('i,xij,j->ix', vK_1, dS, q)
+ de_dS = 0.5 * contract_ket(vK_1, dS, q)
+ de_dS -= 0.5 * contract_bra(vK_1, dS, q)
de -= cupy.asarray([cupy.sum(de_dS[p0:p1], axis=0) for p0,p1 in gridslice])
dD = dS = None
@@ -295,24 +296,13 @@ def grad_solver(pcmobj, dm):
dSii = get_dSii(pcmobj.surface, dF)
de -= 0.5*contract('i,xij->jx', vK_1*q, dSii) # 0.5*cupy.einsum('i,xij,i->jx', vK_1, dSii, q)
- elif pcmobj.method.upper() in ['IEF-PCM', 'IEFPCM', 'SS(V)PE', 'SMD']:
+ elif pcmobj.method.upper() in ['IEF-PCM', 'IEFPCM', 'SMD']:
dF, dA = get_dF_dA(pcmobj.surface)
dSii = get_dSii(pcmobj.surface, dF)
dF = None
dD, dS = get_dD_dS(pcmobj.surface, with_D=True, with_S=True)
- def contract_bra(a, B, c):
- ''' i,xij,j->jx '''
- tmp = a.dot(B)
- return (tmp*c).T
-
- def contract_ket(a, B, c):
- ''' i,xij,j->ix '''
- tmp = B.dot(c)
- return (a*tmp).T
-
- # IEF-PCM and SS(V)PE formally are the same in gradient calculation
# dR = f_eps/(2*pi) * (dD*A + D*dA),
# dK = dS - f_eps/(2*pi) * (dD*A*S + D*dA*S + D*A*dS)
f_epsilon = (epsilon - 1.0)/(epsilon + 1.0)
@@ -352,6 +342,67 @@ def contract_ket(a, B, c):
de_dK = de_dS0 - fac * (de_dD + de_dA + de_dS1)
de += de_dR - de_dK
+ elif pcmobj.method.upper() in [ 'SS(V)PE' ]:
+ dF, dA = get_dF_dA(pcmobj.surface)
+ dSii = get_dSii(pcmobj.surface, dF)
+ dF = None
+
+ dD, dS = get_dD_dS(pcmobj.surface, with_D=True, with_S=True)
+
+ # dR = f_eps/(2*pi) * (dD*A + D*dA),
+ # dK = dS - f_eps/(2*pi) * (dD*A*S + D*dA*S + D*A*dS)
+ f_epsilon = (epsilon - 1.0)/(epsilon + 1.0)
+ fac = f_epsilon/(2.0*PI)
+
+ Av = A*v_grids
+ de_dR = 0.5*fac * contract_ket(vK_1, dD, Av)
+ de_dR -= 0.5*fac * contract_bra(vK_1, dD, Av)
+ de_dR = cupy.asarray([cupy.sum(de_dR[p0:p1], axis=0) for p0,p1 in gridslice])
+
+ vK_1_D = vK_1.dot(D)
+ vK_1_Dv = vK_1_D * v_grids
+ de_dR += 0.5*fac * contract('j,xjn->nx', vK_1_Dv, dA)
+
+ de_dS0 = 0.5*contract_ket(vK_1, dS, q)
+ de_dS0 -= 0.5*contract_bra(vK_1, dS, q)
+ de_dS0 = cupy.asarray([cupy.sum(de_dS0[p0:p1], axis=0) for p0,p1 in gridslice])
+
+ vK_1_q = vK_1 * q
+ de_dS0 += 0.5*contract('i,xin->nx', vK_1_q, dSii)
+
+ vK_1_DA = vK_1_D*A
+ de_dS1 = 0.5*contract_ket(vK_1_DA, dS, q)
+ de_dS1 -= 0.5*contract_bra(vK_1_DA, dS, q)
+ de_dS1 = cupy.asarray([cupy.sum(de_dS1[p0:p1], axis=0) for p0,p1 in gridslice])
+ vK_1_DAq = vK_1_DA*q
+ de_dS1 += 0.5*contract('j,xjn->nx', vK_1_DAq, dSii)
+
+ DT_q = cupy.dot(D.T, q)
+ ADT_q = A * DT_q
+ de_dS1_T = 0.5*contract_ket(vK_1, dS, ADT_q)
+ de_dS1_T -= 0.5*contract_bra(vK_1, dS, ADT_q)
+ de_dS1_T = cupy.asarray([cupy.sum(de_dS1_T[p0:p1], axis=0) for p0,p1 in gridslice])
+ vK_1_ADT_q = vK_1 * ADT_q
+ de_dS1_T += 0.5*contract('j,xjn->nx', vK_1_ADT_q, dSii)
+
+ Sq = cupy.dot(S,q)
+ ASq = A*Sq
+ de_dD = 0.5*contract_ket(vK_1, dD, ASq)
+ de_dD -= 0.5*contract_bra(vK_1, dD, ASq)
+ de_dD = cupy.asarray([cupy.sum(de_dD[p0:p1], axis=0) for p0,p1 in gridslice])
+
+ vK_1_S = cupy.dot(vK_1, S)
+ vK_1_SA = vK_1_S * A
+ de_dD_T = 0.5*contract_ket(vK_1_SA, -dD.transpose(0,2,1), q)
+ de_dD_T -= 0.5*contract_bra(vK_1_SA, -dD.transpose(0,2,1), q)
+ de_dD_T = cupy.asarray([cupy.sum(de_dD_T[p0:p1], axis=0) for p0,p1 in gridslice])
+
+ de_dA = 0.5*contract('j,xjn->nx', vK_1_D*Sq, dA) # 0.5*cupy.einsum('j,xjn,j->nx', vK_1_D, dA, Sq)
+
+ de_dA_T = 0.5*contract('j,xjn->nx', vK_1_S*DT_q, dA)
+
+ de_dK = de_dS0 - 0.5 * fac * (de_dD + de_dA + de_dS1 + de_dD_T + de_dA_T + de_dS1_T)
+ de += de_dR - de_dK
else:
raise RuntimeError(f"Unknown implicit solvent model: {pcmobj.method}")
t1 = log.timer_debug1('grad solver', *t1)
diff --git a/gpu4pyscf/solvent/hessian/pcm.py b/gpu4pyscf/solvent/hessian/pcm.py
index 538cb859..11c3e1df 100644
--- a/gpu4pyscf/solvent/hessian/pcm.py
+++ b/gpu4pyscf/solvent/hessian/pcm.py
@@ -19,141 +19,685 @@
import numpy
import cupy
+import ctypes
from pyscf import lib, gto
from gpu4pyscf import scf
-from gpu4pyscf.solvent.pcm import PI
-from gpu4pyscf.solvent.grad.pcm import grad_qv, grad_solver, grad_nuc, get_dD_dS, get_dF_dA, get_dSii
+from gpu4pyscf.solvent.pcm import PI, switch_h, libsolvent
+from gpu4pyscf.solvent.grad.pcm import grad_qv, grad_solver, grad_nuc, get_dD_dS, get_dF_dA, get_dSii, grad_switch_h
from gpu4pyscf.df import int3c2e
from gpu4pyscf.lib import logger
from gpu4pyscf.hessian.jk import _ao2mo
from gpu4pyscf.gto.int3c1e_ip import int1e_grids_ip1, int1e_grids_ip2
+from gpu4pyscf.gto.int3c1e_ipip import int1e_grids_ipip1, int1e_grids_ipvip1, int1e_grids_ipip2, int1e_grids_ip1ip2
from gpu4pyscf.gto import int3c1e
from gpu4pyscf.gto.int3c1e import int1e_grids
+from pyscf import lib as pyscf_lib
-def hess_nuc(pcmobj):
- raise NotImplementedError("Not tested")
+def gradgrad_switch_h(x):
+ ''' 2nd derivative of h(x) '''
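+ # Here h is the quintic switching function h(x) = 10x^3 - 15x^4 + 6x^5 on [0, 1]
+ # (cf. switch_h), so h''(x) = 60x - 180x^2 + 120x^3, zero outside [0, 1].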
+ ddy = 60.0*x - 180.0*x**2 + 120.0*x**3
+ ddy[x<0] = 0.0
+ ddy[x>1] = 0.0
+ return ddy
+
+def get_d2F_d2A(surface):
+ '''
+ Notations adopted from
+ J. Chem. Phys. 133, 244111 (2010), Appendix C
+ '''
+ atom_coords = surface['atom_coords']
+ grid_coords = surface['grid_coords']
+ switch_fun = surface['switch_fun']
+ area = surface['area']
+ R_in_J = surface['R_in_J']
+ R_sw_J = surface['R_sw_J']
+
+ ngrids = grid_coords.shape[0]
+ natom = atom_coords.shape[0]
+ d2F = cupy.zeros([ngrids, natom, natom, 3, 3])
+ d2A = cupy.zeros([ngrids, natom, natom, 3, 3])
+
+ for i_grid_atom in range(natom):
+ p0,p1 = surface['gslice_by_atom'][i_grid_atom]
+ coords = grid_coords[p0:p1]
+ si_rJ = cupy.expand_dims(coords, axis=1) - atom_coords
+ norm_si_rJ = cupy.linalg.norm(si_rJ, axis=-1)
+ diJ = (norm_si_rJ - R_in_J) / R_sw_J
+ diJ[:,i_grid_atom] = 1.0
+ diJ[diJ < 1e-8] = 0.0
+ si_rJ[:,i_grid_atom,:] = 0.0
+ si_rJ[diJ < 1e-8] = 0.0
+
+ fiJ = switch_h(diJ)
+ dfiJ = grad_switch_h(diJ)
+
+ fiJK = fiJ[:, :, cupy.newaxis] * fiJ[:, cupy.newaxis, :]
+ dfiJK = dfiJ[:, :, cupy.newaxis] * dfiJ[:, cupy.newaxis, :]
+ R_sw_JK = R_sw_J[:, cupy.newaxis] * R_sw_J[cupy.newaxis, :]
+ norm_si_rJK = norm_si_rJ[:, :, cupy.newaxis] * norm_si_rJ[:, cupy.newaxis, :]
+ terms_size_ngrids_natm_natm = dfiJK / (fiJK * norm_si_rJK * R_sw_JK)
+ si_rJK = si_rJ[:, :, cupy.newaxis, :, cupy.newaxis] * si_rJ[:, cupy.newaxis, :, cupy.newaxis, :]
+ d2fiJK_offdiagonal = terms_size_ngrids_natm_natm[:, :, :, cupy.newaxis, cupy.newaxis] * si_rJK
+
+ d2fiJ = gradgrad_switch_h(diJ)
+ terms_size_ngrids_natm = d2fiJ / (norm_si_rJ**2 * R_sw_J) - dfiJ / (norm_si_rJ**3)
+ si_rJJ = si_rJ[:, :, :, cupy.newaxis] * si_rJ[:, :, cupy.newaxis, :]
+ d2fiJK_diagonal = cupy.einsum('qA,qAdD->qAdD', terms_size_ngrids_natm, si_rJJ)
+ d2fiJK_diagonal += cupy.einsum('qA,dD->qAdD', dfiJ / norm_si_rJ, cupy.eye(3))
+ d2fiJK_diagonal /= (fiJ * R_sw_J)[:, :, cupy.newaxis, cupy.newaxis]
+
+ d2fiJK = d2fiJK_offdiagonal
+ for i_atom in range(natom):
+ d2fiJK[:, i_atom, i_atom, :, :] = d2fiJK_diagonal[:, i_atom, :, :]
+
+ Fi = switch_fun[p0:p1]
+ Ai = area[p0:p1]
+
+ d2F[p0:p1, :, :, :, :] += cupy.einsum('q,qABdD->qABdD', Fi, d2fiJK)
+ d2A[p0:p1, :, :, :, :] += cupy.einsum('q,qABdD->qABdD', Ai, d2fiJK)
+
+ d2fiJK_grid_atom_offdiagonal = -cupy.einsum('qABdD->qAdD', d2fiJK)
+ d2F[p0:p1, i_grid_atom, :, :, :] = cupy.einsum('q,qAdD->qAdD', Fi, d2fiJK_grid_atom_offdiagonal.transpose(0,1,3,2))
+ d2F[p0:p1, :, i_grid_atom, :, :] = cupy.einsum('q,qAdD->qAdD', Fi, d2fiJK_grid_atom_offdiagonal)
+ d2A[p0:p1, i_grid_atom, :, :, :] = cupy.einsum('q,qAdD->qAdD', Ai, d2fiJK_grid_atom_offdiagonal.transpose(0,1,3,2))
+ d2A[p0:p1, :, i_grid_atom, :, :] = cupy.einsum('q,qAdD->qAdD', Ai, d2fiJK_grid_atom_offdiagonal)
+
+ d2fiJK_grid_atom_diagonal = -cupy.einsum('qAdD->qdD', d2fiJK_grid_atom_offdiagonal)
+ d2F[p0:p1, i_grid_atom, i_grid_atom, :, :] = cupy.einsum('q,qdD->qdD', Fi, d2fiJK_grid_atom_diagonal)
+ d2A[p0:p1, i_grid_atom, i_grid_atom, :, :] = cupy.einsum('q,qdD->qdD', Ai, d2fiJK_grid_atom_diagonal)
+
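+ # reorder to (natm, natm, 3, 3, ngrids), the layout get_d2Sii expects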
+ d2F = d2F.transpose(1,2,3,4,0)
+ d2A = d2A.transpose(1,2,3,4,0)
+ return d2F, d2A
+
+def get_d2Sii(surface, dF, d2F, stream=None):
+ ''' Second derivative of S matrix (diagonal only)
+ '''
+ charge_exp = surface['charge_exp']
+ switch_fun = surface['switch_fun']
+ ngrids = switch_fun.shape[0]
+ dF = dF.transpose(2,0,1)
+ natm = dF.shape[0]
+ assert dF.shape == (natm, 3, ngrids)
+
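+ # With S_ii = (2/pi)^0.5 * zeta_i / F_i, differentiating twice gives the dense
+ # reference expression below; the CUDA kernel pcm_d2f_to_d2sii evaluates the same thing: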
+ # dF_dF = dF[:, cupy.newaxis, :, cupy.newaxis, :] * dF[cupy.newaxis, :, cupy.newaxis, :, :]
+ # dF_dF_over_F3 = dF_dF * (1.0/(switch_fun**3))
+ # d2F_over_F2 = d2F * (1.0/(switch_fun**2))
+ # d2Sii = 2 * dF_dF_over_F3 - d2F_over_F2
+ # d2Sii = (2.0/PI)**0.5 * (d2Sii * charge_exp)
+
+ dF = dF.flatten() # make the array contiguous so its memory layout matches the logical shape
+ d2F = d2F.flatten() # make the array contiguous so its memory layout matches the logical shape
+ d2Sii = cupy.empty((natm, natm, 3, 3, ngrids), dtype=cupy.float64)
+ if stream is None:
+ stream = cupy.cuda.get_current_stream()
+ err = libsolvent.pcm_d2f_to_d2sii(
+ ctypes.cast(stream.ptr, ctypes.c_void_p),
+ ctypes.cast(switch_fun.data.ptr, ctypes.c_void_p),
+ ctypes.cast(dF.data.ptr, ctypes.c_void_p),
+ ctypes.cast(d2F.data.ptr, ctypes.c_void_p),
+ ctypes.cast(charge_exp.data.ptr, ctypes.c_void_p),
+ ctypes.cast(d2Sii.data.ptr, ctypes.c_void_p),
+ ctypes.c_int(natm),
+ ctypes.c_int(ngrids),
+ )
+ if err != 0:
+ raise RuntimeError('Failed in converting PCM d2F to d2Sii.')
+ return d2Sii
+
+def get_d2D_d2S(surface, with_S=True, with_D=False, stream=None):
+ ''' Second derivatives of the D and S matrices (off-diagonal terms only)
+ '''
+ charge_exp = surface['charge_exp']
+ grid_coords = surface['grid_coords']
+ norm_vec = surface['norm_vec']
+ n = charge_exp.shape[0]
+ d2S = cupy.empty([3,3,n,n])
+ d2D = None
+ d2S_ptr = ctypes.cast(d2S.data.ptr, ctypes.c_void_p)
+ d2D_ptr = pyscf_lib.c_null_ptr()
+ if with_D:
+ d2D = cupy.empty([3,3,n,n])
+ d2D_ptr = ctypes.cast(d2D.data.ptr, ctypes.c_void_p)
+ if stream is None:
+ stream = cupy.cuda.get_current_stream()
+ err = libsolvent.pcm_d2d_d2s(
+ ctypes.cast(stream.ptr, ctypes.c_void_p),
+ d2D_ptr, d2S_ptr,
+ ctypes.cast(grid_coords.data.ptr, ctypes.c_void_p),
+ ctypes.cast(norm_vec.data.ptr, ctypes.c_void_p),
+ ctypes.cast(charge_exp.data.ptr, ctypes.c_void_p),
+ ctypes.c_int(n)
+ )
+ if err != 0:
+ raise RuntimeError('Failed in generating PCM d2D and d2S matrices.')
+ return d2D, d2S
+
+def analytical_hess_nuc(pcmobj, dm, verbose=None):
if not pcmobj._intermediates:
pcmobj.build()
+ dm_cache = pcmobj._intermediates.get('dm', None)
+ if dm_cache is None or cupy.linalg.norm(dm_cache - dm) >= 1e-10:
+ pcmobj._get_vind(dm)
mol = pcmobj.mol
+ log = logger.new_logger(pcmobj, verbose)
+ t1 = log.init_timer()
+
q_sym = pcmobj._intermediates['q_sym'].get()
gridslice = pcmobj.surface['gslice_by_atom']
grid_coords = pcmobj.surface['grid_coords'].get()
exponents = pcmobj.surface['charge_exp'].get()
+ ngrids = q_sym.shape[0]
+
atom_coords = mol.atom_coords(unit='B')
atom_charges = numpy.asarray(mol.atom_charges(), dtype=numpy.float64)
fakemol_nuc = gto.fakemol_for_charges(atom_coords)
fakemol = gto.fakemol_for_charges(grid_coords, expnt=exponents**2)
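+ # E_nuc-solvent = sum_A sum_g Z_A q_g I(R_A, r_g). Its Hessian splits into the
+ # second derivative of the interaction integrals with q held fixed (d2e_from_d2I)
+ # and the response of the surface charges, dI/dx contracted with dq/dx
+ # (d2e_from_dIdq); the two parts are combined at the end.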
- # nuclei potential response
+ d2e_from_d2I = numpy.zeros([mol.natm, mol.natm, 3, 3])
+
int2c2e_ip1ip2 = mol._add_suffix('int2c2e_ip1ip2')
- v_ng_ip1ip2 = gto.mole.intor_cross(int2c2e_ip1ip2, fakemol_nuc, fakemol).reshape([3,3,mol.natm,-1])
- dv_g = numpy.einsum('n,xyng->ngxy', atom_charges, v_ng_ip1ip2)
- dv_g = numpy.einsum('ngxy,g->ngxy', dv_g, q_sym)
+ d2I_dAdC = gto.mole.intor_cross(int2c2e_ip1ip2, fakemol_nuc, fakemol)
+ d2I_dAdC = d2I_dAdC.reshape(3, 3, mol.natm, ngrids)
+ for i_atom in range(mol.natm):
+ g0,g1 = gridslice[i_atom]
+ d2e_from_d2I[:, i_atom, :, :] += numpy.einsum('A,dDAq,q->AdD', atom_charges, d2I_dAdC[:, :, :, g0:g1], q_sym[g0:g1])
+ d2e_from_d2I[i_atom, :, :, :] += numpy.einsum('A,dDAq,q->AdD', atom_charges, d2I_dAdC[:, :, :, g0:g1], q_sym[g0:g1])
- de = numpy.zeros([mol.natm, mol.natm, 3, 3])
- for ia in range(mol.natm):
- p0, p1 = gridslice[ia]
- de_tmp = numpy.sum(dv_g[:,p0:p1], axis=1)
- de[:,ia] -= de_tmp
- #de[ia,:] -= de_tmp.transpose([0,2,1])
+ int2c2e_ipip1 = mol._add_suffix('int2c2e_ipip1')
+ # # Some explanations here:
+ # # Why can we use ip1ip2 here? Because of the translational invariance:
+ # # $\frac{\partial^2 I_{AC}}{\partial A^2} + \frac{\partial^2 I_{AC}}{\partial A \partial C} = 0$
+ # # Why not use ipip1? Because each nucleus, a point charge, is handled as a Gaussian charge with exponent = 1e16.
+ # # This causes severe numerical problems in int2c2e_ipip1 and makes the main diagonal of the Hessian garbage.
+ # d2I_dA2 = gto.mole.intor_cross(int2c2e_ipip1, fakemol_nuc, fakemol)
+ d2I_dA2 = -gto.mole.intor_cross(int2c2e_ip1ip2, fakemol_nuc, fakemol)
+ d2I_dA2 = d2I_dA2 @ q_sym
+ d2I_dA2 = d2I_dA2.reshape(3, 3, mol.natm)
+ for i_atom in range(mol.natm):
+ d2e_from_d2I[i_atom, i_atom, :, :] += atom_charges[i_atom] * d2I_dA2[:, :, i_atom]
+
+ d2I_dC2 = gto.mole.intor_cross(int2c2e_ipip1, fakemol, fakemol_nuc)
+ d2I_dC2 = d2I_dC2 @ atom_charges
+ d2I_dC2 = d2I_dC2.reshape(3, 3, ngrids)
+ for i_atom in range(mol.natm):
+ g0,g1 = gridslice[i_atom]
+ d2e_from_d2I[i_atom, i_atom, :, :] += d2I_dC2[:, :, g0:g1] @ q_sym[g0:g1]
+ intopt_derivative = int3c1e.VHFOpt(mol)
+ intopt_derivative.build(cutoff = 1e-14, aosym = False)
- int2c2e_ip1ip2 = mol._add_suffix('int2c2e_ip1ip2')
- v_ng_ip1ip2 = gto.mole.intor_cross(int2c2e_ip1ip2, fakemol, fakemol_nuc).reshape([3,3,-1,mol.natm])
- dv_g = numpy.einsum('n,xygn->gnxy', atom_charges, v_ng_ip1ip2)
- dv_g = numpy.einsum('gnxy,g->gnxy', dv_g, q_sym)
+ dqdx = get_dqsym_dx(pcmobj, dm, range(mol.natm), intopt_derivative)
+ dqdx = dqdx.get()
- for ia in range(mol.natm):
- p0, p1 = gridslice[ia]
- de_tmp = numpy.sum(dv_g[p0:p1], axis=0)
- de[ia,:] -= de_tmp
- #de[ia,:] -= de_tmp.transpose([0,2,1])
+ d2e_from_dIdq = numpy.zeros([mol.natm, mol.natm, 3, 3])
+ for i_atom in range(mol.natm):
+ for i_xyz in range(3):
+ d2e_from_dIdq[i_atom, :, i_xyz, :] = grad_nuc(pcmobj, dm, q_sym = dqdx[i_atom, i_xyz, :])
- int2c2e_ipip1 = mol._add_suffix('int2c2e_ipip1')
- v_ng_ipip1 = gto.mole.intor_cross(int2c2e_ipip1, fakemol_nuc, fakemol).reshape([3,3,mol.natm,-1])
- dv_g = numpy.einsum('g,xyng->nxy', q_sym, v_ng_ipip1)
- for ia in range(mol.natm):
- de[ia,ia] -= dv_g[ia] * atom_charges[ia]
-
- v_ng_ipip1 = gto.mole.intor_cross(int2c2e_ipip1, fakemol, fakemol_nuc).reshape([3,3,-1,mol.natm])
- dv_g = numpy.einsum('n,xygn->gxy', atom_charges, v_ng_ipip1)
- dv_g = numpy.einsum('g,gxy->gxy', q_sym, dv_g)
- for ia in range(mol.natm):
- p0, p1 = gridslice[ia]
- de[ia,ia] -= numpy.sum(dv_g[p0:p1], axis=0)
-
- return de
-
-def hess_qv(pcmobj, dm, verbose=None):
- raise NotImplementedError("PCM analytical hessian is not tested")
- if not pcmobj._intermediates or 'q_sym' not in pcmobj._intermediates:
- pcmobj._get_vind(dm)
- gridslice = pcmobj.surface['gslice_by_atom']
- q_sym = pcmobj._intermediates['q_sym']
+ d2e = d2e_from_d2I - d2e_from_dIdq
- intopt = pcmobj.intopt
- intopt.clear()
- # rebuild with aosym
- intopt.build(1e-14, diag_block_with_triu=True, aosym=False)
- coeff = intopt.coeff
- dm_cart = coeff @ dm @ coeff.T
- #dm_cart = cupy.einsum('pi,ij,qj->pq', coeff, dm, coeff)
-
- dvj, _ = int3c2e.get_int3c2e_ipip1_hjk(intopt, q_sym, None, dm_cart, with_k=False)
- dq, _ = int3c2e.get_int3c2e_ipvip1_hjk(intopt, q_sym, None, dm_cart, with_k=False)
- dvj, _ = int3c2e.get_int3c2e_ip1ip2_hjk(intopt, q_sym, None, dm_cart, with_k=False)
- dq, _ = int3c2e.get_int3c2e_ipip2_hjk(intopt, q_sym, None, dm_cart, with_k=False)
-
- cart_ao_idx = intopt.cart_ao_idx
- rev_cart_ao_idx = numpy.argsort(cart_ao_idx)
- dvj = dvj[:,rev_cart_ao_idx]
-
- aoslice = intopt.mol.aoslice_by_atom()
- dq = cupy.asarray([cupy.sum(dq[:,p0:p1], axis=1) for p0,p1 in gridslice])
- dvj= 2.0 * cupy.asarray([cupy.sum(dvj[:,p0:p1], axis=1) for p0,p1 in aoslice[:,2:]])
- de = dq + dvj
- return de.get()
-
-def hess_elec(pcmobj, dm, verbose=None):
- '''
- slow version with finite difference
- TODO: use analytical hess_nuc
- '''
+ t1 = log.timer_debug1('solvent hessian d(dVnuc/dx * q)/dx contribution', *t1)
+ return d2e
+
+def analytical_hess_qv(pcmobj, dm, verbose=None):
+ if not pcmobj._intermediates:
+ pcmobj.build()
+ dm_cache = pcmobj._intermediates.get('dm', None)
+ if dm_cache is None or cupy.linalg.norm(dm_cache - dm) >= 1e-10:
+ pcmobj._get_vind(dm)
+ mol = pcmobj.mol
log = logger.new_logger(pcmobj, verbose)
t1 = log.init_timer()
- pmol = pcmobj.mol.copy()
- mol = pmol.copy()
- coords = mol.atom_coords(unit='Bohr')
-
- def pcm_grad_scanner(mol):
- # TODO: use more analytical forms
- pcmobj.reset(mol)
- e, v = pcmobj._get_vind(dm)
- #return grad_elec(pcmobj, dm)
- pcm_grad = grad_nuc(pcmobj, dm)
- pcm_grad+= grad_solver(pcmobj, dm)
- pcm_grad+= grad_qv(pcmobj, dm)
- return pcm_grad
-
- mol.verbose = 0
- de = numpy.zeros([mol.natm, mol.natm, 3, 3])
- eps = 1e-3
- for ia in range(mol.natm):
- for ix in range(3):
- dv = numpy.zeros_like(coords)
- dv[ia,ix] = eps
- mol.set_geom_(coords + dv, unit='Bohr')
- g0 = pcm_grad_scanner(mol)
-
- mol.set_geom_(coords - dv, unit='Bohr')
- g1 = pcm_grad_scanner(mol)
- de[ia,:,ix] = (g0 - g1)/2.0/eps
- t1 = log.timer_debug1('solvent energy', *t1)
- pcmobj.reset(pmol)
- return de
-
-def get_dqsym_dx_fix_vgrids(pcmobj, atmlst, inverse_K):
+
+ gridslice = pcmobj.surface['gslice_by_atom']
+ charge_exp = pcmobj.surface['charge_exp']
+ grid_coords = pcmobj.surface['grid_coords']
+ q_sym = pcmobj._intermediates['q_sym']
+
+ aoslice = numpy.array(mol.aoslice_by_atom())
+
+ intopt_derivative = int3c1e.VHFOpt(mol)
+ intopt_derivative.build(cutoff = 1e-14, aosym = False)
+
+ # fakemol = gto.fakemol_for_charges(grid_coords.get(), expnt=charge_exp.get()**2)
+ # intopt = int3c2e.VHFOpt(mol, fakemol, 'int2e')
+ # intopt.build(1e-14, diag_block_with_triu=True, aosym=False)
+
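+ # E = sum_{ij,g} dm_ij q_g (ij|g). With q held fixed, four classes of integral
+ # second derivatives enter: both on the bra AO (ipip1), bra and ket (ipvip1),
+ # one AO and one grid charge (ip1ip2), and both on the grid charge (ipip2).
+ # The charge-response part, dI/dx contracted with dq/dx, is added afterwards.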
+ d2e_from_d2I = cupy.zeros([mol.natm, mol.natm, 3, 3])
+
+ # d2I_dA2 = int3c2e.get_int3c2e_general(mol, fakemol, ip_type='ipip1', direct_scf_tol=1e-14)
+ # d2I_dA2 = cupy.einsum('dijq,q->dij', d2I_dA2, q_sym)
+ # d2I_dA2 = d2I_dA2.reshape([3, 3, nao, nao])
+ d2I_dA2 = int1e_grids_ipip1(mol, grid_coords, charges = q_sym, intopt = intopt_derivative, charge_exponents = charge_exp**2)
+ for i_atom in range(mol.natm):
+ p0,p1 = aoslice[i_atom, 2:]
+ d2e_from_d2I[i_atom, i_atom, :, :] += cupy.einsum('ij,dDij->dD', dm[p0:p1, :], d2I_dA2[:, :, p0:p1, :])
+ d2e_from_d2I[i_atom, i_atom, :, :] += cupy.einsum('ij,dDij->dD', dm[:, p0:p1], d2I_dA2[:, :, p0:p1, :].transpose(0,1,3,2))
+ d2I_dA2 = None
+
+ # d2I_dAdB = int3c2e.get_int3c2e_general(mol, fakemol, ip_type='ipvip1', direct_scf_tol=1e-14)
+ # d2I_dAdB = cupy.einsum('dijq,q->dij', d2I_dAdB, q_sym)
+ # d2I_dAdB = d2I_dAdB.reshape([3, 3, nao, nao])
+ d2I_dAdB = int1e_grids_ipvip1(mol, grid_coords, charges = q_sym, intopt = intopt_derivative, charge_exponents = charge_exp**2)
+ for i_atom in range(mol.natm):
+ pi0,pi1 = aoslice[i_atom, 2:]
+ for j_atom in range(mol.natm):
+ pj0,pj1 = aoslice[j_atom, 2:]
+ d2e_from_d2I[i_atom, j_atom, :, :] += cupy.einsum('ij,dDij->dD', dm[pi0:pi1, pj0:pj1], d2I_dAdB[:, :, pi0:pi1, pj0:pj1])
+ d2e_from_d2I[i_atom, j_atom, :, :] += cupy.einsum('ij,dDij->dD', dm[pj0:pj1, pi0:pi1], d2I_dAdB[:, :, pi0:pi1, pj0:pj1].transpose(0,1,3,2))
+ d2I_dAdB = None
+
+ for j_atom in range(mol.natm):
+ g0,g1 = gridslice[j_atom]
+ # d2I_dAdC = int3c2e.get_int3c2e_general(mol, fakemol, ip_type='ip1ip2', direct_scf_tol=1e-14)
+ # d2I_dAdC = cupy.einsum('dijq,q->dij', d2I_dAdC[:, :, :, g0:g1], q_sym[g0:g1])
+ # d2I_dAdC = d2I_dAdC.reshape([3, 3, nao, nao])
+ d2I_dAdC = int1e_grids_ip1ip2(mol, grid_coords[g0:g1, :], charges = q_sym[g0:g1], intopt = intopt_derivative, charge_exponents = charge_exp[g0:g1]**2)
+
+ for i_atom in range(mol.natm):
+ p0,p1 = aoslice[i_atom, 2:]
+ d2e_from_d2I[i_atom, j_atom, :, :] += cupy.einsum('ij,dDij->dD', dm[p0:p1, :], d2I_dAdC[:, :, p0:p1, :])
+ d2e_from_d2I[i_atom, j_atom, :, :] += cupy.einsum('ij,dDij->dD', dm[:, p0:p1], d2I_dAdC[:, :, p0:p1, :].transpose(0,1,3,2))
+
+ d2e_from_d2I[j_atom, i_atom, :, :] += cupy.einsum('ij,dDij->dD', dm[p0:p1, :], d2I_dAdC[:, :, p0:p1, :].transpose(1,0,2,3))
+ d2e_from_d2I[j_atom, i_atom, :, :] += cupy.einsum('ij,dDij->dD', dm[:, p0:p1], d2I_dAdC[:, :, p0:p1, :].transpose(1,0,3,2))
+ d2I_dAdC = None
+
+ # d2I_dC2 = int3c2e.get_int3c2e_general(mol, fakemol, ip_type='ipip2', direct_scf_tol=1e-14)
+ # d2I_dC2 = cupy.einsum('dijq,ij->dq', d2I_dC2, dm)
+ # d2I_dC2 = d2I_dC2.reshape([3, 3, ngrids])
+ d2I_dC2 = int1e_grids_ipip2(mol, grid_coords, dm = dm, intopt = intopt_derivative, charge_exponents = charge_exp**2)
+ for i_atom in range(mol.natm):
+ g0,g1 = gridslice[i_atom]
+ d2e_from_d2I[i_atom, i_atom, :, :] += d2I_dC2[:, :, g0:g1] @ q_sym[g0:g1]
+ d2I_dC2 = None
+
+ dqdx = get_dqsym_dx(pcmobj, dm, range(mol.natm), intopt_derivative)
+
+ d2e_from_dIdq = numpy.zeros([mol.natm, mol.natm, 3, 3])
+ for i_atom in range(mol.natm):
+ for i_xyz in range(3):
+ d2e_from_dIdq[i_atom, :, i_xyz, :] = grad_qv(pcmobj, dm, q_sym = dqdx[i_atom, i_xyz, :])
+
+ d2e_from_d2I = d2e_from_d2I.get()
+ d2e = d2e_from_d2I + d2e_from_dIdq
+ d2e *= -1
+
+ t1 = log.timer_debug1('solvent hessian d(dI/dx * q)/dx contribution', *t1)
+ return d2e
+
+def einsum_ij_Adj_Adi_inverseK(K, Adj_term):
+ nA, nd, nj = Adj_term.shape
+ # return cupy.einsum('ij,Adj->Adi', cupy.linalg.inv(K), Adj_term)
+ return cupy.linalg.solve(K, Adj_term.reshape(nA * nd, nj).T).T.reshape(nA, nd, nj)
+def einsum_Adi_ij_Adj_inverseK(Adi_term, K):
+ nA, nd, nj = Adi_term.shape
+ # return cupy.einsum('Adi,ij->Adj', Adi_term, cupy.linalg.inv(K))
+ return cupy.linalg.solve(K.T, Adi_term.reshape(nA * nd, nj).T).T.reshape(nA, nd, nj)
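+ # Both helpers apply K^-1 (or K^-T) to all natm*3 right-hand sides at once:
+ # cupy.linalg.solve does a single LU factorization, which is cheaper and better
+ # conditioned than forming cupy.linalg.inv(K) and multiplying.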
+
+def get_dS_dot_q(dS, dSii, q, atmlst, gridslice):
+ output = cupy.einsum('diA,i->Adi', dSii[:,:,atmlst], q)
+ for i_atom in atmlst:
+ g0,g1 = gridslice[i_atom]
+ output[i_atom, :, g0:g1] += dS[:,g0:g1,:] @ q
+ output[i_atom, :, :] -= dS[:,:,g0:g1] @ q[g0:g1]
+ return output
+def get_dST_dot_q(dS, dSii, q, atmlst, gridslice):
+ # S is symmetric
+ return get_dS_dot_q(dS, dSii, q, atmlst, gridslice)
+
+def get_dA_dot_q(dA, q, atmlst):
+ return cupy.einsum('diA,i->Adi', dA[:,:,atmlst], q)
+
+def get_dD_dot_q(dD, q, atmlst, gridslice, ngrids):
+ output = cupy.zeros([len(atmlst), 3, ngrids])
+ for i_atom in atmlst:
+ g0,g1 = gridslice[i_atom]
+ output[i_atom, :, g0:g1] += dD[:,g0:g1,:] @ q
+ output[i_atom, :, :] -= dD[:,:,g0:g1] @ q[g0:g1]
+ return output
+def get_dDT_dot_q(dD, q, atmlst, gridslice, ngrids):
+ return get_dD_dot_q(-dD.transpose(0,2,1), q, atmlst, gridslice, ngrids)
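+ # dD stores dD[d,i,j] = dD_ij/d(r_i)_d. The off-diagonal kernel depends only on
+ # r_i - r_j, so dD_ji/d(r_i)_d = -dD[d,j,i], which is the -dD.transpose(0,2,1) above.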
+
+def get_v_dot_d2S_dot_q(d2S, d2Sii, v_left, q_right, natom, gridslice):
+ output = d2Sii @ (v_left * q_right)
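+ # Off-diagonal S_ij depends only on r_i - r_j, so d2S/dr_i dr_i = d2S/dr_j dr_j
+ # = -d2S/dr_i dr_j; hence the +/- pattern over the four atom-pair blocks below.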
+ for i_atom in range(natom):
+ gi0,gi1 = gridslice[i_atom]
+ for j_atom in range(natom):
+ gj0,gj1 = gridslice[j_atom]
+ d2S_atom_ij = cupy.einsum('q,dDq->dD', v_left[gi0:gi1], d2S[:,:,gi0:gi1,gj0:gj1] @ q_right[gj0:gj1])
+ output[i_atom, i_atom, :, :] += d2S_atom_ij
+ output[j_atom, j_atom, :, :] += d2S_atom_ij
+ output[i_atom, j_atom, :, :] -= d2S_atom_ij
+ output[j_atom, i_atom, :, :] -= d2S_atom_ij
+ return output
+def get_v_dot_d2ST_dot_q(d2S, d2Sii, v_left, q_right, natom, gridslice):
+ # S is symmetric
+ return get_v_dot_d2S_dot_q(d2S, d2Sii, v_left, q_right, natom, gridslice)
+
+def get_v_dot_d2A_dot_q(d2A, v_left, q_right):
+ return d2A @ (v_left * q_right)
+
+def get_v_dot_d2D_dot_q(d2D, v_left, q_right, natom, gridslice):
+ output = cupy.zeros([natom, natom, 3, 3])
+ for i_atom in range(natom):
+ gi0,gi1 = gridslice[i_atom]
+ for j_atom in range(natom):
+ gj0,gj1 = gridslice[j_atom]
+ d2D_atom_ij = cupy.einsum('q,dDq->dD', v_left[gi0:gi1], d2D[:,:,gi0:gi1,gj0:gj1] @ q_right[gj0:gj1])
+ output[i_atom, i_atom, :, :] += d2D_atom_ij
+ output[j_atom, j_atom, :, :] += d2D_atom_ij
+ output[i_atom, j_atom, :, :] -= d2D_atom_ij
+ output[j_atom, i_atom, :, :] -= d2D_atom_ij
+ return output
+def get_v_dot_d2DT_dot_q(d2D, v_left, q_right, natom, gridslice):
+ return get_v_dot_d2D_dot_q(d2D.transpose(0,1,3,2), v_left, q_right, natom, gridslice)
+
+def analytical_hess_solver(pcmobj, dm, verbose=None):
+ if not pcmobj._intermediates:
+ pcmobj.build()
+ dm_cache = pcmobj._intermediates.get('dm', None)
+ if dm_cache is None or cupy.linalg.norm(dm_cache - dm) >= 1e-10:
+ pcmobj._get_vind(dm)
+ mol = pcmobj.mol
+ log = logger.new_logger(mol, verbose)
+ t1 = log.init_timer()
+
+ natom = mol.natm
+ atmlst = range(natom) # Attention: the full atom list is required here; it cannot be split into batches
+
+ gridslice = pcmobj.surface['gslice_by_atom']
+ v_grids = pcmobj._intermediates['v_grids']
+ A = pcmobj._intermediates['A']
+ D = pcmobj._intermediates['D']
+ S = pcmobj._intermediates['S']
+ K = pcmobj._intermediates['K']
+ R = pcmobj._intermediates['R']
+ q = pcmobj._intermediates['q']
+ f_epsilon = pcmobj._intermediates['f_epsilon']
+
+ ngrids = q.shape[0]
+
+ vK_1 = cupy.linalg.solve(K.T, v_grids)
+
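+ # E_solver = 0.5 * V^T q with q = K^-1 R V. Each branch below assembles
+ # V^T d2(K^-1 R)/dx2 V into d2e_from_d2KR, together with the first-derivative
+ # vectors dK_1Rv = d(K^-1 R V)/dx and dvK_1R = d(V^T K^-1 R)/dx, which are
+ # contracted with dV/dx after the branches.
+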
+ if pcmobj.method.upper() in ['C-PCM', 'CPCM', 'COSMO']:
+ _, dS = get_dD_dS(pcmobj.surface, with_D=False, with_S=True)
+ dF, _ = get_dF_dA(pcmobj.surface)
+ dSii = get_dSii(pcmobj.surface, dF)
+
+ # dR = 0, dK = dS
+ # d(S-1 R) = - S-1 dS S-1 R
+ # d2(S-1 R) = (S-1 dS S-1 dS S-1 R) + (S-1 dS S-1 dS S-1 R) - (S-1 d2S S-1 R)
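+ # The two identical first-order products supply the factor of 2 below; the d2S
+ # term is subtracted separately via get_v_dot_d2S_dot_q.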
+ dSdx_dot_q = get_dS_dot_q(dS, dSii, q, atmlst, gridslice)
+ S_1_dSdx_dot_q = einsum_ij_Adj_Adi_inverseK(K, dSdx_dot_q)
+ dSdx_dot_q = None
+ VS_1_dot_dSdx = get_dST_dot_q(dS, dSii, vK_1, atmlst, gridslice)
+ dS = None
+ dSii = None
+ d2e_from_d2KR = cupy.einsum('Adi,BDi->ABdD', VS_1_dot_dSdx, S_1_dSdx_dot_q) * 2
+
+ _, d2S = get_d2D_d2S(pcmobj.surface, with_D=False, with_S=True)
+ d2F, _ = get_d2F_d2A(pcmobj.surface)
+ d2Sii = get_d2Sii(pcmobj.surface, dF, d2F)
+ dF = None
+ d2F = None
+ d2e_from_d2KR -= get_v_dot_d2S_dot_q(d2S, d2Sii, vK_1, q, natom, gridslice)
+ d2S = None
+ d2Sii = None
+
+ dK_1Rv = -S_1_dSdx_dot_q
+ dvK_1R = -einsum_Adi_ij_Adj_inverseK(VS_1_dot_dSdx, K) @ R
+
+ elif pcmobj.method.upper() in ['IEF-PCM', 'IEFPCM', 'SMD']:
+ dD, dS = get_dD_dS(pcmobj.surface, with_D=True, with_S=True)
+ dF, dA = get_dF_dA(pcmobj.surface)
+ dSii = get_dSii(pcmobj.surface, dF)
+
+ # dR = f_eps/(2*pi) * (dD*A + D*dA)
+ # dK = dS - f_eps/(2*pi) * (dD*A*S + D*dA*S + D*A*dS)
+
+ # d2R = f_eps/(2*pi) * (d2D*A + dD*dA + dD*dA + D*d2A)
+ # d2K = d2S - f_eps/(2*pi) * (d2D*A*S + D*d2A*S + D*A*d2S + dD*dA*S + dD*dA*S + dD*A*dS + dD*A*dS + D*dA*dS + D*dA*dS)
+ # The terms showing up twice in the equation above (dD*dA + dD*dA, for example) refer to dD/dx * dA/dy + dD/dy * dA/dx;
+ # since D is not symmetric, the two are not the same.
+
+ # d(K-1 R) = - K-1 dK K-1 R + K-1 dR
+ # d2(K-1 R) = (K-1 dK K-1 dK K-1 R) + (K-1 dK K-1 dK K-1 R) - (K-1 d2K K-1 R) - (K-1 dK K-1 dR)
+ # - (K-1 dK K-1 dR) + (K-1 d2R)
+ f_eps_over_2pi = f_epsilon/(2.0*PI)
+
+ dSdx_dot_q = get_dS_dot_q(dS, dSii, q, atmlst, gridslice)
+ DA = D*A
+ dKdx_dot_q = dSdx_dot_q - f_eps_over_2pi * cupy.einsum('ij,Adj->Adi', DA, dSdx_dot_q)
+ dAdx_dot_Sq = get_dA_dot_q(dA, S @ q, atmlst)
+ dKdx_dot_q -= f_eps_over_2pi * cupy.einsum('ij,Adj->Adi', D, dAdx_dot_Sq)
+ AS = (A * S.T).T # It's just diag(A) @ S
+ ASq = AS @ q
+ dDdx_dot_ASq = get_dD_dot_q(dD, ASq, atmlst, gridslice, ngrids)
+ dKdx_dot_q -= f_eps_over_2pi * dDdx_dot_ASq
+ dDdx_dot_ASq = None
+
+ K_1_dot_dKdx_dot_q = einsum_ij_Adj_Adi_inverseK(K, dKdx_dot_q)
+ dKdx_dot_q = None
+
+ vK_1_dot_dSdx = get_dST_dot_q(dS, dSii, vK_1, atmlst, gridslice)
+ vK_1_dot_dKdx = vK_1_dot_dSdx
+ vK_1_dot_dSdx = None
+ vK_1_dot_dDdx = get_dDT_dot_q(dD, vK_1, atmlst, gridslice, ngrids)
+ vK_1_dot_dKdx -= f_eps_over_2pi * cupy.einsum('ij,Adj->Adi', AS.T, vK_1_dot_dDdx)
+ AS = None
+ vK_1D = D.T @ vK_1
+ vK_1D_dot_dAdx = get_dA_dot_q(dA, vK_1D, atmlst)
+ vK_1_dot_dKdx -= f_eps_over_2pi * cupy.einsum('ij,Adj->Adi', S.T, vK_1D_dot_dAdx)
+ vK_1DA = DA.T @ vK_1
+ DA = None
+ vK_1DA_dot_dSdx = get_dST_dot_q(dS, dSii, vK_1DA, atmlst, gridslice)
+ dS = None
+ dSii = None
+ vK_1_dot_dKdx -= f_eps_over_2pi * vK_1DA_dot_dSdx
+ vK_1DA_dot_dSdx = None
+
+ d2e_from_d2KR = cupy.einsum('Adi,BDi->ABdD', vK_1_dot_dKdx, K_1_dot_dKdx_dot_q)
+ d2e_from_d2KR += cupy.einsum('Adi,BDi->BADd', vK_1_dot_dKdx, K_1_dot_dKdx_dot_q)
+
+ d2F, d2A = get_d2F_d2A(pcmobj.surface)
+ vK_1_d2K_q = get_v_dot_d2A_dot_q(d2A, vK_1D, S @ q)
+ vK_1_d2R_V = get_v_dot_d2A_dot_q(d2A, vK_1D, v_grids)
+ d2A = None
+ d2Sii = get_d2Sii(pcmobj.surface, dF, d2F)
+ dF = None
+ d2F = None
+ d2D, d2S = get_d2D_d2S(pcmobj.surface, with_D=True, with_S=True)
+ vK_1_d2K_q += get_v_dot_d2D_dot_q(d2D, vK_1, ASq, natom, gridslice)
+ vK_1_d2R_V += get_v_dot_d2D_dot_q(d2D, vK_1, A * v_grids, natom, gridslice)
+ d2D = None
+ vK_1_d2K_q += get_v_dot_d2S_dot_q(d2S, d2Sii, vK_1DA, q, natom, gridslice)
+ vK_1_d2K_q += cupy.einsum('Adi,BDi->ABdD', vK_1_dot_dDdx, dAdx_dot_Sq)
+ vK_1_d2K_q += cupy.einsum('Adi,BDi->ABdD', vK_1_dot_dDdx * A, dSdx_dot_q)
+ vK_1_d2K_q += cupy.einsum('Adi,BDi->ABdD', vK_1D_dot_dAdx, dSdx_dot_q)
+ vK_1_d2K_q += cupy.einsum('Adi,BDi->BADd', vK_1_dot_dDdx, dAdx_dot_Sq)
+ vK_1_d2K_q += cupy.einsum('Adi,BDi->BADd', vK_1_dot_dDdx * A, dSdx_dot_q)
+ vK_1_d2K_q += cupy.einsum('Adi,BDi->BADd', vK_1D_dot_dAdx, dSdx_dot_q)
+ vK_1_d2K_q *= -f_eps_over_2pi
+ vK_1_d2K_q += get_v_dot_d2S_dot_q(d2S, d2Sii, vK_1, q, natom, gridslice)
+ d2S = None
+ d2Sii = None
+
+ d2e_from_d2KR -= vK_1_d2K_q
+
+ dAdx_dot_V = get_dA_dot_q(dA, v_grids, atmlst)
+ dDdx_dot_AV = get_dD_dot_q(dD, A * v_grids, atmlst, gridslice, ngrids)
+ dRdx_dot_V = f_eps_over_2pi * (dDdx_dot_AV + cupy.einsum('ij,Adj->Adi', D, dAdx_dot_V))
+ dDdx_dot_AV = None
+
+ K_1_dot_dRdx_dot_V = einsum_ij_Adj_Adi_inverseK(K, dRdx_dot_V)
+ dRdx_dot_V = None
+
+ d2e_from_d2KR -= cupy.einsum('Adi,BDi->ABdD', vK_1_dot_dKdx, K_1_dot_dRdx_dot_V)
+ d2e_from_d2KR -= cupy.einsum('Adi,BDi->BADd', vK_1_dot_dKdx, K_1_dot_dRdx_dot_V)
+
+ vK_1_d2R_V += cupy.einsum('Adi,BDi->ABdD', vK_1_dot_dDdx, dAdx_dot_V)
+ vK_1_d2R_V += cupy.einsum('Adi,BDi->BADd', vK_1_dot_dDdx, dAdx_dot_V)
+ vK_1_d2R_V *= f_eps_over_2pi
+
+ d2e_from_d2KR += vK_1_d2R_V
+
+ dK_1Rv = -K_1_dot_dKdx_dot_q + K_1_dot_dRdx_dot_V
+
+ VK_1D_dot_dAdx = get_dA_dot_q(dA, (D.T @ vK_1).T, atmlst)
+ VK_1_dot_dDdx = get_dDT_dot_q(dD, vK_1, atmlst, gridslice, ngrids)
+ VK_1_dot_dRdx = f_eps_over_2pi * (VK_1D_dot_dAdx + VK_1_dot_dDdx * A)
+
+ dvK_1R = -einsum_Adi_ij_Adj_inverseK(vK_1_dot_dKdx, K) @ R + VK_1_dot_dRdx
+
+ elif pcmobj.method.upper() in ['SS(V)PE']:
+ dD, dS = get_dD_dS(pcmobj.surface, with_D=True, with_S=True)
+ dF, dA = get_dF_dA(pcmobj.surface)
+ dSii = get_dSii(pcmobj.surface, dF)
+
+ # dR = f_eps/(2*pi) * (dD*A + D*dA)
+ # dK = dS - f_eps/(4*pi) * (dD*A*S + D*dA*S + D*A*dS + dST*AT*DT + ST*dAT*DT + ST*AT*dDT)
+
+ # d2R = f_eps/(2*pi) * (d2D*A + dD*dA + dD*dA + D*d2A)
+ # d2K = d2S - f_eps/(4*pi) * (d2D*A*S + D*d2A*S + D*A*d2S + dD*dA*S + dD*dA*S + dD*A*dS + dD*A*dS + D*dA*dS + D*dA*dS
+ # + d2ST*AT*DT + ST*d2AT*DT + ST*AT*d2DT + dST*dAT*DT + dST*dAT*DT + dST*AT*dDT + dST*AT*dDT + ST*dAT*dDT + ST*dAT*dDT)
+ f_eps_over_2pi = f_epsilon/(2.0*PI)
+ f_eps_over_4pi = f_epsilon/(4.0*PI)
+
+ dSdx_dot_q = get_dS_dot_q(dS, dSii, q, atmlst, gridslice)
+ DA = D*A
+ dKdx_dot_q = dSdx_dot_q - f_eps_over_4pi * cupy.einsum('ij,Adj->Adi', DA, dSdx_dot_q)
+ dAdx_dot_Sq = get_dA_dot_q(dA, S @ q, atmlst)
+ dKdx_dot_q -= f_eps_over_4pi * cupy.einsum('ij,Adj->Adi', D, dAdx_dot_Sq)
+ AS = (A * S.T).T # It's just diag(A) @ S
+ ASq = AS @ q
+ dDdx_dot_ASq = get_dD_dot_q(dD, ASq, atmlst, gridslice, ngrids)
+ dKdx_dot_q -= f_eps_over_4pi * dDdx_dot_ASq
+ dDdx_dot_ASq = None
+ dDdxT_dot_q = get_dDT_dot_q(dD, q, atmlst, gridslice, ngrids)
+ dKdx_dot_q -= f_eps_over_4pi * cupy.einsum('ij,Adj->Adi', AS.T, dDdxT_dot_q)
+ dAdxT_dot_DT_q = get_dA_dot_q(dA, D.T @ q, atmlst)
+ dKdx_dot_q -= f_eps_over_4pi * cupy.einsum('ij,Adj->Adi', S.T, dAdxT_dot_DT_q)
+ AT_DT_q = DA.T @ q
+ dSdxT_dot_AT_DT_q = get_dS_dot_q(dS, dSii, AT_DT_q, atmlst, gridslice)
+ dKdx_dot_q -= f_eps_over_4pi * dSdxT_dot_AT_DT_q
+ dSdxT_dot_AT_DT_q = None
+
+ K_1_dot_dKdx_dot_q = einsum_ij_Adj_Adi_inverseK(K, dKdx_dot_q)
+ dKdx_dot_q = None
+
+ vK_1_dot_dSdx = get_dST_dot_q(dS, dSii, vK_1, atmlst, gridslice)
+ vK_1_dot_dKdx = vK_1_dot_dSdx
+ vK_1_dot_dSdx = None
+ vK_1_dot_dDdx = get_dDT_dot_q(dD, vK_1, atmlst, gridslice, ngrids)
+ vK_1_dot_dKdx -= f_eps_over_4pi * cupy.einsum('ij,Adj->Adi', AS.T, vK_1_dot_dDdx)
+ vK_1D_dot_dAdx = get_dA_dot_q(dA, D.T @ vK_1, atmlst)
+ vK_1_dot_dKdx -= f_eps_over_4pi * cupy.einsum('ij,Adj->Adi', S.T, vK_1D_dot_dAdx)
+ vK_1DA = DA.T @ vK_1
+ vK_1DA_dot_dSdx = get_dST_dot_q(dS, dSii, vK_1DA, atmlst, gridslice)
+ vK_1_dot_dKdx -= f_eps_over_4pi * vK_1DA_dot_dSdx
+ vK_1DA_dot_dSdx = None
+ vK_1_dot_dSdxT = get_dS_dot_q(dS, dSii, vK_1, atmlst, gridslice)
+ dS = None
+ dSii = None
+ vK_1_dot_dKdx -= f_eps_over_4pi * cupy.einsum('ij,Adj->Adi', DA, vK_1_dot_dSdxT)
+ DA = None
+ vK_1_ST_dot_dAdxT = get_dA_dot_q(dA, (S @ vK_1).T, atmlst)
+ vK_1_dot_dKdx -= f_eps_over_4pi * cupy.einsum('ij,Adj->Adi', D, vK_1_ST_dot_dAdxT)
+ vK_1_ST_AT = AS @ vK_1
+ AS = None
+ vK_1_ST_AT_dot_dDdxT = get_dD_dot_q(dD, vK_1_ST_AT, atmlst, gridslice, ngrids)
+ vK_1_dot_dKdx -= f_eps_over_4pi * vK_1_ST_AT_dot_dDdxT
+ vK_1_ST_AT_dot_dDdxT = None
+
+ d2e_from_d2KR = cupy.einsum('Adi,BDi->ABdD', vK_1_dot_dKdx, K_1_dot_dKdx_dot_q)
+ d2e_from_d2KR += cupy.einsum('Adi,BDi->BADd', vK_1_dot_dKdx, K_1_dot_dKdx_dot_q)
+
+ d2F, d2A = get_d2F_d2A(pcmobj.surface)
+ vK_1_d2K_q = get_v_dot_d2A_dot_q(d2A, (D.T @ vK_1).T, S @ q)
+ vK_1_d2K_q += get_v_dot_d2A_dot_q(d2A, (S @ vK_1).T, D.T @ q)
+ vK_1_d2R_V = get_v_dot_d2A_dot_q(d2A, (D.T @ vK_1).T, v_grids)
+ d2A = None
+ d2Sii = get_d2Sii(pcmobj.surface, dF, d2F)
+ dF = None
+ d2F = None
+ d2D, d2S = get_d2D_d2S(pcmobj.surface, with_D=True, with_S=True)
+ vK_1_d2K_q += get_v_dot_d2D_dot_q(d2D, vK_1, ASq, natom, gridslice)
+ vK_1_d2K_q += get_v_dot_d2DT_dot_q(d2D, vK_1_ST_AT, q, natom, gridslice)
+ vK_1_d2R_V += get_v_dot_d2D_dot_q(d2D, vK_1, A * v_grids, natom, gridslice)
+ d2D = None
+ vK_1_d2K_q += get_v_dot_d2S_dot_q(d2S, d2Sii, vK_1DA, q, natom, gridslice)
+ vK_1_d2K_q += get_v_dot_d2ST_dot_q(d2S, d2Sii, vK_1, AT_DT_q, natom, gridslice)
+ vK_1_d2K_q += cupy.einsum('Adi,BDi->ABdD', vK_1_dot_dDdx, dAdx_dot_Sq)
+ vK_1_d2K_q += cupy.einsum('Adi,BDi->ABdD', vK_1_dot_dDdx * A, dSdx_dot_q)
+ vK_1_d2K_q += cupy.einsum('Adi,BDi->ABdD', vK_1D_dot_dAdx, dSdx_dot_q)
+ vK_1_d2K_q += cupy.einsum('Adi,BDi->ABdD', vK_1_dot_dSdxT, dAdxT_dot_DT_q)
+ vK_1_d2K_q += cupy.einsum('Adi,BDi->ABdD', vK_1_dot_dSdxT * A, dDdxT_dot_q)
+ vK_1_d2K_q += cupy.einsum('Adi,BDi->ABdD', vK_1_ST_dot_dAdxT, dDdxT_dot_q)
+ vK_1_d2K_q += cupy.einsum('Adi,BDi->BADd', vK_1_dot_dDdx, dAdx_dot_Sq)
+ vK_1_d2K_q += cupy.einsum('Adi,BDi->BADd', vK_1_dot_dDdx * A, dSdx_dot_q)
+ vK_1_d2K_q += cupy.einsum('Adi,BDi->BADd', vK_1D_dot_dAdx, dSdx_dot_q)
+ vK_1_d2K_q += cupy.einsum('Adi,BDi->BADd', vK_1_dot_dSdxT, dAdxT_dot_DT_q)
+ vK_1_d2K_q += cupy.einsum('Adi,BDi->BADd', vK_1_dot_dSdxT * A, dDdxT_dot_q)
+ vK_1_d2K_q += cupy.einsum('Adi,BDi->BADd', vK_1_ST_dot_dAdxT, dDdxT_dot_q)
+ vK_1_d2K_q *= -f_eps_over_4pi
+ vK_1_d2K_q += get_v_dot_d2S_dot_q(d2S, d2Sii, vK_1, q, natom, gridslice)
+ d2S = None
+ d2Sii = None
+
+ d2e_from_d2KR -= vK_1_d2K_q
+
+ dAdx_dot_V = get_dA_dot_q(dA, v_grids, atmlst)
+ dDdx_dot_AV = get_dD_dot_q(dD, A * v_grids, atmlst, gridslice, ngrids)
+ dRdx_dot_V = f_eps_over_2pi * (dDdx_dot_AV + cupy.einsum('ij,Adj->Adi', D, dAdx_dot_V))
+ dDdx_dot_AV = None
+
+ K_1_dot_dRdx_dot_V = einsum_ij_Adj_Adi_inverseK(K, dRdx_dot_V)
+
+ d2e_from_d2KR -= cupy.einsum('Adi,BDi->ABdD', vK_1_dot_dKdx, K_1_dot_dRdx_dot_V)
+ d2e_from_d2KR -= cupy.einsum('Adi,BDi->BADd', vK_1_dot_dKdx, K_1_dot_dRdx_dot_V)
+
+ vK_1_d2R_V += cupy.einsum('Adi,BDi->ABdD', vK_1_dot_dDdx, dAdx_dot_V)
+ vK_1_d2R_V += cupy.einsum('Adi,BDi->BADd', vK_1_dot_dDdx, dAdx_dot_V)
+ vK_1_d2R_V *= f_eps_over_2pi
+
+ d2e_from_d2KR += vK_1_d2R_V
+
+ dK_1Rv = -K_1_dot_dKdx_dot_q + K_1_dot_dRdx_dot_V
+
+ VK_1D_dot_dAdx = get_dA_dot_q(dA, (D.T @ vK_1).T, atmlst)
+ VK_1_dot_dDdx = get_dDT_dot_q(dD, vK_1, atmlst, gridslice, ngrids)
+ VK_1_dot_dRdx = f_eps_over_2pi * (VK_1D_dot_dAdx + VK_1_dot_dDdx * A)
+
+ dvK_1R = -einsum_Adi_ij_Adj_inverseK(vK_1_dot_dKdx, K) @ R + VK_1_dot_dRdx
+
+ else:
+ raise RuntimeError(f"Unknown implicit solvent model: {pcmobj.method}")
+
+ d2e = d2e_from_d2KR
+
+ intopt_derivative = int3c1e.VHFOpt(mol)
+ intopt_derivative.build(cutoff = 1e-14, aosym = False)
+
+ dVdx = get_dvgrids(pcmobj, dm, range(mol.natm), intopt_derivative)
+ d2e -= cupy.einsum('Adi,BDi->BADd', dvK_1R, dVdx)
+ d2e -= cupy.einsum('Adi,BDi->ABdD', dVdx, dK_1Rv)
+
+ d2e *= 0.5
+ d2e = d2e.get()
+ t1 = log.timer_debug1('solvent hessian d(V * dK-1R/dx * V)/dx contribution', *t1)
+ return d2e
+
+def get_dqsym_dx_fix_vgrids(pcmobj, atmlst):
assert pcmobj._intermediates is not None
gridslice = pcmobj.surface['gslice_by_atom']
@@ -161,35 +705,14 @@ def get_dqsym_dx_fix_vgrids(pcmobj, atmlst, inverse_K):
A = pcmobj._intermediates['A']
D = pcmobj._intermediates['D']
S = pcmobj._intermediates['S']
+ K = pcmobj._intermediates['K']
R = pcmobj._intermediates['R']
+ q = pcmobj._intermediates['q']
q_sym = pcmobj._intermediates['q_sym']
f_epsilon = pcmobj._intermediates['f_epsilon']
ngrids = q_sym.shape[0]
- def get_dS_dot_q(dS, dSii, q, atmlst, gridslice):
- output = cupy.einsum('diA,i->Adi', dSii[:,:,atmlst], q)
- for i_atom in atmlst:
- g0,g1 = gridslice[i_atom]
- output[i_atom, :, g0:g1] += cupy.einsum('dij,j->di', dS[:,g0:g1,:], q)
- output[i_atom, :, :] -= cupy.einsum('dij,j->di', dS[:,:,g0:g1], q[g0:g1])
- return output
- def get_dST_dot_q(dS, dSii, q, atmlst, gridslice):
- return get_dS_dot_q(-dS.transpose(0,2,1), dSii, q, atmlst, gridslice)
-
- def get_dA_dot_q(dA, q, atmlst, gridslice):
- return cupy.einsum('diA,i->Adi', dA[:,:,atmlst], q)
-
- def get_dD_dot_q(dD, q, atmlst, gridslice):
- output = cupy.zeros([len(atmlst), 3, ngrids])
- for i_atom in atmlst:
- g0,g1 = gridslice[i_atom]
- output[i_atom, :, g0:g1] += cupy.einsum('dij,j->di', dD[:,g0:g1,:], q)
- output[i_atom, :, :] -= cupy.einsum('dij,j->di', dD[:,:,g0:g1], q[g0:g1])
- return output
- def get_dDT_dot_q(dD, q, atmlst, gridslice):
- return get_dD_dot_q(-dD.transpose(0,2,1), q, atmlst, gridslice)
-
if pcmobj.method.upper() in ['C-PCM', 'CPCM', 'COSMO']:
_, dS = get_dD_dS(pcmobj.surface, with_D=False, with_S=True)
dF, _ = get_dF_dA(pcmobj.surface)
@@ -199,7 +722,7 @@ def get_dDT_dot_q(dD, q, atmlst, gridslice):
# dR = 0, dK = dS
dSdx_dot_q = get_dS_dot_q(dS, dSii, q_sym, atmlst, gridslice)
- dqdx_fix_Vq = cupy.einsum('ij,Adj->Adi', inverse_K, dSdx_dot_q)
+ dqdx_fix_Vq = einsum_ij_Adj_Adi_inverseK(K, dSdx_dot_q)
elif pcmobj.method.upper() in ['IEF-PCM', 'IEFPCM', 'SMD']:
dF, dA = get_dF_dA(pcmobj.surface)
@@ -212,33 +735,32 @@ def get_dDT_dot_q(dD, q, atmlst, gridslice):
# dK = dS - f_eps/(2*pi) * (dD*A*S + D*dA*S + D*A*dS)
f_eps_over_2pi = f_epsilon/(2.0*PI)
- q = inverse_K @ R @ v_grids
dSdx_dot_q = get_dS_dot_q(dS, dSii, q, atmlst, gridslice)
DA = D*A
dKdx_dot_q = dSdx_dot_q - f_eps_over_2pi * cupy.einsum('ij,Adj->Adi', DA, dSdx_dot_q)
- dAdx_dot_Sq = get_dA_dot_q(dA, S @ q, atmlst, gridslice)
+ dAdx_dot_Sq = get_dA_dot_q(dA, S @ q, atmlst)
dKdx_dot_q -= f_eps_over_2pi * cupy.einsum('ij,Adj->Adi', D, dAdx_dot_Sq)
AS = (A * S.T).T # It's just diag(A) @ S
- dDdx_dot_ASq = get_dD_dot_q(dD, AS @ q, atmlst, gridslice)
+ dDdx_dot_ASq = get_dD_dot_q(dD, AS @ q, atmlst, gridslice, ngrids)
dKdx_dot_q -= f_eps_over_2pi * dDdx_dot_ASq
- dqdx_fix_Vq = -cupy.einsum('ij,Adj->Adi', inverse_K, dKdx_dot_q)
+ dqdx_fix_Vq = -einsum_ij_Adj_Adi_inverseK(K, dKdx_dot_q)
- dAdx_dot_V = get_dA_dot_q(dA, v_grids, atmlst, gridslice)
+ dAdx_dot_V = get_dA_dot_q(dA, v_grids, atmlst)
- dDdx_dot_AV = get_dD_dot_q(dD, A * v_grids, atmlst, gridslice)
+ dDdx_dot_AV = get_dD_dot_q(dD, A * v_grids, atmlst, gridslice, ngrids)
dRdx_dot_V = f_eps_over_2pi * (dDdx_dot_AV + cupy.einsum('ij,Adj->Adi', D, dAdx_dot_V))
- dqdx_fix_Vq += cupy.einsum('ij,Adj->Adi', inverse_K, dRdx_dot_V)
+ dqdx_fix_Vq += einsum_ij_Adj_Adi_inverseK(K, dRdx_dot_V)
- invKT_V = inverse_K.T @ v_grids
- dDdxT_dot_invKT_V = get_dDT_dot_q(dD, invKT_V, atmlst, gridslice)
+ invKT_V = cupy.linalg.solve(K.T, v_grids)
+ dDdxT_dot_invKT_V = get_dDT_dot_q(dD, invKT_V, atmlst, gridslice, ngrids)
DT_invKT_V = D.T @ invKT_V
- dAdxT_dot_DT_invKT_V = get_dA_dot_q(dA, DT_invKT_V, atmlst, gridslice)
+ dAdxT_dot_DT_invKT_V = get_dA_dot_q(dA, DT_invKT_V, atmlst)
dqdx_fix_Vq += f_eps_over_2pi * (cupy.einsum('i,Adi->Adi', A, dDdxT_dot_invKT_V) + dAdxT_dot_DT_invKT_V)
dSdxT_dot_invKT_V = get_dST_dot_q(dS, dSii, invKT_V, atmlst, gridslice)
@@ -249,8 +771,9 @@ def get_dDT_dot_q(dD, q, atmlst, gridslice):
dSdxT_dot_AT_DT_invKT_V = get_dST_dot_q(dS, dSii, DA.T @ invKT_V, atmlst, gridslice)
dKdxT_dot_invKT_V -= f_eps_over_2pi * dSdxT_dot_AT_DT_invKT_V
+ invKT_dKdxT_dot_invKT_V = einsum_ij_Adj_Adi_inverseK(K.T, dKdxT_dot_invKT_V)
- dqdx_fix_Vq += -cupy.einsum('ij,Adj->Adi', R.T @ inverse_K.T, dKdxT_dot_invKT_V)
+ dqdx_fix_Vq += -cupy.einsum('ij,Adj->Adi', R.T, invKT_dKdxT_dot_invKT_V)
dqdx_fix_Vq *= -0.5
@@ -269,17 +792,17 @@ def dK_dot_q(q):
DA = D*A
dKdx_dot_q = dSdx_dot_q - f_eps_over_4pi * cupy.einsum('ij,Adj->Adi', DA, dSdx_dot_q)
- dAdx_dot_Sq = get_dA_dot_q(dA, S @ q, atmlst, gridslice)
+ dAdx_dot_Sq = get_dA_dot_q(dA, S @ q, atmlst)
dKdx_dot_q -= f_eps_over_4pi * cupy.einsum('ij,Adj->Adi', D, dAdx_dot_Sq)
AS = (A * S.T).T # It's just diag(A) @ S
- dDdx_dot_ASq = get_dD_dot_q(dD, AS @ q, atmlst, gridslice)
+ dDdx_dot_ASq = get_dD_dot_q(dD, AS @ q, atmlst, gridslice, ngrids)
dKdx_dot_q -= f_eps_over_4pi * dDdx_dot_ASq
- dDdxT_dot_q = get_dDT_dot_q(dD, q, atmlst, gridslice)
+ dDdxT_dot_q = get_dDT_dot_q(dD, q, atmlst, gridslice, ngrids)
dKdx_dot_q -= f_eps_over_4pi * cupy.einsum('ij,Adj->Adi', AS.T, dDdxT_dot_q)
- dAdxT_dot_DT_q = get_dA_dot_q(dA, D.T @ q, atmlst, gridslice)
+ dAdxT_dot_DT_q = get_dA_dot_q(dA, D.T @ q, atmlst)
dKdx_dot_q -= f_eps_over_4pi * cupy.einsum('ij,Adj->Adi', S.T, dAdxT_dot_DT_q)
dSdxT_dot_AT_DT_q = get_dST_dot_q(dS, dSii, DA.T @ q, atmlst, gridslice)
@@ -289,26 +812,27 @@ def dK_dot_q(q):
f_eps_over_2pi = f_epsilon/(2.0*PI)
- q = inverse_K @ R @ v_grids
dKdx_dot_q = dK_dot_q(q)
- dqdx_fix_Vq = -cupy.einsum('ij,Adj->Adi', inverse_K, dKdx_dot_q)
+ dqdx_fix_Vq = -einsum_ij_Adj_Adi_inverseK(K, dKdx_dot_q)
- dAdx_dot_V = get_dA_dot_q(dA, v_grids, atmlst, gridslice)
+ dAdx_dot_V = get_dA_dot_q(dA, v_grids, atmlst)
- dDdx_dot_AV = get_dD_dot_q(dD, A * v_grids, atmlst, gridslice)
+ dDdx_dot_AV = get_dD_dot_q(dD, A * v_grids, atmlst, gridslice, ngrids)
dRdx_dot_V = f_eps_over_2pi * (dDdx_dot_AV + cupy.einsum('ij,Adj->Adi', D, dAdx_dot_V))
- dqdx_fix_Vq += cupy.einsum('ij,Adj->Adi', inverse_K, dRdx_dot_V)
+ dqdx_fix_Vq += einsum_ij_Adj_Adi_inverseK(K, dRdx_dot_V)
- invKT_V = inverse_K.T @ v_grids
- dDdxT_dot_invKT_V = get_dDT_dot_q(dD, invKT_V, atmlst, gridslice)
+ invKT_V = cupy.linalg.solve(K.T, v_grids)
+ dDdxT_dot_invKT_V = get_dDT_dot_q(dD, invKT_V, atmlst, gridslice, ngrids)
DT_invKT_V = D.T @ invKT_V
- dAdxT_dot_DT_invKT_V = get_dA_dot_q(dA, DT_invKT_V, atmlst, gridslice)
+ dAdxT_dot_DT_invKT_V = get_dA_dot_q(dA, DT_invKT_V, atmlst)
dqdx_fix_Vq += f_eps_over_2pi * (cupy.einsum('i,Adi->Adi', A, dDdxT_dot_invKT_V) + dAdxT_dot_DT_invKT_V)
dKdx_dot_invKT_V = dK_dot_q(invKT_V)
- dqdx_fix_Vq += -cupy.einsum('ij,Adj->Adi', R.T @ inverse_K.T, dKdx_dot_invKT_V)
+ invKT_dKdx_dot_invKT_V = einsum_ij_Adj_Adi_inverseK(K.T, dKdx_dot_invKT_V)
+
+ dqdx_fix_Vq += -cupy.einsum('ij,Adj->Adi', R.T, invKT_dKdx_dot_invKT_V)
dqdx_fix_Vq *= -0.5
@@ -317,14 +841,13 @@ def dK_dot_q(q):
return dqdx_fix_Vq
-def get_dqsym_dx_fix_K_R(pcmobj, dm, atmlst, inverse_K, intopt_derivative):
+def get_dvgrids(pcmobj, dm, atmlst, intopt_derivative):
assert pcmobj._intermediates is not None
mol = pcmobj.mol
gridslice = pcmobj.surface['gslice_by_atom']
charge_exp = pcmobj.surface['charge_exp']
grid_coords = pcmobj.surface['grid_coords']
- R = pcmobj._intermediates['R']
atom_coords = mol.atom_coords(unit='B')
atom_charges = numpy.asarray(mol.atom_charges(), dtype=numpy.float64)
@@ -351,17 +874,24 @@ def get_dqsym_dx_fix_K_R(pcmobj, dm, atmlst, inverse_K, intopt_derivative):
g0,g1 = gridslice[i_atom]
dV_on_charge_dx[i_atom,:,g0:g1] -= dIdC[:,g0:g1]
- KR_symmetrized = 0.5 * (inverse_K @ R + R.T @ inverse_K.T)
- dqdx_fix_K_R = cupy.einsum('ij,Adj->Adi', KR_symmetrized, dV_on_charge_dx)
+ return dV_on_charge_dx
+
+def get_dqsym_dx_fix_K_R(pcmobj, dm, atmlst, intopt_derivative):
+ dV_on_charge_dx = get_dvgrids(pcmobj, dm, atmlst, intopt_derivative)
+ K = pcmobj._intermediates['K']
+ R = pcmobj._intermediates['R']
+ R_dVdx = cupy.einsum('ij,Adj->Adi', R, dV_on_charge_dx)
+ K_1_R_dVdx = einsum_ij_Adj_Adi_inverseK(K, R_dVdx)
+ K_1T_dVdx = einsum_ij_Adj_Adi_inverseK(K.T, dV_on_charge_dx)
+ RT_K_1T_dVdx = cupy.einsum('ij,Adj->Adi', R.T, K_1T_dVdx)
+ dqdx_fix_K_R = 0.5 * (K_1_R_dVdx + RT_K_1T_dVdx)
return dqdx_fix_K_R
def get_dqsym_dx(pcmobj, dm, atmlst, intopt_derivative):
- K = pcmobj._intermediates['K']
- inverse_K = cupy.linalg.inv(K)
- return get_dqsym_dx_fix_vgrids(pcmobj, atmlst, inverse_K) + get_dqsym_dx_fix_K_R(pcmobj, dm, atmlst, inverse_K, intopt_derivative)
+ return get_dqsym_dx_fix_vgrids(pcmobj, atmlst) + get_dqsym_dx_fix_K_R(pcmobj, dm, atmlst, intopt_derivative)
-def analytic_grad_vmat(pcmobj, dm, mo_coeff, mo_occ, atmlst=None, verbose=None):
+def analytical_grad_vmat(pcmobj, dm, mo_coeff, mo_occ, atmlst=None, verbose=None):
'''
dv_solv / da
'''
@@ -470,8 +1000,9 @@ def kernel(self, *args, dm=None, atmlst=None, **kwargs):
dm = dm[0] + dm[1]
is_equilibrium = self.base.with_solvent.equilibrium_solvation
self.base.with_solvent.equilibrium_solvation = True
- self.de_solvent = hess_elec(self.base.with_solvent, dm, verbose=self.verbose)
- #self.de_solvent+= hess_nuc(self.base.with_solvent)
+ self.de_solvent = analytical_hess_nuc(self.base.with_solvent, dm, verbose=self.verbose)
+ self.de_solvent += analytical_hess_qv(self.base.with_solvent, dm, verbose=self.verbose)
+ self.de_solvent += analytical_hess_solver(self.base.with_solvent, dm, verbose=self.verbose)
self.de_solute = super().kernel(*args, **kwargs)
self.de = self.de_solute + self.de_solvent
self.base.with_solvent.equilibrium_solvation = is_equilibrium
@@ -483,7 +1014,7 @@ def make_h1(self, mo_coeff, mo_occ, chkfile=None, atmlst=None, verbose=None):
h1ao = super().make_h1(mo_coeff, mo_occ, atmlst=atmlst, verbose=verbose)
if isinstance(self.base, scf.hf.RHF):
dm = self.base.make_rdm1(ao_repr=True)
- dv = analytic_grad_vmat(self.base.with_solvent, dm, mo_coeff, mo_occ, atmlst=atmlst, verbose=verbose)
+ dv = analytical_grad_vmat(self.base.with_solvent, dm, mo_coeff, mo_occ, atmlst=atmlst, verbose=verbose)
for i0, ia in enumerate(atmlst):
h1ao[i0] += dv[i0]
return h1ao
@@ -492,15 +1023,15 @@ def make_h1(self, mo_coeff, mo_occ, chkfile=None, atmlst=None, verbose=None):
solvent = self.base.with_solvent
dm = self.base.make_rdm1(ao_repr=True)
dm = dm[0] + dm[1]
- dva = analytic_grad_vmat(solvent, dm, mo_coeff[0], mo_occ[0], atmlst=atmlst, verbose=verbose)
- dvb = analytic_grad_vmat(solvent, dm, mo_coeff[1], mo_occ[1], atmlst=atmlst, verbose=verbose)
+ dva = analytical_grad_vmat(solvent, dm, mo_coeff[0], mo_occ[0], atmlst=atmlst, verbose=verbose)
+ dvb = analytical_grad_vmat(solvent, dm, mo_coeff[1], mo_occ[1], atmlst=atmlst, verbose=verbose)
for i0, ia in enumerate(atmlst):
h1aoa[i0] += dva[i0]
h1aob[i0] += dvb[i0]
return h1aoa, h1aob
else:
raise NotImplementedError('Base object is not supported')
-
+
def get_veff_resp_mo(self, mol, dms, mo_coeff, mo_occ, hermi=1):
v1vo = super().get_veff_resp_mo(mol, dms, mo_coeff, mo_occ, hermi=hermi)
if not self.base.with_solvent.equilibrium_solvation:
@@ -523,7 +1054,7 @@ def get_veff_resp_mo(self, mol, dms, mo_coeff, mo_occ, hermi=1):
else:
raise NotImplementedError('Base object is not supported')
return v1vo
-
+
def _finalize(self):
# disable _finalize. It is called in grad_method.kernel method
# where self.de was not yet initialized.
diff --git a/gpu4pyscf/solvent/hessian/smd.py b/gpu4pyscf/solvent/hessian/smd.py
index 49897d74..dafaa573 100644
--- a/gpu4pyscf/solvent/hessian/smd.py
+++ b/gpu4pyscf/solvent/hessian/smd.py
@@ -22,8 +22,6 @@
from gpu4pyscf import scf
from gpu4pyscf.lib import logger
from gpu4pyscf.solvent import smd
-from gpu4pyscf.solvent.grad import smd as smd_grad
-from gpu4pyscf.solvent.grad import pcm as pcm_grad
from gpu4pyscf.solvent.hessian import pcm as pcm_hess
from gpu4pyscf.hessian.jk import _ao2mo
@@ -60,45 +58,6 @@ def smd_grad_scanner(mol):
t1 = log.timer_debug1('solvent energy', *t1)
return hess_cds # hartree
-
-def hess_elec(smdobj, dm, verbose=None):
- '''
- slow version with finite difference
- TODO: use analytical hess_nuc
- '''
- log = logger.new_logger(smdobj, verbose)
- t1 = log.init_timer()
- pmol = smdobj.mol.copy()
- mol = pmol.copy()
- coords = mol.atom_coords(unit='Bohr')
-
- def pcm_grad_scanner(mol):
- # TODO: use more analytical forms
- smdobj.reset(mol)
- e, v = smdobj._get_vind(dm)
- #return grad_elec(smdobj, dm)
- grad = pcm_grad.grad_nuc(smdobj, dm)
- grad+= smd_grad.grad_solver(smdobj, dm)
- grad+= pcm_grad.grad_qv(smdobj, dm)
- return grad
-
- mol.verbose = 0
- de = np.zeros([mol.natm, mol.natm, 3, 3])
- eps = 1e-3
- for ia in range(mol.natm):
- for ix in range(3):
- dv = np.zeros_like(coords)
- dv[ia,ix] = eps
- mol.set_geom_(coords + dv, unit='Bohr')
- g0 = pcm_grad_scanner(mol)
-
- mol.set_geom_(coords - dv, unit='Bohr')
- g1 = pcm_grad_scanner(mol)
- de[ia,:,ix] = (g0 - g1)/2.0/eps
- t1 = log.timer_debug1('solvent energy', *t1)
- smdobj.reset(pmol)
- return de
-
def make_hess_object(hess_method):
'''For hess_method in vacuum, add nuclear Hessian of solvent smdobj'''
if hess_method.base.with_solvent.frozen:
@@ -140,8 +99,9 @@ def kernel(self, *args, dm=None, atmlst=None, **kwargs):
dm = dm[0] + dm[1]
is_equilibrium = self.base.with_solvent.equilibrium_solvation
self.base.with_solvent.equilibrium_solvation = True
- self.de_solvent = pcm_hess.hess_elec(self.base.with_solvent, dm, verbose=self.verbose)
- #self.de_solvent+= hess_nuc(self.base.with_solvent)
+ self.de_solvent = pcm_hess.analytical_hess_nuc(self.base.with_solvent, dm, verbose=self.verbose)
+ self.de_solvent += pcm_hess.analytical_hess_qv(self.base.with_solvent, dm, verbose=self.verbose)
+ self.de_solvent += pcm_hess.analytical_hess_solver(self.base.with_solvent, dm, verbose=self.verbose)
self.de_solute = super().kernel(*args, **kwargs)
self.de_cds = get_cds(self.base.with_solvent)
self.de = self.de_solute + self.de_solvent + self.de_cds
@@ -154,7 +114,7 @@ def make_h1(self, mo_coeff, mo_occ, chkfile=None, atmlst=None, verbose=None):
h1ao = super().make_h1(mo_coeff, mo_occ, atmlst=atmlst, verbose=verbose)
if isinstance(self.base, scf.hf.RHF):
dm = self.base.make_rdm1(ao_repr=True)
- dv = pcm_hess.analytic_grad_vmat(self.base.with_solvent, dm, mo_coeff, mo_occ, atmlst=atmlst, verbose=verbose)
+ dv = pcm_hess.analytical_grad_vmat(self.base.with_solvent, dm, mo_coeff, mo_occ, atmlst=atmlst, verbose=verbose)
for i0, ia in enumerate(atmlst):
h1ao[i0] += dv[i0]
return h1ao
@@ -163,8 +123,8 @@ def make_h1(self, mo_coeff, mo_occ, chkfile=None, atmlst=None, verbose=None):
solvent = self.base.with_solvent
dm = self.base.make_rdm1(ao_repr=True)
dm = dm[0] + dm[1]
- dva = pcm_hess.analytic_grad_vmat(solvent, dm, mo_coeff[0], mo_occ[0], atmlst=atmlst, verbose=verbose)
- dvb = pcm_hess.analytic_grad_vmat(solvent, dm, mo_coeff[1], mo_occ[1], atmlst=atmlst, verbose=verbose)
+ dva = pcm_hess.analytical_grad_vmat(solvent, dm, mo_coeff[0], mo_occ[0], atmlst=atmlst, verbose=verbose)
+ dvb = pcm_hess.analytical_grad_vmat(solvent, dm, mo_coeff[1], mo_occ[1], atmlst=atmlst, verbose=verbose)
for i0, ia in enumerate(atmlst):
h1aoa[i0] += dva[i0]
h1aob[i0] += dvb[i0]
diff --git a/gpu4pyscf/solvent/tests/test_pcm_grad.py b/gpu4pyscf/solvent/tests/test_pcm_grad.py
index f141ae56..c17e05f3 100644
--- a/gpu4pyscf/solvent/tests/test_pcm_grad.py
+++ b/gpu4pyscf/solvent/tests/test_pcm_grad.py
@@ -36,6 +36,7 @@ def setUpModule():
mol.basis = 'sto3g'
mol.output = '/dev/null'
mol.build(verbose=0)
+ # Warning: This system has all orbitals filled, which is FAR from physical
mol.nelectron = mol.nao * 2
epsilon = 35.9
lebedev_order = 3
@@ -169,11 +170,14 @@ def test_grad_IEFPCM(self):
def test_grad_SSVPE(self):
grad = _grad_with_solvent('SS(V)PE')
- g0 = numpy.asarray(
- [[ 3.42479745e-15, -1.00280742e-16, -1.61117735e+00],
- [ 1.07135985e+00, -6.97375148e-16, 8.05588676e-01],
- [-1.07135985e+00, 7.91425487e-16, 8.05588676e-01]]
- )
+ # Note: This reference value is obtained via finite difference with dx = 1e-5.
+ # Q-Chem 6.1 has a bug in its SS(V)PE gradient: it uses the IEF-PCM gradient
+ # algorithm to compute the SS(V)PE gradient, which is wrong.
+ g0 = numpy.asarray([
+ [ 0.00000000e+00, -7.10542736e-10, -1.63195623e+00],
+ [ 1.07705138e+00, 2.13162821e-09, 8.15978117e-01],
+ [-1.07705138e+00, -2.13162821e-09, 8.15978116e-01],
+ ])
print(f"Gradient error in RHF with SS(V)PE: {numpy.linalg.norm(g0 - grad)}")
assert numpy.linalg.norm(g0 - grad) < 1e-6
diff --git a/gpu4pyscf/solvent/tests/test_pcm_hessian.py b/gpu4pyscf/solvent/tests/test_pcm_hessian.py
index c7076f29..6e19ec96 100644
--- a/gpu4pyscf/solvent/tests/test_pcm_hessian.py
+++ b/gpu4pyscf/solvent/tests/test_pcm_hessian.py
@@ -21,7 +21,7 @@
from gpu4pyscf.solvent import pcm
from gpu4pyscf import scf, dft
from packaging import version
-from gpu4pyscf.solvent.hessian.pcm import analytic_grad_vmat
+from gpu4pyscf.solvent.hessian.pcm import analytical_grad_vmat, analytical_hess_nuc, analytical_hess_solver, analytical_hess_qv
from gpu4pyscf.lib.cupy_helper import contract
pyscf_25 = version.parse(pyscf.__version__) <= version.parse('2.5.0')
@@ -130,6 +130,37 @@ def pcm_vmat_scanner(mol):
pcmobj.reset(pmol)
return vmat
+def _fd_hess_contribution(pcmobj, dm, gradient_function):
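+ ''' Reference Hessian contribution by central finite differences of the
+ analytical gradient supplied as gradient_function:
+ de[ia, :, ix, :] = (g(x_{ia,ix} + eps) - g(x_{ia,ix} - eps)) / (2 eps)
+ '''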
+ pmol = pcmobj.mol.copy()
+ mol = pmol.copy()
+ coords = mol.atom_coords(unit='Bohr')
+
+ def pcm_grad_scanner(mol):
+ pcmobj.reset(mol)
+ e, v = pcmobj._get_vind(dm)
+ pcm_grad = gradient_function(pcmobj, dm)
+ # pcm_grad = grad_nuc(pcmobj, dm)
+ # pcm_grad+= grad_solver(pcmobj, dm)
+ # pcm_grad+= grad_qv(pcmobj, dm)
+ return pcm_grad
+
+ mol.verbose = 0
+ de = np.zeros([mol.natm, mol.natm, 3, 3])
+ eps = 1e-5
+ for ia in range(mol.natm):
+ for ix in range(3):
+ dv = np.zeros_like(coords)
+ dv[ia,ix] = eps
+ mol.set_geom_(coords + dv, unit='Bohr')
+ g0 = pcm_grad_scanner(mol)
+
+ mol.set_geom_(coords - dv, unit='Bohr')
+ g1 = pcm_grad_scanner(mol)
+
+ de[ia,:,ix,:] = (g0 - g1)/2.0/eps
+ pcmobj.reset(pmol)
+ return de
+
@unittest.skipIf(pcm.libsolvent is None, "solvent extension not compiled")
class KnownValues(unittest.TestCase):
def test_df_hess_cpcm(self):
@@ -192,7 +223,7 @@ def test_grad_vmat_cpcm(self):
mo_coeff = mf.mo_coeff
mo_occ = mf.mo_occ
- test_grad_vmat = analytic_grad_vmat(hobj.base.with_solvent, dm, mo_coeff, mo_occ)
+ test_grad_vmat = analytical_grad_vmat(hobj.base.with_solvent, dm, mo_coeff, mo_occ)
ref_grad_vmat = _fd_grad_vmat(hobj.base.with_solvent, dm, mo_coeff, mo_occ)
cp.testing.assert_allclose(ref_grad_vmat, test_grad_vmat, atol = 1e-10)
@@ -206,7 +237,7 @@ def test_grad_vmat_iefpcm(self):
mo_coeff = mf.mo_coeff
mo_occ = mf.mo_occ
- test_grad_vmat = analytic_grad_vmat(hobj.base.with_solvent, dm, mo_coeff, mo_occ)
+ test_grad_vmat = analytical_grad_vmat(hobj.base.with_solvent, dm, mo_coeff, mo_occ)
ref_grad_vmat = _fd_grad_vmat(hobj.base.with_solvent, dm, mo_coeff, mo_occ)
cp.testing.assert_allclose(ref_grad_vmat, test_grad_vmat, atol = 1e-10)
@@ -220,11 +251,71 @@ def test_grad_vmat_ssvpe(self):
mo_coeff = mf.mo_coeff
mo_occ = mf.mo_occ
- test_grad_vmat = analytic_grad_vmat(hobj.base.with_solvent, dm, mo_coeff, mo_occ)
+ test_grad_vmat = analytical_grad_vmat(hobj.base.with_solvent, dm, mo_coeff, mo_occ)
ref_grad_vmat = _fd_grad_vmat(hobj.base.with_solvent, dm, mo_coeff, mo_occ)
cp.testing.assert_allclose(ref_grad_vmat, test_grad_vmat, atol = 1e-10)
+ def test_hess_nuc_iefpcm(self):
+ print("testing IEF-PCM d2E_nuc/dx2")
+ mf = _make_mf(method='IEF-PCM')
+ hobj = mf.Hessian()
+ dm = mf.make_rdm1()
+
+ test_hess = analytical_hess_nuc(hobj.base.with_solvent, dm)
+ from gpu4pyscf.solvent.grad.pcm import grad_nuc
+ ref_hess = _fd_hess_contribution(hobj.base.with_solvent, dm, grad_nuc)
+
+ cp.testing.assert_allclose(ref_hess, test_hess, atol = 1e-10)
+
+ def test_hess_qv_iefpcm(self):
+ print("testing IEF-PCM d2E_elec/dx2")
+ mf = _make_mf(method='IEF-PCM')
+ hobj = mf.Hessian()
+ dm = mf.make_rdm1()
+
+ test_hess = analytical_hess_qv(hobj.base.with_solvent, dm)
+ from gpu4pyscf.solvent.grad.pcm import grad_qv
+ ref_hess = _fd_hess_contribution(hobj.base.with_solvent, dm, grad_qv)
+
+ cp.testing.assert_allclose(ref_hess, test_hess, atol = 1e-10)
+
+ def test_hess_solver_cpcm(self):
+ print("testing C-PCM d2E_KR/dx2")
+ mf = _make_mf(method='C-PCM')
+ hobj = mf.Hessian()
+ dm = mf.make_rdm1()
+
+ test_hess = analytical_hess_solver(hobj.base.with_solvent, dm)
+ from gpu4pyscf.solvent.grad.pcm import grad_solver
+ ref_hess = _fd_hess_contribution(hobj.base.with_solvent, dm, grad_solver)
+
+ cp.testing.assert_allclose(ref_hess, test_hess, atol = 1e-10)
+
+ def test_hess_solver_iefpcm(self):
+ print("testing IEF-PCM d2E_KR/dx2")
+ mf = _make_mf(method='IEF-PCM')
+ hobj = mf.Hessian()
+ dm = mf.make_rdm1()
+
+ test_hess = analytical_hess_solver(hobj.base.with_solvent, dm)
+ from gpu4pyscf.solvent.grad.pcm import grad_solver
+ ref_hess = _fd_hess_contribution(hobj.base.with_solvent, dm, grad_solver)
+
+ cp.testing.assert_allclose(ref_hess, test_hess, atol = 1e-10)
+
+ def test_hess_solver_ssvpe(self):
+ print("testing SS(V)PE d2E_KR/dx2")
+ mf = _make_mf(method='SS(V)PE')
+ hobj = mf.Hessian()
+ dm = mf.make_rdm1()
+
+ test_hess = analytical_hess_solver(hobj.base.with_solvent, dm)
+ from gpu4pyscf.solvent.grad.pcm import grad_solver
+ ref_hess = _fd_hess_contribution(hobj.base.with_solvent, dm, grad_solver)
+
+ cp.testing.assert_allclose(ref_hess, test_hess, atol = 1e-10)
+
@pytest.mark.skipif(pyscf_25, reason='requires pyscf 2.6 or higher')
def test_to_gpu(self):
import pyscf