diff --git a/.github/workflows/pypi_wheel.yml b/.github/workflows/pypi_wheel.yml
index e350cd40..bf0565af 100644
--- a/.github/workflows/pypi_wheel.yml
+++ b/.github/workflows/pypi_wheel.yml
@@ -28,7 +28,7 @@ jobs:
         ls ${{ github.workspace }}/wheelhouse
     - name: Publish to PyPI
       run: |
-        pip install twine
+        pip install twine==6.0.1
         export TWINE_USERNAME=__token__
         export TWINE_PASSWORD="${{ secrets.PYPI_API_TOKEN }}"
         twine upload --verbose "${{ github.workspace }}/wheelhouse/*"
@@ -51,7 +51,7 @@ jobs:
         ls ${{ github.workspace }}/wheelhouse
     - name: Publish to PyPI
       run: |
-        pip install twine
+        pip install twine==6.0.1
         export TWINE_USERNAME=__token__
         export TWINE_PASSWORD="${{ secrets.PYPI_API_TOKEN }}"
         twine upload --verbose "${{ github.workspace }}/wheelhouse/*"
@@ -66,7 +66,7 @@ jobs:
         python3 setup.py sdist
     - name: Publish to PyPI
       run: |
-        pip install twine
+        pip install twine==6.0.1
         export TWINE_USERNAME=__token__
         export TWINE_PASSWORD="${{ secrets.PYPI_API_TOKEN }}"
         twine upload --verbose "${{ github.workspace }}/dist/*"
diff --git a/CHANGELOG b/CHANGELOG
index 7f747686..a95f5108 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -1,3 +1,12 @@
+v1.3.1 (2025-02-04)
+-------------------
+* New Features
+  - Analytical Hessian for PCM solvent model
+  - Driver for 3c methods (wB97X-3c, r2SCAN-3c, B97-3c, etc.)
+* Improvements
+  - Preconditioner and computational efficiency of Davidson iterations for TDDFT
+
+
 v1.3.0 (2025-01-07)
 -------------------
 * New Features
diff --git a/examples/40-all_electron_scf.py b/examples/40-all_electron_scf.py
new file mode 100644
index 00000000..a33f2953
--- /dev/null
+++ b/examples/40-all_electron_scf.py
@@ -0,0 +1,50 @@
+#!/usr/bin/env python
+# Copyright 2025 The PySCF Developers. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+'''
+All-electron Gamma-point and k-point sampled Hartree-Fock/DFT using density fitting approximation
+'''
+
+import numpy as np
+import pyscf
+
+cell = pyscf.M(
+    a = np.eye(3)*3.5668,
+    atom = '''C 0. 0. 0.
+              C 0.8917 0.8917 0.8917
+              C 1.7834 1.7834 0.
+              C 2.6751 2.6751 0.8917
+              C 1.7834 0. 1.7834
+              C 2.6751 0.8917 2.6751
+              C 0. 
1.7834 1.7834 + C 0.8917 2.6751 2.6751''', + basis = 'ccpvdz', + verbose = 5, +) + +# +# Gamma point HF and DFT +# +mf = cell.RHF().to_gpu().density_fit().run() + +mf = cell.RKS(xc='pbe0').to_gpu().density_fit().run() + +# +# K-point sampled HF and DFT +# +kpts = cell.make_kpts([2,2,2]) +kmf = cell.KRHF(kpts=kpts).to_gpu().density_fit().run() + +kmf = cell.KRKS(xc='pbe0', kpts=kpts).to_gpu().density_fit().run() diff --git a/gpu4pyscf/__config__.py b/gpu4pyscf/__config__.py index 1bb12f85..bfd8d8d8 100644 --- a/gpu4pyscf/__config__.py +++ b/gpu4pyscf/__config__.py @@ -14,11 +14,11 @@ import cupy -_num_devices = cupy.cuda.runtime.getDeviceCount() +num_devices = cupy.cuda.runtime.getDeviceCount() # TODO: switch to non_blocking stream (currently blocked by libxc) -_streams = [None] * _num_devices -for device_id in range(_num_devices): +_streams = [None] * num_devices +for device_id in range(num_devices): with cupy.cuda.Device(device_id): _streams[device_id] = cupy.cuda.stream.Stream(non_blocking=False) @@ -38,11 +38,16 @@ mem_fraction = 0.9 cupy.get_default_memory_pool().set_limit(fraction=mem_fraction) +if props['sharedMemPerBlockOptin'] > 65536: + shm_size = props['sharedMemPerBlockOptin'] +else: + shm_size = props['sharedMemPerBlock'] + # Check P2P data transfer is available _p2p_access = True -if _num_devices > 1: - for src in range(_num_devices): - for dst in range(_num_devices): +if num_devices > 1: + for src in range(num_devices): + for dst in range(num_devices): if src != dst: can_access_peer = cupy.cuda.runtime.deviceCanAccessPeer(src, dst) _p2p_access &= can_access_peer diff --git a/gpu4pyscf/__init__.py b/gpu4pyscf/__init__.py index af2c2982..4cd95fbc 100644 --- a/gpu4pyscf/__init__.py +++ b/gpu4pyscf/__init__.py @@ -12,6 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = '1.3.0' +__version__ = '1.3.1' from . 
import lib, grad, hessian, solvent, scf, dft, tdscf diff --git a/gpu4pyscf/df/df.py b/gpu4pyscf/df/df.py index da61804c..c58c1428 100644 --- a/gpu4pyscf/df/df.py +++ b/gpu4pyscf/df/df.py @@ -25,7 +25,7 @@ from gpu4pyscf.df import int3c2e, df_jk from gpu4pyscf.lib import logger from gpu4pyscf import __config__ -from gpu4pyscf.__config__ import _streams, _num_devices +from gpu4pyscf.__config__ import _streams, num_devices MIN_BLK_SIZE = getattr(__config__, 'min_ao_blksize', 128) ALIGNED = getattr(__config__, 'ao_aligned', 32) @@ -218,7 +218,7 @@ def cholesky_eri_gpu(intopt, mol, auxmol, cd_low, # CDERI will be equally distributed to the devices # Other devices usually have more memory available than Device 0 # CDERI will use up to 40% of the available memory - use_gpu_memory = naux * npairs * 8 < 0.4 * avail_mem * _num_devices + use_gpu_memory = naux * npairs * 8 < 0.4 * avail_mem * num_devices if use_gpu_memory: log.debug("Saving CDERI on GPU") @@ -226,9 +226,9 @@ def cholesky_eri_gpu(intopt, mol, auxmol, cd_low, log.debug("Saving CDERI on CPU") _cderi = {} - aux_blksize = (naux + _num_devices - 1) // _num_devices + aux_blksize = (naux + num_devices - 1) // num_devices aux_blksize = (aux_blksize + ALIGNED - 1) // ALIGNED * ALIGNED - for device_id in range(_num_devices): + for device_id in range(num_devices): p0 = min(aux_blksize*device_id, naux) p1 = min(aux_blksize*(device_id+1), naux) #for device_id, (p0,p1) in enumerate(lib.prange(0, naux, aux_blksize)): @@ -246,16 +246,16 @@ def cholesky_eri_gpu(intopt, mol, auxmol, cd_low, npairs_per_ctr = np.array(npairs_per_ctr) total_task_list = np.argsort(npairs_per_ctr) task_list_per_device = [] - for device_id in range(_num_devices): - task_list_per_device.append(total_task_list[device_id::_num_devices]) + for device_id in range(num_devices): + task_list_per_device.append(total_task_list[device_id::num_devices]) cd_low_f = cupy.array(cd_low, order='F', copy=False) cd_low_f = tag_array(cd_low_f, tag=cd_low.tag) cupy.cuda.get_current_stream().synchronize() futures = [] - with ThreadPoolExecutor(max_workers=_num_devices) as executor: - for device_id in range(_num_devices): + with ThreadPoolExecutor(max_workers=num_devices) as executor: + for device_id in range(num_devices): task_list = task_list_per_device[device_id] future = executor.submit(_cderi_task, intopt, cd_low_f, task_list, _cderi, aux_blksize, omega=omega, sr_only=sr_only, device_id=device_id) @@ -352,7 +352,7 @@ def _cderi_task(intopt, cd_low, task_list, _cderi, aux_blksize, for slice_id, (p0,p1) in enumerate(lib.prange(0, naux, aux_blksize)): tmp = cupy.array(cderi_block[p0:p1], order='C', copy=True) copy_array(tmp, _cderi[slice_id][:p1-p0,ij0:ij1]) - elif _num_devices > 1: + elif num_devices > 1: # Multi-GPU case, copy data to other Devices for dev_id, (p0,p1) in enumerate(lib.prange(0, naux, aux_blksize)): # Making a copy for contiguous data transfer diff --git a/gpu4pyscf/df/df_jk.py b/gpu4pyscf/df/df_jk.py index 5561cf9c..66f1dd49 100644 --- a/gpu4pyscf/df/df_jk.py +++ b/gpu4pyscf/df/df_jk.py @@ -26,7 +26,7 @@ from gpu4pyscf.dft import rks, uks, numint from gpu4pyscf.scf import hf, uhf from gpu4pyscf.df import df, int3c2e -from gpu4pyscf.__config__ import _streams, _num_devices +from gpu4pyscf.__config__ import _streams, num_devices def _pin_memory(array): mem = cupy.cuda.alloc_pinned_memory(array.nbytes) @@ -453,8 +453,8 @@ def get_jk(dfobj, dms_tag, hermi=0, with_j=True, with_k=True, direct_scf_tol=1e- mo_coeff = intopt.sort_orbitals(mo_coeff, axis=[1]) futures = [] - with 
ThreadPoolExecutor(max_workers=_num_devices) as executor: - for device_id in range(_num_devices): + with ThreadPoolExecutor(max_workers=num_devices) as executor: + for device_id in range(num_devices): future = executor.submit( _jk_task_with_mo, dfobj, dms, mo_coeff, mo_occ, @@ -474,8 +474,8 @@ def get_jk(dfobj, dms_tag, hermi=0, with_j=True, with_k=True, direct_scf_tol=1e- mo1s = [intopt.sort_orbitals(mo1, axis=[1]) for mo1 in mo1s] futures = [] - with ThreadPoolExecutor(max_workers=_num_devices) as executor: - for device_id in range(_num_devices): + with ThreadPoolExecutor(max_workers=num_devices) as executor: + for device_id in range(num_devices): future = executor.submit( _jk_task_with_mo1, dfobj, dms, mo1s, occ_coeffs, @@ -486,8 +486,8 @@ def get_jk(dfobj, dms_tag, hermi=0, with_j=True, with_k=True, direct_scf_tol=1e- # general K matrix with density matrix else: futures = [] - with ThreadPoolExecutor(max_workers=_num_devices) as executor: - for device_id in range(_num_devices): + with ThreadPoolExecutor(max_workers=num_devices) as executor: + for device_id in range(num_devices): future = executor.submit( _jk_task_with_dm, dfobj, dms, hermi=hermi, device_id=device_id, diff --git a/gpu4pyscf/df/grad/jk.py b/gpu4pyscf/df/grad/jk.py index 2bbf9d9e..4595af65 100644 --- a/gpu4pyscf/df/grad/jk.py +++ b/gpu4pyscf/df/grad/jk.py @@ -18,7 +18,7 @@ from gpu4pyscf.df.int3c2e import get_int3c2e_ip_jk, VHFOpt, _split_tasks from gpu4pyscf.lib.cupy_helper import contract, concatenate, reduce_to_device from gpu4pyscf.lib import logger -from gpu4pyscf.__config__ import _streams, _num_devices +from gpu4pyscf.__config__ import _streams, num_devices def _jk_task(with_df, dm, orbo, with_j=True, with_k=True, device_id=0): ''' # (L|ij) -> rhoj: (L), rhok: (L|oo) @@ -61,8 +61,8 @@ def get_rhojk(with_df, dm, orbo, with_j=True, with_k=True): ''' futures = [] cupy.cuda.get_current_stream().synchronize() - with ThreadPoolExecutor(max_workers=_num_devices) as executor: - for device_id in range(_num_devices): + with ThreadPoolExecutor(max_workers=num_devices) as executor: + for device_id in range(num_devices): future = executor.submit( _jk_task, with_df, dm, orbo, with_j=with_j, with_k=with_k, device_id=device_id) @@ -161,12 +161,12 @@ def get_grad_vjk(with_df, mol, auxmol, rhoj_cart, dm_cart, rhok_cart, orbo_cart, aux_ao_loc = np.array(intopt.aux_ao_loc) loads = aux_ao_loc[1:] - aux_ao_loc[:-1] - task_list = _split_tasks(loads, _num_devices) + task_list = _split_tasks(loads, num_devices) futures = [] cupy.cuda.get_current_stream().synchronize() - with ThreadPoolExecutor(max_workers=_num_devices) as executor: - for device_id in range(_num_devices): + with ThreadPoolExecutor(max_workers=num_devices) as executor: + for device_id in range(num_devices): future = executor.submit( _jk_ip_task, intopt, rhoj_cart, dm_cart, rhok_cart, orbo_cart, task_list[device_id], with_j=with_j, with_k=with_k, device_id=device_id, omega=omega) diff --git a/gpu4pyscf/df/hessian/jk.py b/gpu4pyscf/df/hessian/jk.py index 40ab3bfd..5baff1d0 100644 --- a/gpu4pyscf/df/hessian/jk.py +++ b/gpu4pyscf/df/hessian/jk.py @@ -23,7 +23,7 @@ from gpu4pyscf.hessian.jk import _ao2mo from gpu4pyscf.lib import logger from gpu4pyscf.lib.cupy_helper import contract, cart2sph, reduce_to_device -from gpu4pyscf.__config__ import _streams, _num_devices +from gpu4pyscf.__config__ import _streams, num_devices NROOT_ON_GPU = 7 @@ -171,8 +171,8 @@ def get_jk(dfobj, dms_tag, mo_coeff, mocc, hermi=0, mo_coeff = [intopt.sort_orbitals(mo, axis=[0]) for mo in mo_coeff] 
futures = [] - with ThreadPoolExecutor(max_workers=_num_devices) as executor: - for device_id in range(_num_devices): + with ThreadPoolExecutor(max_workers=num_devices) as executor: + for device_id in range(num_devices): future = executor.submit( _jk_task_with_mo1, dfobj, dms, mo_coeff, mo1s, occ_coeffs, @@ -415,12 +415,12 @@ def get_int3c2e_hjk(intopt, rhoj, rhok, dm0_tag, with_j=True, with_k=True, ncp_ij = len(intopt.log_qs) tasks = np.array(list(itertools.product(range(ncp_k), range(ncp_ij)))) task_list = [] - for device_id in range(_num_devices): - task_list.append(tasks[device_id::_num_devices]) + for device_id in range(num_devices): + task_list.append(tasks[device_id::num_devices]) cupy.cuda.get_current_stream().synchronize() - with ThreadPoolExecutor(max_workers=_num_devices) as executor: - for device_id in range(_num_devices): + with ThreadPoolExecutor(max_workers=num_devices) as executor: + for device_id in range(num_devices): future = executor.submit( _int3c2e_ipip_tasks, intopt, task_list[device_id], rhoj, rhok, dm0_tag, orbo, with_j=with_j, with_k=with_k, diff --git a/gpu4pyscf/df/hessian/rks.py b/gpu4pyscf/df/hessian/rks.py index e0d5cd90..321c9654 100644 --- a/gpu4pyscf/df/hessian/rks.py +++ b/gpu4pyscf/df/hessian/rks.py @@ -46,8 +46,8 @@ def partial_hess_elec(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, mocc = mo_coeff[:,mo_occ>0] dm0 = numpy.dot(mocc, mocc.T) * 2 - if mf.nlc != '': - raise NotImplementedError + if mf.do_nlc(): + raise NotImplementedError("2nd derivative of NLC is not implemented.") omega, alpha, hyb = mf._numint.rsh_and_hybrid_coeff(mf.xc, spin=mol.spin) with_k = mf._numint.libxc.is_hybrid_xc(mf.xc) diff --git a/gpu4pyscf/df/hessian/uks.py b/gpu4pyscf/df/hessian/uks.py index 059f571c..99661740 100644 --- a/gpu4pyscf/df/hessian/uks.py +++ b/gpu4pyscf/df/hessian/uks.py @@ -48,8 +48,8 @@ def partial_hess_elec(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, moccb = mo_coeff[1][:,mo_occ[1]>0] dm0a = numpy.dot(mocca, mocca.T) dm0b = numpy.dot(moccb, moccb.T) - if mf.nlc != '': - raise NotImplementedError + if mf.do_nlc(): + raise NotImplementedError("2nd derivative of NLC is not implemented.") omega, alpha, hyb = mf._numint.rsh_and_hybrid_coeff(mf.xc, spin=mol.spin) with_k = mf._numint.libxc.is_hybrid_xc(mf.xc) diff --git a/gpu4pyscf/df/int3c2e.py b/gpu4pyscf/df/int3c2e.py index e77e30ca..28e7e49e 100644 --- a/gpu4pyscf/df/int3c2e.py +++ b/gpu4pyscf/df/int3c2e.py @@ -24,7 +24,7 @@ reduce_to_device, copy_array, transpose_sum) from gpu4pyscf.lib import logger from gpu4pyscf.gto.mole import basis_seg_contraction -from gpu4pyscf.__config__ import _num_devices, _streams +from gpu4pyscf.__config__ import num_devices, _streams LMAX_ON_GPU = 8 FREE_CUPY_CACHE = True @@ -824,11 +824,11 @@ def get_int3c2e_jk(mol, auxmol, dm0_tag, with_k=True, omega=None): futures = [] aux_ao_loc = np.array(intopt.aux_ao_loc) loads = aux_ao_loc[1:] - aux_ao_loc[:-1] - task_list = _split_tasks(loads, _num_devices) + task_list = _split_tasks(loads, num_devices) cupy.cuda.get_current_stream().synchronize() - with ThreadPoolExecutor(max_workers=_num_devices) as executor: - for device_id in range(_num_devices): + with ThreadPoolExecutor(max_workers=num_devices) as executor: + for device_id in range(num_devices): future = executor.submit( _int3c2e_jk_task, intopt, task_list[device_id], dm0_tag, orbo, device_id=device_id, omega=omega) @@ -935,11 +935,11 @@ def get_int3c2e_ip1_vjk(intopt, rhoj, rhok, dm0_tag, aoslices, with_j=True, aux_ao_loc = np.array(intopt.aux_ao_loc) loads = 
aux_ao_loc[1:] - aux_ao_loc[:-1] - task_list = _split_tasks(loads, _num_devices) + task_list = _split_tasks(loads, num_devices) cupy.cuda.get_current_stream().synchronize() - with ThreadPoolExecutor(max_workers=_num_devices) as executor: - for device_id in range(_num_devices): + with ThreadPoolExecutor(max_workers=num_devices) as executor: + for device_id in range(num_devices): future = executor.submit( _int3c2e_ip1_vjk_task, intopt, task_list[device_id], rhoj, rhok, dm0_tag, orbo, with_j=with_j, with_k=with_k, @@ -1033,11 +1033,11 @@ def get_int3c2e_ip2_vjk(intopt, rhoj, rhok, dm0_tag, auxslices, aux_ao_loc = np.array(intopt.aux_ao_loc) loads = aux_ao_loc[1:] - aux_ao_loc[:-1] - task_list = _split_tasks(loads, _num_devices) + task_list = _split_tasks(loads, num_devices) cupy.cuda.get_current_stream().synchronize() - with ThreadPoolExecutor(max_workers=_num_devices) as executor: - for device_id in range(_num_devices): + with ThreadPoolExecutor(max_workers=num_devices) as executor: + for device_id in range(num_devices): future = executor.submit( _int3c2e_ip2_vjk_task, intopt, task_list[device_id], rhoj, rhok, dm0_tag, orbo, with_j=with_j, @@ -1096,7 +1096,7 @@ def get_int3c2e_ip1_wjk(intopt, dm0_tag, with_k=True, omega=None): aux_ao_loc = np.array(intopt.aux_ao_loc) loads = aux_ao_loc[1:] - aux_ao_loc[:-1] - task_list = _split_tasks(loads, _num_devices) + task_list = _split_tasks(loads, num_devices) nao = intopt.mol.nao naux = intopt.auxmol.nao @@ -1107,8 +1107,8 @@ def get_int3c2e_ip1_wjk(intopt, dm0_tag, with_k=True, omega=None): wk = np.ndarray([naux,nao,nocc,3], dtype=np.float64, order='C', buffer=mem) cupy.cuda.get_current_stream().synchronize() - with ThreadPoolExecutor(max_workers=_num_devices) as executor: - for device_id in range(_num_devices): + with ThreadPoolExecutor(max_workers=num_devices) as executor: + for device_id in range(num_devices): future = executor.submit( _int3c2e_ip1_wjk_task, intopt, task_list[device_id], dm0_tag, orbo, wk, with_k=with_k, device_id=device_id, omega=omega) @@ -1156,11 +1156,11 @@ def get_int3c2e_ip2_wjk(intopt, dm0_tag, with_k=True, omega=None): aux_ao_loc = np.array(intopt.aux_ao_loc) loads = aux_ao_loc[1:] - aux_ao_loc[:-1] - task_list = _split_tasks(loads, _num_devices) + task_list = _split_tasks(loads, num_devices) cupy.cuda.get_current_stream().synchronize() - with ThreadPoolExecutor(max_workers=_num_devices) as executor: - for device_id in range(_num_devices): + with ThreadPoolExecutor(max_workers=num_devices) as executor: + for device_id in range(num_devices): future = executor.submit( _int3c2e_ip2_wjk, intopt, task_list[device_id], dm0_tag, orbo, with_k=with_k, device_id=device_id, omega=omega) diff --git a/gpu4pyscf/dft/gen_grid.py b/gpu4pyscf/dft/gen_grid.py index 2908b9a3..9dd1d813 100644 --- a/gpu4pyscf/dft/gen_grid.py +++ b/gpu4pyscf/dft/gen_grid.py @@ -30,9 +30,10 @@ import cupy from pyscf import lib from pyscf import gto +from pyscf.dft import gen_grid as gen_grid_cpu +from gpu4pyscf.lib import utils from pyscf.gto.eval_gto import BLKSIZE, NBINS, CUTOFF, make_screen_index from pyscf import __config__ -from cupyx.scipy.spatial.distance import cdist from gpu4pyscf.lib import logger from gpu4pyscf.dft import radi from gpu4pyscf.lib.cupy_helper import load_library @@ -72,13 +73,17 @@ def sg1_prune(nuc, rads, n_ang, radii=radi.SG1RADII): ''' # In SG1 the ang grids for the five regions # 6 38 86 194 86 - leb_ngrid = cupy.array([6, 38, 86, 194, 86]) - alphas = cupy.array(( + if nuc >= 19: + return 194 * numpy.ones_like(rads, 
dtype=numpy.int64) + + leb_ngrid = numpy.array([6, 38, 86, 194, 86], dtype=numpy.int64) + alphas = numpy.array(( (0.25 , 0.5, 1.0, 4.5), (0.1667, 0.5, 0.9, 3.5), (0.1 , 0.4, 0.8, 2.5))) + r_atom = radii[nuc] + 1e-200 - rads = cupy.asarray(rads) + rads = numpy.asarray(rads) if nuc <= 2: # H, He place = ((rads/r_atom).reshape(-1,1) > alphas[0]).sum(axis=1) elif nuc <= 10: # Li - Ne @@ -463,8 +468,6 @@ def _load_conf(mod, name, default): else: return var -from pyscf.dft import gen_grid -from gpu4pyscf.lib import utils class Grids(lib.StreamObject): from gpu4pyscf.lib.utils import to_gpu, device @@ -481,9 +484,10 @@ class Grids(lib.StreamObject): level = getattr(__config__, 'dft_gen_grid_Grids_level', 3) alignment = ALIGNMENT_UNIT cutoff = CUTOFF - _keys = gen_grid.Grids._keys + _keys = gen_grid_cpu.Grids._keys - __init__ = gen_grid.Grids.__init__ + __init__ = gen_grid_cpu.Grids.__init__ + dump_flags = gen_grid_cpu.Grids.dump_flags def __setattr__(self, key, val): if key in ('atom_grid', 'atomic_radii', 'radii_adjust', 'radi_method', @@ -581,12 +585,12 @@ def prune_by_density_(self, rho, threshold=0): return self def to_cpu(self): - grids = gen_grid.Grids(self.mol) + grids = gen_grid_cpu.Grids(self.mol) utils.to_cpu(self, out=grids) return grids -_default_rad = gen_grid._default_rad -RAD_GRIDS = gen_grid.RAD_GRIDS -_default_ang = gen_grid._default_ang -ANG_ORDER = gen_grid.ANG_ORDER -_padding_size = gen_grid._padding_size +_default_rad = gen_grid_cpu._default_rad +RAD_GRIDS = gen_grid_cpu.RAD_GRIDS +_default_ang = gen_grid_cpu._default_ang +ANG_ORDER = gen_grid_cpu.ANG_ORDER +_padding_size = gen_grid_cpu._padding_size diff --git a/gpu4pyscf/dft/numint.py b/gpu4pyscf/dft/numint.py index bf6c65c9..bb98e857 100644 --- a/gpu4pyscf/dft/numint.py +++ b/gpu4pyscf/dft/numint.py @@ -28,7 +28,7 @@ from gpu4pyscf.dft import xc_deriv, xc_alias, libxc from gpu4pyscf import __config__ from gpu4pyscf.lib import logger -from gpu4pyscf.__config__ import _streams, _num_devices +from gpu4pyscf.__config__ import _streams, num_devices LMAX_ON_GPU = 6 BAS_ALIGNED = 1 @@ -395,7 +395,7 @@ def gen_grid_range(ngrids, device_id, blksize=MIN_BLK_SIZE): ''' Calculate the range of grids assigned the given device ''' - ngrids_per_device = (ngrids + _num_devices - 1) // _num_devices + ngrids_per_device = (ngrids + num_devices - 1) // num_devices ngrids_per_device = (ngrids_per_device + blksize - 1) // blksize * blksize grid_start = min(device_id * ngrids_per_device, ngrids) grid_end = min((device_id + 1) * ngrids_per_device, ngrids) @@ -523,8 +523,8 @@ def nr_rks(ni, mol, grids, xc_code, dms, relativity=0, hermi=1, release_gpu_stack() cupy.cuda.get_current_stream().synchronize() futures = [] - with ThreadPoolExecutor(max_workers=_num_devices) as executor: - for device_id in range(_num_devices): + with ThreadPoolExecutor(max_workers=num_devices) as executor: + for device_id in range(num_devices): future = executor.submit( _nr_rks_task, ni, mol, grids, xc_code, dms, mo_coeff, mo_occ, @@ -914,8 +914,8 @@ def nr_uks(ni, mol, grids, xc_code, dms, relativity=0, hermi=1, release_gpu_stack() cupy.cuda.get_current_stream().synchronize() futures = [] - with ThreadPoolExecutor(max_workers=_num_devices) as executor: - for device_id in range(_num_devices): + with ThreadPoolExecutor(max_workers=num_devices) as executor: + for device_id in range(num_devices): future = executor.submit( _nr_uks_task, ni, mol, grids, xc_code, (dma,dmb), mo_coeff, mo_occ, @@ -1026,7 +1026,7 @@ def _nr_rks_fxc_task(ni, mol, grids, xc_code, fxc, dms, mo1, 
occ_coeff, ao_deriv = 1 ngrids_glob = grids.coords.shape[0] - ngrids_per_device = (ngrids_glob + _num_devices - 1) // _num_devices + ngrids_per_device = (ngrids_glob + num_devices - 1) // num_devices ngrids_per_device = (ngrids_per_device + MIN_BLK_SIZE - 1) // MIN_BLK_SIZE * MIN_BLK_SIZE grid_start = min(device_id * ngrids_per_device, ngrids_glob) grid_end = min((device_id + 1) * ngrids_per_device, ngrids_glob) @@ -1108,8 +1108,8 @@ def nr_rks_fxc(ni, mol, grids, xc_code, dm0=None, dms=None, relativity=0, hermi= futures = [] cupy.cuda.get_current_stream().synchronize() - with ThreadPoolExecutor(max_workers=_num_devices) as executor: - for device_id in range(_num_devices): + with ThreadPoolExecutor(max_workers=num_devices) as executor: + for device_id in range(num_devices): future = executor.submit( _nr_rks_fxc_task, ni, mol, grids, xc_code, fxc, dms, mo1, occ_coeff, @@ -1178,7 +1178,7 @@ def _nr_uks_fxc_task(ni, mol, grids, xc_code, fxc, dms, mo1, occ_coeff, ao_deriv = 1 ngrids_glob = grids.coords.shape[0] - ngrids_per_device = (ngrids_glob + _num_devices - 1) // _num_devices + ngrids_per_device = (ngrids_glob + num_devices - 1) // num_devices ngrids_per_device = (ngrids_per_device + MIN_BLK_SIZE - 1) // MIN_BLK_SIZE * MIN_BLK_SIZE grid_start = min(device_id * ngrids_per_device, ngrids_glob) grid_end = min((device_id + 1) * ngrids_per_device, ngrids_glob) @@ -1277,8 +1277,8 @@ def nr_uks_fxc(ni, mol, grids, xc_code, dm0=None, dms=None, relativity=0, hermi= futures = [] cupy.cuda.get_current_stream().synchronize() - with ThreadPoolExecutor(max_workers=_num_devices) as executor: - for device_id in range(_num_devices): + with ThreadPoolExecutor(max_workers=num_devices) as executor: + for device_id in range(num_devices): future = executor.submit( _nr_uks_fxc_task, ni, mol, grids, xc_code, fxc, (dma, dmb), mo1, occ_coeff, diff --git a/gpu4pyscf/dft/uks.py b/gpu4pyscf/dft/uks.py index 5e11bb81..4d561e62 100644 --- a/gpu4pyscf/dft/uks.py +++ b/gpu4pyscf/dft/uks.py @@ -16,7 +16,7 @@ from pyscf.dft import uks as uks_cpu from pyscf import lib from gpu4pyscf.lib import logger -from gpu4pyscf.dft import numint, gen_grid, rks +from gpu4pyscf.dft import rks from gpu4pyscf.scf import hf, uhf from gpu4pyscf.lib.cupy_helper import tag_array from gpu4pyscf.lib import utils diff --git a/gpu4pyscf/drivers/basis_vDZP_NWCHEM.dat b/gpu4pyscf/drivers/basis_vDZP_NWCHEM.dat new file mode 100644 index 00000000..1fc10e1e --- /dev/null +++ b/gpu4pyscf/drivers/basis_vDZP_NWCHEM.dat @@ -0,0 +1,2310 @@ +BASIS "ao basis" PRINT +#BASIS SET: +H S + 81.886780875039 0.008423954179 + 12.231063861388 0.064861285350 + 2.786815144183 0.311400883616 + 0.775786677408 0.985308081721 + 0.223433692783 1.256819962883 +H S + 0.331097483644 0.052292300794 + 0.107455350812 0.104139302794 + 0.050680508365 0.245115714360 +H P + 1.417043684193 0.759765848611 + 0.290781406697 1.522844626098 + +#BASIS SET: +He S + 248.304359266256 0.005013791761 + 39.257359859983 0.034983701525 + 9.290242872987 0.162973195617 + 2.650678948299 0.489691016373 + 0.811596267579 0.932899350713 +He S + 0.268607161928 0.222877468588 + 0.345025805948 0.062111112842 + 0.102007122911 0.091342429226 +He P + 1.310041712606 0.696987984442 + 0.265008725379 0.178488628760 + +#BASIS SET: +Li S + 261.504397395816 0.007995837268 + 39.435060612595 0.058753345768 + 8.903002628902 0.251012919032 + 2.312128536738 0.601136930396 + 0.673560740522 0.478266124634 +Li S + 0.637937919385 -0.146472924115 + 0.063047215665 0.821661055674 + 0.020134930908 0.034773514803 +Li S + 
0.029187683675 1.062645883498 + 0.014762824299 1.095509715610 +Li P + 1.607685808951 0.088808339206 + 0.261313614186 0.424066699835 + 0.078704441731 0.737998244728 +Li D + 0.248573704371 0.694743515137 + 0.088415849082 0.788715353455 + +#BASIS SET: +Be S + 510.032398065642 0.005805184625 + 76.405556797429 0.043342046125 + 17.255250508893 0.188908143114 + 4.586010018287 0.469633572220 + 1.356430512009 0.395896576117 +Be S + 1.628923371459 -0.134493067135 + 0.158484974791 0.944737331855 + 0.063902892456 0.171032012077 +Be S + 0.056984898840 0.893658825536 + 0.029154653875 1.463000499223 +Be P + 2.658894134528 0.025302164759 + 0.449379800839 0.144909124946 + 0.113066880133 0.200552018717 +Be D + 0.458082080027 0.881395926545 + 0.118291023217 0.993425036644 + +#BASIS SET: +B S + 1.548610750968 -0.375453530717 + 1.230872527218 0.296383194735 + 0.288936160855 0.400281132411 + 0.105945534640 0.252339650517 +B S + 0.149430266146 0.631907262941 + 0.046078378647 0.717819753504 +B P + 6.824625642150 0.032055054798 + 1.786841467886 0.168629590401 + 0.529445685476 0.508222453209 + 0.171288703178 0.449200455124 +B P + 0.137855130395 1.122762891963 + 0.045638926887 1.688087920718 +B D + 0.685952928321 1.202317712080 + 0.217177410256 1.378136829807 + +#BASIS SET: +C S + 2.174987790335 -0.276058910823 + 1.852226946510 0.240154751118 + 0.459098487623 0.191124862153 + 0.169128986465 0.087294467451 +C S + 0.227455022611 0.987938126698 + 0.086918590957 1.016650788977 +C P + 12.870085333684 0.024741438180 + 3.295443583046 0.152192079858 + 0.976645717756 0.448674298135 + 0.348425197118 0.494756938315 +C P + 0.146489481077 1.594415811311 + 0.063733914289 0.807859142831 +C D + 1.025317925428 0.833187865353 + 0.250249064514 0.858263567336 + +#BASIS SET: +N S + 2.735378582931 -0.239593796561 + 2.219800598729 0.224989693361 + 0.592708671545 0.194517761229 + 0.227406999886 0.098125575699 +N S + 0.280435456826 0.853515477480 + 0.105373393767 1.130727240507 +N P + 19.104528476156 0.011792637499 + 4.831013846305 0.072900287221 + 1.454969505940 0.205542762778 + 0.494779046501 0.258358206618 +N P + 0.178530326117 1.242146697946 + 0.068618652214 0.429051028293 +N D + 1.230812954761 1.261894149963 + 0.370851969193 1.181659566926 + +#BASIS SET: +O S + 3.543641820586 -0.263554292988 + 2.712717475497 0.265678831423 + 0.683184525058 0.279087292612 + 0.255976739185 0.112868556239 +O S + 0.286500025367 1.027594577324 + 0.123012019465 1.159166516125 +O P + 27.003841820377 0.012470154545 + 6.762654966183 0.074520776234 + 2.011704598628 0.210079890981 + 0.634428348949 0.291319728722 +O P + 0.199067660399 1.512413659527 + 0.065643622039 0.312634979070 +O D + 1.366109340367 0.858294230859 + 0.461931641275 0.846034666146 + +#BASIS SET: +F S + 7.034861151373 -0.446046417537 + 4.872932295917 0.241120382569 + 1.330878323026 0.666593307204 + 0.525469722669 0.781917161284 +F S + 0.821266047563 0.135544243991 + 0.198341185022 0.993554010066 +F P + 35.526100039768 0.026818249901 + 9.093570542569 0.133596681058 + 2.685413226386 0.376326355144 + 0.833678395080 0.516681452384 +F P + 0.262279770686 0.931221931175 + 0.082396241151 0.200669872602 +F D + 1.454912451083 1.369428727075 + 0.446498292031 1.466564035452 + +#BASIS SET: +Ne S + 8.511025142406 -0.346407731370 + 6.944709652311 0.242428346755 + 1.561041650912 0.456810607744 + 0.599496632788 0.413991739819 +Ne S + 0.609511035171 0.175668695012 + 0.245683034081 0.854892929924 +Ne P + 56.111190785374 0.018617197509 + 14.542058717068 0.119964511456 + 4.468992837375 0.351570092625 + 1.504786372395 
0.552758556945 +Ne P + 0.498808346618 0.959617655183 + 0.156283251576 0.293739815995 +Ne D + 1.678877049768 0.562501368964 + 0.416060290014 1.450686746125 + +#BASIS SET: +Na S + 17.859469979814 -0.094639395365 + 1.741589491951 0.454368094759 + 0.671216662419 0.326110219634 + 0.356748679884 0.099733348288 +Na S + 1.040532238619 1.128928459961 + 0.363586481174 0.971270254416 + 0.030788669980 0.725021277866 +Na S + 0.063388594723 0.461074706312 + 0.027886589957 0.865217967753 +Na P + 94.408900907741 0.022285520404 + 22.833035343911 0.141094814336 + 7.392308589988 0.456452935832 + 2.700481523814 0.811931701139 + 0.980041469729 0.865637159210 +Na P + 2.667291528209 -0.005824466879 + 0.336960843152 0.524769342105 + 0.055297986191 0.011767234747 +Na D + 0.317736281462 0.216305744638 + 0.050399448912 1.084520528356 + +#BASIS SET: +Mg S + 21.746078000863 -0.107712415417 + 2.054693963682 0.605039945129 + 0.775378201881 0.358253218691 + 0.646755053402 0.124456587448 +Mg S + 1.518035874065 0.953325303523 + 0.626648472409 0.885052846709 + 0.061342469056 0.663200597406 +Mg S + 0.114525781742 1.102821153701 + 0.049143798668 1.262827197035 +Mg P + 98.149921577771 0.020030532297 + 22.986901774625 0.130645961601 + 7.073704413314 0.416305903439 + 2.357463360333 0.664621574936 + 0.776737081590 0.460953274264 +Mg P + 3.771301810954 -0.008032934562 + 0.189166874642 0.340369688597 + 0.068419879293 0.260127856330 +Mg D + 0.401806625040 0.846622892966 + 0.102441134001 1.533893458570 + +#BASIS SET: +Al S + 3.307057950862 0.035887679384 + 1.064990666611 -0.301434323135 + 0.217094967607 0.408003674147 + 0.103399050795 0.504944565236 +Al S + 0.091326549223 0.267250123332 + 0.040384684710 0.424093785538 +Al P + 0.966979219663 -0.093593510129 + 0.418602698432 0.186041911562 + 0.179016462009 0.364571859325 + 0.092774532026 0.270083093687 +Al P + 0.101275278069 0.562522659034 + 0.032903691927 0.829515939897 +Al D + 0.653520505846 0.545097705481 + 0.177964130832 1.149228250111 + +#BASIS SET: +Si S + 4.206061763818 0.036046100379 + 1.379029753670 -0.278621659842 + 0.265740193555 0.514032304554 + 0.112502697537 0.365777551979 +Si S + 0.133673616794 0.251860678646 + 0.048297859440 1.001912217231 +Si P + 1.367505459412 -0.045311794170 + 0.391314943212 0.288581385277 + 0.158228930721 0.372264544458 + 0.068937667225 0.100776575609 +Si P + 0.117817448223 0.511784304415 + 0.041067655925 0.790699182240 +Si D + 1.229803285857 0.317490526212 + 0.304062845855 1.160802102368 + +#BASIS SET: +P S + 7.492690912960 0.036211256938 + 1.520979692504 -0.410346524622 + 0.389986469517 0.590431160497 + 0.185944246896 0.700026469133 +P S + 0.426277431342 0.066946209604 + 0.075641155401 0.662868977638 +P P + 1.182449802418 -0.070708334586 + 0.716815122233 0.179798137437 + 0.284356002842 0.347185537499 + 0.119844662929 0.186156103439 +P P + 0.104174736899 0.642173678179 + 0.042996540650 0.592979489332 +P D + 1.695162960423 0.257470657436 + 0.351822528642 1.089094143136 + +#BASIS SET: +S S + 7.601355269472 0.046444664335 + 1.927121218723 -0.464421807155 + 0.521456307465 0.575890670867 + 0.230532240510 0.826584502527 +S S + 0.470005347879 0.136072106019 + 0.083469210422 1.019121415962 +S P + 1.711639257837 -0.172150677616 + 1.157276160172 0.256510381181 + 0.449727077619 0.514122297269 + 0.195290728699 0.449898131647 +S P + 0.121042410856 0.700057493662 + 0.061043915320 0.824465481915 +S D + 1.777751118268 0.301983328801 + 0.404467069554 1.098792945928 + +#BASIS SET: +Cl S + 14.151131413181 0.017603655830 + 2.057697743755 -0.379080686343 + 
0.825720874850 0.374023499125 + 0.302442901804 0.721003299953 +Cl S + 0.514888452515 0.170089201402 + 0.098706322995 0.945113577067 +Cl P + 2.844174271253 -0.135762495160 + 1.377515628579 0.223435753709 + 0.619123933124 0.728151126508 + 0.261922095680 0.819017057903 +Cl P + 0.126421788724 0.990717918733 + 0.073988046008 0.696607999630 +Cl D + 1.922210814739 0.588629611664 + 0.428318736672 1.937851927117 + +#BASIS SET: +Ar S + 14.940101122322 0.011521861228 + 2.243165636285 -0.558968080854 + 1.438970926578 0.473342341500 + 0.406876486290 0.631293265007 +Ar S + 0.440259652763 0.069249308310 + 0.161875873542 1.603023542921 +Ar P + 3.165484511165 -0.077261666511 + 1.111111719803 0.471181733030 + 0.414649437581 0.751059102315 + 0.162616494311 0.223201162944 +Ar P + 0.154347795709 1.472564774171 + 0.071270891501 0.581126940969 +Ar D + 1.467813037801 0.644461354176 + 0.399396403391 1.401420583174 + +#BASIS SET: +K S + 2.712682131864 -0.138120511191 + 0.959527159237 0.183647787140 + 0.415861396286 0.054504151428 + 0.222428317820 0.079993072198 +K S + 0.745830108700 -0.013684884521 + 0.447736522029 0.507569624842 + 0.027242967193 0.912536205404 +K S + 0.032660680703 0.739672700474 + 0.024351810227 1.104288980008 +K P + 10.065499144539 -0.038491537428 + 0.974019481043 0.674332393862 + 0.374596372255 0.612538235871 + 0.156921520903 0.172520943233 +K P + 4.042012088018 -0.007059823100 + 0.087259092608 0.077806374422 + 0.054714257801 0.688537533778 +K D + 0.866638592894 0.502040108967 + 0.157085997854 1.055271878444 + +#BASIS SET: +Ca S + 2.719301351535 -0.078978644821 + 1.883275651368 0.187115863443 + 0.642163903598 0.589578641721 + 0.336487867317 0.370804234632 +Ca S + 3.026324320338 -0.233621817613 + 1.242283226509 0.166011018416 + 0.053488348283 0.035851055675 +Ca S + 0.058373367494 1.094426980400 + 0.029921642326 0.431156605007 +Ca P + 10.411513039603 -0.021761105222 + 1.181201852942 0.307819154635 + 0.472806545197 0.269256985339 + 0.206363671314 0.068406701922 +Ca P + 1.103963525530 -0.009366778560 + 0.092210626014 0.588635174998 + 0.007655059125 0.028655018892 +Ca D + 1.466572349546 0.333010877274 + 0.261547289431 0.740339846733 + +#BASIS SET: +Sc S + 12.894521140222 0.079236996463 + 5.071686274181 -0.458078625675 + 1.167181899577 0.336412876746 + 0.663942013228 0.332736377800 + 0.309654579707 0.211456915306 +Sc S + 1.040885767796 0.463978291008 + 0.568097001649 0.435148577227 +Sc S + 0.070617692779 1.482515914675 + 0.027248406219 0.639348111708 +Sc P + 6.368578496263 -0.093545873727 + 1.949004097442 0.341238050163 + 0.815301694561 0.601180009390 + 0.313815959827 0.269603495756 +Sc P + 0.037710430517 0.936037163888 + 0.026841704778 1.288316160280 +Sc D + 11.687225911271 0.048657783353 + 3.314758379726 0.202225028823 + 1.046336738397 0.396787231764 + 0.313968073144 0.458512682004 +Sc D + 0.096369535224 0.705041436498 + 0.031054056783 0.578360893303 + +#BASIS SET: +Ti S + 16.461347886752 0.082016774935 + 5.371386815523 -0.650480440851 + 1.448985756914 0.538706537935 + 0.638919559348 0.450440953157 + 0.241619151339 0.154995866347 +Ti S + 1.051736442516 0.627756794746 + 0.589928288257 0.487164574898 +Ti S + 0.075250745810 1.663780527189 + 0.026602025230 0.399689063950 +Ti P + 7.122429655502 -0.037873691723 + 2.383153836626 0.114975280753 + 1.028878547766 0.221233308088 + 0.416069961536 0.117542200112 +Ti P + 0.179440719634 1.269893747125 + 0.076704284265 0.585246578780 +Ti D + 16.285523097231 0.057717359677 + 4.903186429987 0.253388428709 + 1.644289014934 0.545862699555 + 0.525281547715 
0.674601799858 +Ti D + 0.170096639197 1.166232841900 + 0.064103525267 0.622642131383 + +#BASIS SET: +V S + 16.255319265212 0.094023708983 + 6.405063127696 -0.537187056948 + 1.457163624962 0.520258926643 + 0.499926424667 0.122520741372 + 0.301185417021 0.072736674208 +V S + 1.012605416044 1.098550324678 + 0.547886952925 0.607303361815 +V S + 0.090933679299 1.595189552198 + 0.036744292616 0.948508696780 +V P + 8.103098572263 -0.081234318416 + 2.629836559131 0.267004275831 + 1.112202390392 0.485982458150 + 0.429812510937 0.238017396511 +V P + 0.134170344208 0.865319211565 + 0.045541731704 1.001512124222 +V D + 20.991852147597 0.047940709014 + 6.482973795152 0.218068368478 + 2.259607977658 0.483008624822 + 0.787388461330 0.596005568614 +V D + 0.266299635377 0.873322009295 + 0.085848775242 0.412455368037 + +#BASIS SET: +Cr S + 19.326807872923 0.115928746519 + 6.895352080094 -0.789981589302 + 1.869640267141 0.563707338990 + 0.843806445546 0.445197267976 + 0.311586877483 0.070648620143 +Cr S + 1.226144167110 1.430412693717 + 0.536784361672 0.817200561042 +Cr S + 0.103687455191 1.278362236512 + 0.040453943570 0.998838137196 +Cr P + 9.274973304564 -0.067626882136 + 2.834803217107 0.248491248438 + 1.188671904758 0.418309250990 + 0.452672452476 0.190643237960 +Cr P + 0.099943441911 0.572805817456 + 0.020690649882 0.829846109272 +Cr D + 25.343518443315 0.043409250189 + 7.837194674080 0.208571751452 + 2.703730990740 0.481280315878 + 0.923329332261 0.600167729446 +Cr D + 0.301729037231 1.148719306493 + 0.095659118228 0.610428134382 + +#BASIS SET: +Mn S + 20.843228079998 0.103402377154 + 7.760419419145 -0.656141637759 + 1.904516413545 0.665798788078 + 1.001320433899 0.396522428503 + 0.481937472515 0.292024154733 +Mn S + 1.268268420404 1.565497038368 + 0.857241213876 1.201323188808 +Mn S + 0.109140028793 1.457741873878 + 0.045318992446 1.357680548185 +Mn P + 10.512436868143 -0.097556999181 + 3.099073135769 0.375169172686 + 1.322948831985 0.597227851646 + 0.527055310067 0.279726392464 +Mn P + 0.344507060257 0.623790512345 + 0.118361361298 1.250852309631 +Mn D + 31.818227456997 0.047047726486 + 10.037421666655 0.238358793520 + 3.534516417893 0.579576958050 + 1.249761773374 0.755472666211 +Mn D + 0.419663812162 1.107456649217 + 0.131489658435 0.625907283697 + +#BASIS SET: +Fe S + 22.358655151183 0.092871540367 + 8.726722256050 -0.543175687899 + 1.883511346045 0.788008191373 + 0.898098829763 0.506118339909 + 0.499141495354 0.151467937894 +Fe S + 1.414672733116 1.048090878533 + 0.838969961703 0.701944708617 +Fe S + 0.126967189900 1.110584663987 + 0.048297686989 0.960183114424 +Fe P + 11.737335796504 -0.087457071524 + 3.491740558354 0.321194930339 + 1.519670314056 0.525914230497 + 0.612230245037 0.263690954761 +Fe P + 0.270740512750 0.802890646298 + 0.110956520505 0.614971889314 +Fe D + 37.602436513551 0.035788062793 + 11.980469021430 0.185256836504 + 4.315862847956 0.455518034427 + 1.583817661861 0.610355600220 +Fe D + 0.548357256279 1.544966076786 + 0.171477360605 0.810476849978 + +#BASIS SET: +Co S + 25.335315694476 0.087766855961 + 9.488355054778 -0.550692716629 + 2.105770866101 0.731208569241 + 1.152741563097 0.435591231225 + 0.600134651908 0.260972326852 +Co S + 1.728593048399 0.743738752336 + 0.965135408786 0.944149972951 +Co S + 0.138217537887 1.152256178711 + 0.048799619525 1.322788514015 +Co P + 13.019846825599 -0.106687768822 + 3.898569128239 0.379287211199 + 1.691912041740 0.646070467691 + 0.663194327000 0.321080022842 +Co P + 0.266233736271 0.516713551942 + 0.087682602286 0.610544822757 +Co D + 
46.467186502091 0.036738936657 + 14.844727962964 0.206400986919 + 5.324781910313 0.545964703962 + 1.946193019964 0.772313951631 +Co D + 0.663697766556 1.764841258136 + 0.197538703092 0.981959446237 + +#BASIS SET: +Ni S + 28.117307328944 0.095054204041 + 10.323114146207 -0.619352378957 + 2.266501334743 0.903046067553 + 1.031698338004 0.572669971934 + 0.429763116477 0.107936112138 +Ni S + 1.585096116449 0.536471821182 + 0.927688425987 0.466470527797 +Ni S + 0.142723981700 1.052935132680 + 0.052288562321 0.852229071228 +Ni P + 14.649263427379 -0.123572234826 + 4.487248555279 0.373049398485 + 1.926522201916 0.666618694496 + 0.755571182364 0.334357692295 +Ni P + 0.320923338030 1.109796900506 + 0.124597438762 1.063986125250 +Ni D + 55.795596873330 0.032016254993 + 17.671713442822 0.206558524336 + 6.458884370599 0.576880736568 + 2.375449580314 0.853632978049 +Ni D + 0.808597906774 1.577004983294 + 0.239256209431 0.904057266556 + +#BASIS SET: +Cu S + 30.095654464774 0.088111002450 + 11.189465415118 -0.566864618289 + 2.541452118390 0.690601613017 + 1.290510764029 0.526642987015 + 0.608961815658 0.132227842362 +Cu S + 2.018430478515 0.946194744344 + 0.849138974882 0.961852815883 +Cu S + 0.155609251028 1.205505591180 + 0.055100263916 1.040275581217 +Cu P + 16.197916896934 -0.105945973220 + 4.487908313373 0.425417913649 + 1.928366077649 0.642632779569 + 0.763943236555 0.292317921669 +Cu P + 0.299840227632 1.292856821394 + 0.116773403393 1.061205571547 +Cu D + 60.487297938910 0.032026327350 + 19.233046405660 0.186974917418 + 6.965090689821 0.519387752985 + 2.568903372512 0.762425819652 +Cu D + 0.875809628134 0.877877663513 + 0.257690509171 0.512991728701 + +#BASIS SET: +Zn S + 33.874473714375 0.071277616937 + 11.972622956367 -0.506758729754 + 2.961714395977 0.493378998699 + 1.829981912490 0.307312140366 + 0.870554735732 0.249108397868 +Zn S + 1.739203860675 1.189126584032 + 0.784958106810 0.340497149689 +Zn S + 0.173777482831 1.056166866705 + 0.058936157612 0.875584102608 +Zn P + 18.428852736536 -0.072783254370 + 4.757828516192 0.320487006186 + 2.051504911103 0.464955123397 + 0.811290090160 0.202072427695 +Zn P + 0.280329527429 0.290680713435 + 0.074967629409 0.340832692654 +Zn D + 69.345783920227 0.038894475768 + 22.344643463037 0.225936760009 + 8.230073746758 0.625199117295 + 3.129607637648 0.942430315944 +Zn D + 1.115445344238 1.300039063947 + 0.350438433853 0.725428578056 + +#BASIS SET: +Ga S + 3.461851099712 0.086596634698 + 1.673374271825 -0.270587362691 + 0.211318368014 0.351819394013 + 0.097828153295 0.285416686844 +Ga S + 0.151627010733 0.297902650076 + 0.055604376270 0.811459212326 +Ga P + 1.224072196918 -0.142334247355 + 0.570340687021 0.156300957280 + 0.176342181917 0.414233473261 + 0.069338127059 0.187202564348 +Ga P + 0.103202148668 0.781680200651 + 0.032521362068 1.139904102344 +Ga D + 0.434136758705 0.630724293037 + 0.136436950793 1.735376440057 + +#BASIS SET: +Ge S + 3.217496101688 0.220957013438 + 1.918166572642 -0.511683753720 + 0.214903864260 0.831622355013 + 0.067589930320 0.205018958674 +Ge S + 0.465696258761 0.242715316187 + 0.060174025120 1.570350456114 +Ge P + 3.914179557047 0.071448877544 + 2.346468630555 -0.196014253790 + 0.273861111944 0.722477976627 + 0.105990071273 0.319809081571 +Ge P + 0.202135065169 0.219317596243 + 0.061214687117 1.973742824912 +Ge D + 0.370009633120 1.071258503403 + 0.132976201398 1.060360506723 + +#BASIS SET: +As S + 3.507693642950 0.158031853980 + 1.888383025988 -0.490805795974 + 0.288111240456 0.729540893159 + 0.143180088068 0.326673579759 +As S + 
0.352438514226 0.168792374132 + 0.073126498505 1.058779710836 +As P + 1.358255409051 -0.165061038563 + 0.934647702814 0.154119568450 + 0.292654437651 0.269752770917 + 0.122927318606 0.174509228238 +As P + 0.118767537140 0.878680997698 + 0.047605103697 1.162342892188 +As D + 0.314487938869 0.831547539129 + 0.218493965228 1.033423867748 + +#BASIS SET: +Se S + 3.650865730594 0.231457444968 + 2.210903609188 -0.563533855155 + 0.335170928150 0.729862073845 + 0.151638836734 0.315657165245 +Se S + 0.599638822173 0.126025157342 + 0.075582134264 1.069252579041 +Se P + 1.507148257439 -0.162605422408 + 0.913367414636 0.183499918283 + 0.341221978698 0.372235321531 + 0.150432587271 0.271959003606 +Se P + 0.157082854490 0.530816305712 + 0.061272426885 1.598391183858 +Se D + 0.367597502630 1.060909735181 + 0.223013187434 0.893014584292 + +#BASIS SET: +Br S + 4.214014635008 0.165692028470 + 2.412188698594 -0.476877193218 + 0.391067027010 0.710299602030 + 0.164674252704 0.362093036361 +Br S + 0.485599664697 0.145825877593 + 0.074598994605 0.768087358690 +Br P + 1.789947253144 -0.207356828201 + 1.292036187413 0.186107969204 + 0.449383252160 0.332524333195 + 0.194590167329 0.303778721054 +Br P + 0.122759759967 0.804134734196 + 0.066738625731 1.319919526700 +Br D + 0.442438146643 1.158672768266 + 0.245849018795 1.021366227819 + +#BASIS SET: +Kr S + 3.986511440636 0.290559957024 + 2.943901744792 -0.516111029924 + 0.429901866727 0.564971586700 + 0.151821838900 0.272351694911 +Kr S + 0.334711812968 0.193054210335 + 0.111745168911 0.897546361771 +Kr P + 2.153957686193 -0.178565423896 + 1.283748685547 0.172715443611 + 0.501685561731 0.400700715929 + 0.237520267977 0.316528198613 +Kr P + 0.140113161172 0.940357138145 + 0.089204908642 0.846922224775 +Kr D + 0.526301285310 1.174418328612 + 0.203100266787 0.975302884833 + +#BASIS SET: +Rb S + 3.869999599846 0.122319888582 + 2.130904636104 -0.666846831080 + 0.738002026783 0.418046906087 + 0.409528742839 0.516121898223 +Rb S + 1.350580726758 0.441228993164 + 0.672123461407 -0.180183191143 + 0.215026040193 0.940988897210 +Rb S + 0.041727573698 0.194823232373 + 0.018173085367 0.240784908108 +Rb P + 2.767410417511 -0.100991461056 + 0.690279272515 0.523102112330 + 0.272134686488 0.509586156257 + 0.099861314223 0.105281356899 +Rb P + 0.483868425918 -0.021967319393 + 0.064990339093 0.192198563244 + 0.026344167036 0.254778803087 +Rb D + 0.482952908592 0.814918945654 + 0.118599680126 1.160272481701 + +#BASIS SET: +Sr S + 1.692706637459 -0.393341748067 + 1.149795785772 0.208407810148 + 0.669131477932 0.410560473766 + 0.248220076725 0.223568940036 +Sr S + 1.886266511407 0.403235193588 + 0.400164390156 -0.077244774899 + 0.087928617203 0.264020515846 +Sr S + 0.059443196810 1.173409533858 + 0.028584686102 1.214114711178 +Sr P + 2.806647699305 -0.059210927424 + 0.819376524702 0.263531483008 + 0.348158267797 0.253839198083 + 0.146003646962 0.054726018823 +Sr P + 0.935150498254 -0.009917943921 + 0.087786356974 0.731454921275 + 0.045420993133 0.227916283507 +Sr D + 0.688406352900 0.490903537712 + 0.192704647015 0.763662684452 + +#BASIS SET: +Y S + 7.296549891946 0.195341789784 + 2.688422185597 -1.838359171848 + 2.177642572388 1.023950898895 + 0.659090683583 0.677612516206 + 0.318000313098 0.286803253760 +Y S + 0.674812957467 0.585665285488 + 0.290353362618 0.292072052074 +Y S + 0.061667823748 1.471139784374 + 0.025673512214 0.434089285930 +Y P + 2.413073905893 -0.896109901350 + 1.991118335445 0.920914883211 + 0.644885028991 0.747249800994 + 0.262158149515 0.358967578245 +Y P + 
0.090251834443 1.587694599781 + 0.038734540875 1.124989169808 +Y D + 2.640667082288 -0.051410329462 + 1.377231693280 0.259691609738 + 0.521170303505 0.647670134977 + 0.191317945677 0.722772032171 +Y D + 0.066561171952 0.635575534061 + 0.021058714013 0.213198852821 + +#BASIS SET: +Zr S + 7.636755459435 0.225594387816 + 3.059460604715 -2.021770934596 + 2.734289469745 1.043987761524 + 0.974030529831 0.649481513646 + 0.321237768185 0.271978542754 +Zr S + 0.666118313328 0.610098372668 + 0.382004929999 0.279366801720 +Zr S + 0.078638985037 1.573000069276 + 0.030977707836 0.591169478729 +Zr P + 2.635304536716 -0.851203628339 + 2.139701672965 0.892120469003 + 0.701270261887 0.710477753317 + 0.294117575826 0.292528213505 +Zr P + 0.153963461038 1.438560009408 + 0.088262101191 0.995725387494 +Zr D + 3.201047577661 -0.038892498436 + 1.485048727987 0.275692468196 + 0.559574253517 0.646436509544 + 0.196618868868 0.666536113423 +Zr D + 0.062451815401 0.649948004351 + 0.020325928756 0.194809845607 + +#BASIS SET: +Nb S + 8.226907672370 0.173869214563 + 5.258853938239 0.400716421611 + 4.024909003000 -1.292442883143 + 1.195129963229 0.474286872382 + 0.359665734464 0.272918985707 +Nb S + 0.780972111614 0.609456624827 + 0.468149789914 0.309910528963 +Nb S + 0.084330299999 1.270848051253 + 0.034918666624 0.591189631131 +Nb P + 2.871351590001 -0.873217473927 + 2.280935596682 0.940543482261 + 0.743551490746 0.760237258354 + 0.298069250267 0.269556485513 +Nb P + 0.152595605504 1.401687661947 + 0.068327454661 1.010582696646 +Nb D + 3.367880306345 -0.051672605872 + 1.756564781302 0.284516251789 + 0.658735017693 0.664232933582 + 0.231059396397 0.658825321794 +Nb D + 0.072216226374 0.696966749733 + 0.024801099920 0.196624147646 + +#BASIS SET: +Mo S + 9.673752512870 0.293639539791 + 3.572282146733 -1.857614871108 + 1.530849754894 1.224864128516 + 0.631996839839 0.660456106153 + 0.288839329473 0.296153352239 +Mo S + 0.778417996936 0.603632414429 + 0.534172652813 0.301871306040 +Mo S + 0.087626525151 1.317087783672 + 0.033562681859 0.580743179363 +Mo P + 3.093707673366 -0.826631577474 + 2.403641873352 0.900674860790 + 0.818240314903 0.711828517694 + 0.336205879987 0.269489929318 +Mo P + 0.141936162003 1.303129589177 + 0.051142057872 1.262881655764 +Mo D + 4.248706637709 -0.056047421983 + 2.043076701133 0.260482078879 + 0.819219599195 0.645891778457 + 0.300623287176 0.680758170713 +Mo D + 0.099924939859 0.934167552489 + 0.033837585352 0.225962855556 + +#BASIS SET: +Tc S + 8.734488231535 0.191829141563 + 3.835899095068 -2.105775116524 + 3.469751744685 1.518466647586 + 0.967124516165 0.732292685600 + 0.424182603993 0.274139633354 +Tc S + 1.003924527229 0.601613332020 + 0.543013116325 0.295145009859 +Tc S + 0.088094236993 1.301256535828 + 0.033682774860 0.537823653934 +Tc P + 3.405023984944 -0.854982996734 + 2.564265074003 0.959319009575 + 0.863581625172 0.828846615206 + 0.337411029995 0.255144526070 +Tc P + 0.091795517667 1.101466987783 + 0.029879282774 1.937115484620 +Tc D + 5.097732110758 -0.054667679941 + 2.187138333162 0.264182410092 + 0.976624475913 0.623831001407 + 0.396051934959 0.663361650807 +Tc D + 0.148110040542 1.029802355721 + 0.059688691595 0.261393560258 + +#BASIS SET: +Ru S + 9.696500481430 0.176368995944 + 3.986490207677 -1.981311830052 + 3.507270554217 1.413770781341 + 1.044398638037 0.657254090853 + 0.448751919987 0.252393374138 +Ru S + 1.064403045557 0.566761648709 + 0.627779982819 0.302811587640 +Ru S + 0.101793183953 1.334893817716 + 0.034776411982 0.605967167434 +Ru P + 3.763404295092 -0.859444471047 
+ 2.770872457961 0.955526231776 + 0.988572431005 0.893940245276 + 0.408597854458 0.334564018189 +Ru P + 0.187687004290 1.566749185235 + 0.051504739627 0.913525241281 +Ru D + 5.550032090322 -0.057846761038 + 2.446355819382 0.253638478653 + 1.073342025516 0.583490878354 + 0.429389404890 0.589592130440 +Ru D + 0.165052645167 1.116296297183 + 0.068844897289 0.265257106223 + +#BASIS SET: +Rh S + 10.612277159968 0.168503102460 + 4.175711357562 -1.847338836606 + 3.540198264706 1.311067350155 + 1.054712457574 0.697993868303 + 0.463476941827 0.185865531158 +Rh S + 1.120987885596 0.630953703067 + 0.505354440197 0.263165934158 +Rh S + 0.107799055751 0.895058309015 + 0.035593708316 0.457714131915 +Rh P + 3.915762760107 -1.155445931450 + 3.214946272686 1.203030672069 + 1.138390014316 0.792074916425 + 0.483900206512 0.361955969996 +Rh P + 0.173801790703 1.467776545715 + 0.058955083779 1.888699833975 +Rh D + 6.396827458805 -0.048409300225 + 2.908324768342 0.267035530162 + 1.367666120621 0.694020583366 + 0.573074572998 0.792961890374 +Rh D + 0.215417984872 1.166151849377 + 0.074694475620 0.264937531216 + +#BASIS SET: +Pd S + 11.260042205230 0.164097577056 + 4.447554407937 -1.866722502527 + 3.798950429913 1.359736862671 + 1.125191239801 0.565530046178 + 0.452556029260 0.188772887643 +Pd S + 1.151505961507 0.944652624868 + 0.758022887254 0.204351049313 +Pd S + 0.114341106908 0.954437219940 + 0.041622071605 0.471208585065 +Pd P + 4.280202330909 -1.177377340534 + 3.258393937931 1.288735738708 + 1.165454990282 1.060685702915 + 0.488221099390 0.409953870721 +Pd P + 0.201275394801 1.775717120960 + 0.057053584256 1.173935561093 +Pd D + 6.939335039037 -0.045626149466 + 3.004130610160 0.311839410976 + 1.365590049615 0.718055557380 + 0.570518822592 0.709739345643 +Pd D + 0.235211306335 0.761544637626 + 0.098548614743 0.252650801917 + +#BASIS SET: +Ag S + 11.137380185754 0.170113394090 + 4.989686293940 -1.887735459567 + 4.475920194052 1.434356812706 + 1.201486343175 0.868528612669 + 0.510392667886 0.192122601443 +Ag S + 1.168865294473 0.981239664457 + 0.703892508518 0.301221440276 +Ag S + 0.109374172855 1.053993979072 + 0.033776584003 0.411521983508 +Ag P + 4.556372684610 -1.202933355935 + 3.513166261338 1.313296588850 + 1.244794863986 1.036312910226 + 0.512047738553 0.383294270449 +Ag P + 0.176416800129 0.973703560128 + 0.052372032836 0.912910487489 +Ag D + 7.446453391764 -0.045685848836 + 3.255927976164 0.334321217612 + 1.458177385755 0.759037145474 + 0.596305718479 0.729668743368 +Ag D + 0.229825513509 1.458004453308 + 0.090151969871 0.269101010696 + +#BASIS SET: +Cd S + 12.017244504805 0.169339966251 + 5.114217825559 -1.898543443118 + 4.446940115430 1.450853794865 + 1.216628349228 0.933523984483 + 0.532052126747 0.193968463984 +Cd S + 1.055105987411 1.272808254083 + 0.604148329306 0.229142637869 +Cd S + 0.146383394326 0.696754211201 + 0.047046479439 0.358960905929 +Cd P + 4.757233246634 -1.076910687488 + 3.867372408050 1.151779729705 + 1.358823077137 0.721432137820 + 0.577098779341 0.278790097257 +Cd P + 0.182360815830 0.804988419982 + 0.062419275984 0.598193345980 +Cd D + 8.110020059042 -0.025222412807 + 3.451726768569 0.211885297280 + 1.581708281795 0.447542350487 + 0.673407975664 0.418668166040 +Cd D + 0.272634468912 1.116448847886 + 0.113339867322 0.175327136751 + +#BASIS SET: +In S + 1.424753204835 0.195216369564 + 0.967776518278 -0.390335918373 + 0.189316580236 0.332066512393 + 0.069682986693 0.481912204467 +In S + 0.262064023539 0.051928774476 + 0.064458770986 0.801906317169 +In P + 1.810449027147 
0.061344922403 + 1.046643496682 -0.193157129825 + 0.184987578096 0.458390096608 + 0.071861619824 0.194158130930 +In P + 0.138098170856 0.104631733864 + 0.041247580889 0.868493244912 +In D + 0.138242005138 0.573395637068 + 0.079777300441 1.016219806155 + +#BASIS SET: +Sn S + 2.401062078850 0.149527197103 + 1.153316115528 -0.502781332360 + 0.226914764388 0.515242159192 + 0.110037948497 0.383620829459 +Sn S + 0.317225487215 0.157353145385 + 0.061053997178 0.982405762325 +Sn P + 2.566898425488 0.055051200864 + 1.499260435985 -0.169738769858 + 0.227034747645 0.493183210725 + 0.092179498934 0.327033786313 +Sn P + 0.173407347066 0.225606877021 + 0.048674549711 2.065300464132 +Sn D + 0.241102386017 1.101090309967 + 0.116551335998 1.474465735400 + +#BASIS SET: +Sb S + 1.901291496434 0.372390986717 + 1.449475120294 -0.589821664528 + 0.255179663388 0.238020525012 + 0.159557301375 0.282602074546 +Sb S + 0.355114126591 0.126701793951 + 0.067393141859 1.046620800528 +Sb P + 2.375689851709 0.051122584238 + 1.348062774398 -0.161075779289 + 0.243744636819 0.405765152732 + 0.107166990009 0.186486532978 +Sb P + 0.205419448722 0.140291569705 + 0.062916291344 1.926934328503 +Sb D + 0.205284248872 1.923448831425 + 0.161194685111 1.084075276779 + +#BASIS SET: +Te S + 2.083514130685 0.802285578371 + 1.714507901604 -1.147432002317 + 0.243721857039 0.709372799498 + 0.130681370450 0.126965084683 +Te S + 0.315794405079 0.190600374708 + 0.088221331240 0.836704224185 +Te P + 2.481012810644 0.073901692051 + 1.399169205477 -0.238783627662 + 0.229946308236 0.340455500866 + 0.127434638298 0.247951299805 +Te P + 0.503816419435 0.919676716095 + 0.056809623374 0.797571264542 +Te D + 0.192018182978 1.571397655653 + 0.064437864854 0.969777509197 + +#BASIS SET: +I S + 2.467637755918 0.270191039383 + 1.696928162357 -0.514853581902 + 0.298836215328 0.446031974526 + 0.139199227031 0.193993291432 +I S + 0.241437612793 0.214060325418 + 0.078209906425 0.853326113661 +I P + 0.954417866049 -0.267890449748 + 0.663589859519 0.294307194920 + 0.236750134212 0.246021979653 + 0.130584739507 0.138773377806 +I P + 0.229168242463 0.063105849732 + 0.067220884629 2.030056520658 +I D + 0.327941623565 0.159626880427 + 0.228514545922 1.064910731508 + +#BASIS SET: +Xe S + 2.755095924827 0.249896927023 + 1.738312366995 -0.537631660732 + 0.358046650467 0.489101738862 + 0.142124603625 0.268006037727 +Xe S + 0.275478850221 0.193322700350 + 0.100907057598 0.887859271842 +Xe P + 1.117839719300 -0.276367079429 + 0.691267910074 0.323178756010 + 0.279267713061 0.298358628779 + 0.155955498860 0.226885229369 +Xe P + 0.265352026982 0.067336765896 + 0.075892744312 1.830243781388 +Xe D + 0.402780208511 0.161378596156 + 0.243632826926 1.043744835741 + +#BASIS SET: +Cs S + 2.172433323343 0.093944897136 + 1.213559429867 -0.574876851884 + 0.913074181038 0.380293350663 + 0.313880498180 0.275539125878 +Cs S + 1.294434390251 -0.246882008315 + 1.088756263514 0.352493537018 + 0.141990201906 1.013884420396 +Cs S + 0.110486818629 0.142948191575 + 0.021036226155 0.289955177474 +Cs P + 1.296612062813 -0.158205270387 + 0.511587649506 0.411480813114 + 0.201550092329 0.414262558281 + 0.065338478075 0.072302175823 +Cs P + 0.299675722332 -0.097355523294 + 0.063470843305 0.070064297558 + 0.029363753532 0.660221576187 +Cs D + 0.288013152964 1.130417708310 + 0.094776153263 0.956697547107 + +#BASIS SET: +Ba S + 2.165224439573 0.172126972669 + 1.563606663015 -0.412498520678 + 0.468388660094 0.403433792501 + 0.182490630329 0.153935914977 +Ba S + 0.969811881199 0.332173932718 + 
0.438335807571 -0.093598712810 + 0.051413927514 0.275829475814 +Ba S + 0.038420685468 0.348249206633 + 0.034827156628 1.049222993274 +Ba P + 1.246867677258 -0.090713352176 + 0.619719104166 0.186065346861 + 0.314722191970 0.053345917338 + 0.218214872641 0.124038886456 +Ba P + 0.888823238004 -0.009266901736 + 0.112339424956 0.520029153739 + 0.048917251542 0.294753654056 +Ba D + 0.376362798375 0.740747300068 + 0.122678391091 0.544085547842 + +#BASIS SET: +La S + 2.961518815289 0.344436595849 + 2.116898887325 -0.624622276100 + 0.521152464968 0.515127237675 + 0.230069398586 0.246218458006 +La S + 0.579546119270 0.937709515416 + 0.290509752938 1.164367331351 +La S + 0.048916762584 1.105995776925 + 0.018604737481 0.464967111858 +La P + 3.245245643206 0.176769094460 + 2.470890435412 -0.332713646676 + 0.545303856973 0.526757519263 + 0.216537966689 0.336610767661 +La P + 0.033299369099 1.044022672845 + 0.006745724156 0.067353901679 +La D + 1.509848815606 -0.091622463451 + 0.743009875436 0.179196344423 + 0.339879135044 0.485427467590 + 0.137512286533 0.463167031971 +La D + 0.060432252241 0.866616108693 + 0.021446771088 0.436873072386 + +#BASIS SET: +Ce S + 2.338708269400 0.130423191930 + 1.321938059900 -0.945407435390 + 0.936653430510 0.578660129970 + 0.433174507830 0.199228432480 +Ce S + 0.787161385190 0.613999779320 + 0.256141028710 0.974944732900 +Ce S + 0.042862440378 1.450044603900 + 0.019521352708 0.347469655730 +Ce P + 2.219774933800 0.196030889890 + 2.018912507100 -0.320257523750 + 0.592087363040 0.406207592670 + 0.277158908340 0.243517746880 +Ce P + 0.156062580290 1.000000000000 +Ce D + 1.084221385500 -0.014352880349 + 0.448540175050 0.588466222730 + 0.151960945110 0.147553217860 +Ce D + 0.139549259200 1.539344007900 + 0.040874025783 0.369009644880 +Ce F + 34.145399262000 0.032170509402 + 12.065428998000 0.140071215210 + 4.450111205600 0.284220463290 + 1.676403229600 0.342402334160 + 0.652084636310 0.123817383030 +Ce F + 0.545199397300 0.899844805050 + 0.161907400480 0.966849207400 + +#BASIS SET: +Pr S + 3.168984651800 0.171631098330 + 1.667890716800 -0.995838233800 + 0.812186125420 0.272458450300 + 0.601529244950 0.152421654920 +Pr S + 0.903875352100 0.657504248670 + 0.301189177150 0.542467435400 +Pr S + 0.062615656705 0.959511574480 + 0.022405546382 0.289850823590 +Pr P + 2.482108136900 0.203400612200 + 2.257381996700 -0.340953226390 + 0.685565850410 0.453133555060 + 0.387144669100 0.087858983753 +Pr P + 0.220381064000 1.000000000000 +Pr D + 1.002533215800 -0.019351122016 + 0.750223540940 0.598029336850 + 0.229329053320 0.138544903510 +Pr D + 0.205673753170 0.684336589290 + 0.046531823684 0.191287740830 +Pr F + 40.232881930000 0.039727695667 + 14.439096901000 0.185973568320 + 5.666824231800 0.364846754290 + 2.437873255300 0.393202983180 + 1.049827395300 0.151852361610 +Pr F + 1.155904606900 0.649433443810 + 0.367184865050 0.738335210940 + +#BASIS SET: +Nd S + 2.399445231100 0.330052519610 + 1.682884324300 -1.087812006300 + 0.988141728820 0.228026434820 + 0.208920206230 0.190568753630 +Nd S + 1.127298797900 0.721120134100 + 0.489284491850 0.803593302000 +Nd S + 0.024303840698 0.504862388910 + 0.013029609227 0.612696241810 +Nd P + 2.324735346000 0.170147916960 + 2.056219145600 -0.355178646810 + 0.697829062940 0.572197291710 + 0.509194679390 0.085761284929 +Nd P + 0.235469139510 1.000000000000 +Nd D + 1.074577298900 -0.017620186217 + 0.472062103620 0.708203155280 + 0.200015863370 0.168647666540 +Nd D + 0.142617264320 0.520890592900 + 0.057631923663 0.180610053610 +Nd F + 46.847185945000 
0.030502424037 + 17.019304283000 0.153104586980 + 6.798621254000 0.316876806310 + 2.887091563100 0.406866529590 + 1.108982204200 0.166396269320 +Nd F + 1.261794730700 0.585966756640 + 0.388764008300 0.611998230940 + +#BASIS SET: +Pm S + 2.930428338900 0.357177154220 + 1.927498493600 -1.170119154300 + 0.772548914370 0.230068336210 + 0.169843099460 0.183828159010 +Pm S + 1.062021113300 0.229843223870 + 0.443061656340 0.640385348860 +Pm S + 0.047080341941 0.566945207290 + 0.020131076829 0.951730188260 +Pm P + 2.526409558100 0.186350621230 + 2.182882378400 -0.359383234040 + 0.678345299410 0.573936192200 + 0.368422273410 0.203240994710 +Pm P + 0.217452026430 1.000000000000 +Pm D + 1.175937624900 -0.019615039553 + 0.437438037980 0.684495543500 + 0.212867517890 0.138802799410 +Pm D + 0.112237432370 0.346138298110 + 0.029513406922 0.263739259970 +Pm F + 49.708022670000 0.030656850946 + 17.966438036000 0.154675899030 + 7.102080585100 0.330654822160 + 2.908142509900 0.444450968650 + 1.215606314300 0.270568901430 +Pm F + 0.879756044810 0.562280426050 + 0.359805598200 0.722340818030 + +#BASIS SET: +Sm S + 3.198468986000 0.388700993160 + 2.054638395700 -1.296672422600 + 0.824222222660 0.227766807290 + 0.160168477290 0.172833079050 +Sm S + 0.946609884890 0.286948469730 + 0.428263406830 0.671620639940 +Sm S + 0.055236922625 0.599595120810 + 0.024617411607 0.912179105210 +Sm P + 2.513124078800 0.191310169700 + 2.237188900400 -0.361399138370 + 0.697827704550 0.575663871770 + 0.380565701450 0.175498933290 +Sm P + 0.225623014280 1.000000000000 +Sm D + 0.926535125110 -0.017463655728 + 0.519362892700 0.585997594070 + 0.221098036510 0.167648746790 +Sm D + 0.154304824910 0.311945638600 + 0.053900115632 0.235546802760 +Sm F + 50.128022078000 0.032967782994 + 18.164742527000 0.158422860610 + 7.183502716200 0.325839145800 + 2.962754106700 0.421637206220 + 1.232140976700 0.251720808500 +Sm F + 0.930577398270 0.531939752500 + 0.372797890110 0.763575162480 + +#BASIS SET: +Eu S + 3.345059905700 0.375970166750 + 2.075574062000 -1.405131776800 + 0.940426981650 0.232505604610 + 0.181674650410 0.192608133490 +Eu S + 1.052166334100 0.327606031730 + 0.442926265500 0.643659880550 +Eu S + 0.061414092824 0.588230789160 + 0.031784177184 0.930609513330 +Eu P + 2.495634008500 0.194164540170 + 2.292506715600 -0.356747943100 + 0.720591307920 0.567428064980 + 0.386142526820 0.170263118340 +Eu P + 0.229374588050 1.000000000000 +Eu D + 0.964946609210 -0.016327079928 + 0.497327762740 0.714355608550 + 0.212503372040 0.164848026920 +Eu D + 0.155452858570 0.317749826660 + 0.058419430722 0.210656287270 +Eu F + 51.095384126000 0.035305949222 + 18.473596326000 0.162335136700 + 7.346815898000 0.324543595140 + 3.059653733000 0.409215096810 + 1.273688657300 0.267755957520 +Eu F + 0.881971831840 0.506986166910 + 0.381292553470 0.853657650950 + +#BASIS SET: +Gd S + 4.000019104300 0.333487234160 + 2.195731389500 -1.484942023100 + 0.965241154350 0.249401906810 + 0.162248187410 0.184150027830 +Gd S + 1.098367709400 0.332219940840 + 0.432548270620 0.524530415210 +Gd S + 0.064209408826 0.603912156030 + 0.033870628205 0.929468533350 +Gd P + 2.427873110800 0.198067677630 + 2.311779823100 -0.357983931930 + 0.739703409910 0.578006472370 + 0.366725221840 0.189778608390 +Gd P + 0.227828614570 1.000000000000 +Gd D + 0.917667303880 -0.016219993919 + 0.529461794640 0.687152060260 + 0.221266400270 0.194933590720 +Gd D + 0.150791182580 0.317109114010 + 0.055406934975 0.219244642480 +Gd F + 52.702891850000 0.037944092036 + 18.968629359000 0.172950486860 + 
7.483761636800 0.339296635260 + 3.124551141300 0.409244092080 + 1.297917258000 0.266432518960 +Gd F + 0.882336787390 0.516585464810 + 0.384484579370 0.873328896390 + +#BASIS SET: +Tb S + 3.201386505200 0.411894267370 + 2.126784415700 -1.506935953000 + 0.999097495030 0.254535056700 + 0.157417449500 0.117990777940 +Tb S + 1.023627053700 0.340110769870 + 0.430697801590 0.454888900280 +Tb S + 0.069974898990 0.698643296210 + 0.032940990975 0.740386009190 +Tb P + 2.073989571400 -0.343595673730 + 1.828471268600 0.215362185340 + 0.775726769020 0.490677935480 + 0.384091371450 0.256087426350 +Tb P + 0.217915710920 1.000000000000 +Tb D + 0.825171758200 -0.015782273999 + 0.600154080990 0.515824182390 + 0.239457542090 0.219930276580 +Tb D + 0.170840294520 0.339344511810 + 0.066393296853 0.179617582290 +Tb F + 56.589558964000 0.038014387935 + 20.155230196000 0.179793848070 + 7.820360552900 0.362111598160 + 3.178882888700 0.431909631700 + 1.289918311100 0.254255670440 +Tb F + 0.942818356420 0.524283327370 + 0.375168165820 0.774641908370 + +#BASIS SET: +Dy S + 3.077638972300 0.474009059980 + 2.144516903200 -1.571543383700 + 1.032524860600 0.251872641160 + 0.149884855500 0.115997598420 +Dy S + 1.187918492500 0.371841314040 + 0.439964465440 0.439801832140 +Dy S + 0.049122676587 0.662889360870 + 0.020927956353 0.758733207730 +Dy P + 2.282049763500 -0.335364965580 + 2.149986794100 0.212987377950 + 0.778513739630 0.469808752900 + 0.375219342670 0.251032791170 +Dy P + 0.199939554420 1.000000000000 +Dy D + 0.832650334010 -0.016672954384 + 0.586505648120 0.572585525810 + 0.249679875370 0.220091894330 +Dy D + 0.180105077260 0.344582383530 + 0.065825970099 0.191502604620 +Dy F + 57.861342453000 0.038972875107 + 20.635393825000 0.181045246390 + 8.004535365600 0.359424664790 + 3.226449556100 0.423360346770 + 1.294175171300 0.256326644590 +Dy F + 0.881165570020 0.496894803850 + 0.339754421590 0.786379098440 + +#BASIS SET: +Ho S + 3.215855287600 0.435915312520 + 2.254633264000 -1.409057919500 + 0.926678464460 0.236812959040 + 0.197978695910 0.113859531790 +Ho S + 1.074584686700 0.349159708930 + 0.433613079530 0.295799100520 +Ho S + 0.053966343732 0.578706693790 + 0.021991634871 0.803713364560 +Ho P + 2.276740546700 -0.309919424620 + 1.875689434100 0.222433385250 + 0.769351845080 0.483617009030 + 0.342467015030 0.282620610930 +Ho P + 0.189559521080 1.000000000000 +Ho D + 0.723086365990 -0.014022935559 + 0.636267204790 0.519060618500 + 0.234931783170 0.244755281630 +Ho D + 0.179485956040 0.338303537140 + 0.073557821351 0.179021787710 +Ho F + 60.830053758000 0.038490576289 + 21.588065934000 0.181910610310 + 8.250479411400 0.365992385350 + 3.275619486500 0.415195997170 + 1.280581763200 0.223432973500 +Ho F + 1.061858771100 0.560929808620 + 0.378053363800 0.866634478600 + +#BASIS SET: +Er S + 3.300008194300 0.449912443110 + 2.261456184800 -1.616929874300 + 0.901142008760 0.243660305590 + 0.185129017200 0.116060607530 +Er S + 1.234831222100 0.310759560470 + 0.435101114100 0.233295092470 +Er S + 0.039648759574 0.783823145590 + 0.036670951258 0.635200383500 +Er P + 2.327643667900 -0.301384780980 + 1.853492883500 0.228041624770 + 0.774338801330 0.459307886770 + 0.361332277440 0.285507707070 +Er P + 0.186448517290 1.000000000000 +Er D + 0.900253313010 -0.017255084799 + 0.645098906620 0.519588132000 + 0.171165586210 0.251556771310 +Er D + 0.177202737280 0.375443551790 + 0.063204717631 0.194404298070 +Er F + 66.650276003000 0.038515010289 + 23.862873532000 0.185215593670 + 9.413269867800 0.371237926920 + 3.903703455900 
0.442139749250 + 1.628218800500 0.297041225020 +Er F + 0.964181588410 0.515260179640 + 0.417374040540 0.679879748240 + +#BASIS SET: +Tm S + 3.529951967700 0.440030003130 + 2.373264752500 -1.676147150600 + 0.859732693780 0.244015605370 + 0.188994272710 0.126181490180 +Tm S + 1.237714700000 0.353524952200 + 0.437883330560 0.257138376250 +Tm S + 0.056601467736 0.751591102950 + 0.034452658435 0.665435412990 +Tm P + 2.389415038600 -0.298798971110 + 1.855600057200 0.231839187010 + 0.806657888830 0.465772792620 + 0.364317357270 0.303562724750 +Tm P + 0.189679696220 1.000000000000 +Tm D + 0.842104081560 -0.016821641435 + 0.642693889420 0.525275761120 + 0.174205588420 0.254940589090 +Tm D + 0.185376205340 0.386133952670 + 0.066999787462 0.185960001190 +Tm F + 69.098639820000 0.039780930603 + 24.622398191000 0.189150061040 + 9.670584964100 0.377486221370 + 3.967675958400 0.447132729830 + 1.602691573900 0.305383090360 +Tm F + 0.862982955250 0.539209910530 + 0.387135399100 0.660330253160 + +#BASIS SET: +Yb S + 4.129766509200 0.314141037340 + 2.367490669700 -1.896766067000 + 0.806650220260 0.225771432160 + 0.202273850080 0.109885398660 +Yb S + 1.438428189100 0.334354680940 + 0.458551811280 0.223260919060 +Yb S + 0.060105436712 0.719579093490 + 0.035482632702 0.643413955490 +Yb P + 2.471659314300 -0.295125193320 + 1.860993668600 0.237399570240 + 0.823485411640 0.457292600590 + 0.367510901360 0.351242665330 +Yb P + 0.171373315040 1.000000000000 +Yb D + 0.805143471510 -0.017580695087 + 0.498184965450 0.507346270890 + 0.183187725480 0.270574390960 +Yb D + 0.048692800777 0.230634878920 + 0.002065313870 0.297864857370 +Yb F + 69.918468598000 0.041342514961 + 24.947950088000 0.192551350850 + 9.765463162000 0.376774380600 + 4.011151681600 0.429599734010 + 1.635982295700 0.293066793210 +Yb F + 0.885391198850 0.546626208600 + 0.427393308340 0.733864066460 + +#BASIS SET: +Lu S + 3.997702916200 0.457441680180 + 2.709261988900 -1.540133835600 + 0.902625980060 0.245518229230 + 0.197099834360 0.103486958220 +Lu S + 1.249815976000 0.312543278220 + 0.465480048010 0.234491973770 +Lu S + 0.080960274419 0.624707137520 + 0.047485302330 0.624526188080 +Lu P + 2.581464009800 -0.297208403370 + 1.976311904200 0.250055874640 + 0.835207096090 0.450659846040 + 0.356413889420 0.305459141120 +Lu P + 0.156421764500 1.000000000000 +Lu D + 0.917062629490 -0.017339228386 + 0.572775203150 0.503610273730 + 0.168191033340 0.248443090760 +Lu D + 0.086838737228 0.311494627600 + 0.038900529418 0.217449926630 +Lu F + 73.095465952000 0.042169444125 + 26.023715946000 0.195911479400 + 10.230613237000 0.380324283710 + 4.182255200200 0.437222388230 + 1.656374469500 0.297996648530 +Lu F + 0.826840213730 0.493037406260 + 0.497649365380 0.842886162920 + +#BASIS SET: +Hf S + 5.745263913265 0.276398505380 + 3.894142553735 -0.839898489463 + 3.121027335851 0.474240430954 + 0.635496702086 0.158777918809 +Hf S + 0.764844351020 1.707468977910 + 0.290339685115 0.519137602207 +Hf S + 0.082091325177 1.958910571765 + 0.025505481057 0.408538141346 +Hf P + 8.047349756152 0.134374727878 + 5.353154905330 -0.304612877776 + 0.989731302776 0.607862566692 + 0.373084955619 0.433934935433 +Hf P + 0.042930780688 1.371306677174 + 0.018204544818 0.809563007668 +Hf D + 3.430686255910 -0.067789549648 + 1.284638756596 0.283064494075 + 0.433718517572 0.619508023969 + 0.150295851218 0.390710591201 +Hf D + 0.072503797491 0.794719269255 + 0.027061821278 0.342285689082 + +#BASIS SET: +Ta S + 5.963803707657 0.279364073058 + 3.961116677799 -0.850260809206 + 3.085766632618 
0.479568840294 + 0.665808701395 0.193394187367 +Ta S + 0.805972319842 1.688246848358 + 0.268930532034 0.379133660479 +Ta S + 0.086553429743 2.595263529220 + 0.027791157335 0.533383099437 +Ta P + 8.182904208394 0.136625206869 + 5.419010727730 -0.317091050150 + 1.080897305362 0.587463210567 + 0.427101858969 0.435988981255 +Ta P + 0.123665166522 1.348117857613 + 0.074631391466 0.779829286766 +Ta D + 3.938780857951 -0.057900768529 + 1.367499138219 0.225894545813 + 0.629935945405 0.379219030762 + 0.273092407860 0.508381938982 +Ta D + 0.099925683041 0.812911312187 + 0.037943852635 0.332399580288 + +#BASIS SET: +W S + 6.106125947092 0.277025340819 + 4.127702570212 -0.844730321999 + 3.274970817330 0.483142244883 + 0.733004472901 0.158006598997 +W S + 0.841444067228 1.644909883363 + 0.271780212684 0.409568828017 +W S + 0.100331812736 1.869589815121 + 0.029632732761 0.528929932062 +W P + 8.285553529800 0.157354211889 + 5.615678187579 -0.352854952607 + 1.158573046813 0.612420934890 + 0.467617380312 0.446722978433 +W P + 0.137270627660 0.964875039887 + 0.082293678701 0.246586455905 +W D + 4.192126847007 -0.054899215601 + 1.448413562415 0.216997892231 + 0.626627825114 0.432982414411 + 0.249124670225 0.423123825902 +W D + 0.097421390343 1.090266290688 + 0.036447054814 0.478789058281 + +#BASIS SET: +Re S + 6.488782003396 0.257736215445 + 4.217568459632 -0.822875620070 + 3.262879428220 0.479067032833 + 0.754784222831 0.203147240894 +Re S + 0.941081734619 1.881764103129 + 0.272712355739 0.429341141770 +Re S + 0.111126860141 2.255322226400 + 0.040009355137 1.259531768378 +Re P + 7.520497681685 0.496927359844 + 6.498258506185 -0.676809195328 + 1.216427394186 0.572364689927 + 0.495158438289 0.396340498872 +Re P + 0.136186441470 4.165341545653 + 0.082441357688 0.908639271543 +Re D + 4.424276090621 -0.075416964612 + 1.537528643872 0.289740692768 + 0.720850710433 0.511674499486 + 0.312377703413 0.543026746807 +Re D + 0.122723095719 0.419130234729 + 0.050344645461 0.124831357094 + +#BASIS SET: +Os S + 6.589301521031 0.262324014895 + 4.483410826406 -0.824151542759 + 3.598784747600 0.488236883338 + 0.830482873892 0.148959289850 +Os S + 0.974401985901 1.704276404135 + 0.337099880208 0.419939365808 +Os S + 0.119371477901 1.190416170712 + 0.040334652774 0.415098407891 +Os P + 8.018868508095 0.366991980684 + 6.516333088213 -0.572512459986 + 1.298661604992 0.634399744500 + 0.537047008060 0.437296123703 +Os P + 0.163425595329 2.846863530709 + 0.046752457019 0.349464207023 +Os D + 4.562022886893 -0.075845399099 + 1.700498170420 0.262012418274 + 0.783963505481 0.489253051009 + 0.333034419339 0.480607051003 +Os D + 0.128320311421 1.946527751776 + 0.051494690010 0.291341732890 + +#BASIS SET: +Ir S + 6.990696480869 0.238565941294 + 4.657628826621 -0.824425741900 + 3.822133240323 0.507613356927 + 0.888825622345 0.119157387695 +Ir S + 1.023793770162 1.457427286127 + 0.309199315635 0.281543512886 +Ir S + 0.127967479426 1.171391362467 + 0.040088560933 0.521901566386 +Ir P + 8.689088185082 0.243119322841 + 6.577366558565 -0.444544360607 + 1.356413354364 0.645655248595 + 0.544928093624 0.402704630432 +Ir P + 0.140794381741 0.983202269789 + 0.048851626916 0.535708402160 +Ir D + 4.665178664686 -0.081060285159 + 1.806050424121 0.274531805989 + 0.787394625251 0.509312962011 + 0.321373987554 0.421907340162 +Ir D + 0.121625299200 1.542032145949 + 0.049105698957 0.215355952279 + +#BASIS SET: +Pt S + 7.679412615881 0.230877908692 + 4.709087759587 -0.861654782814 + 3.645313517688 0.541904557353 + 0.953420950841 0.472553125277 +Pt S + 
1.027769770519 1.543816333192 + 0.895827169520 0.276175320568 +Pt S + 0.139555297410 1.211933971063 + 0.049340522870 0.547971738045 +Pt P + 8.289144538801 0.712848039607 + 7.316311140271 -0.940414367671 + 1.449109980349 0.710309776673 + 0.609243388052 0.456919979943 +Pt P + 0.193683119741 2.292500144336 + 0.044166144409 0.224210089383 +Pt D + 4.910332949872 -0.079625149422 + 1.938691426506 0.267813136394 + 0.864747561557 0.492535757006 + 0.357332283630 0.415036501570 +Pt D + 0.137946336644 2.006060038467 + 0.050242829695 0.262075608301 + +#BASIS SET: +Au S + 8.482247329754 0.229037007166 + 5.008887371391 -0.907405209082 + 3.953463080432 0.532729463041 + 1.016674280753 0.288719412142 +Au S + 1.244532317707 1.102769284388 + 0.309885825329 0.343912022063 +Au S + 0.147596397487 1.099367153402 + 0.047719646781 0.522674919172 +Au P + 8.831554690513 0.509475138022 + 7.566282794345 -0.721074776360 + 1.506700409832 0.698388067412 + 0.612822693070 0.409074097606 +Au P + 0.176221965082 2.730273272472 + 0.061994012169 1.122895118562 +Au D + 5.149888373979 -0.076940684362 + 2.038287817758 0.266148488840 + 0.898017206860 0.479912521191 + 0.362635858492 0.377471368469 +Au D + 0.138458740562 2.462111180979 + 0.046715591266 0.250738248974 + +#BASIS SET: +Hg S + 9.562300229367 0.181250771129 + 5.079395532020 -0.869350443981 + 3.982850471475 0.486376181267 + 0.961127400893 0.258983305153 +Hg S + 1.377803059990 1.722106394360 + 0.381440671172 0.250455726703 +Hg S + 0.172351434399 1.100334941673 + 0.063943840693 0.677823980347 +Hg P + 9.893101932495 0.218067320959 + 7.423812439662 -0.420280763099 + 1.600738605984 0.669305658396 + 0.652677096183 0.386781925502 +Hg P + 0.183498087066 2.466433663901 + 0.064592858638 1.561759630522 +Hg D + 5.315692442902 -0.078574152910 + 2.245554598474 0.243559945187 + 1.001947266394 0.451151800485 + 0.413902886479 0.345923827411 +Hg D + 0.167213642564 2.300779412806 + 0.067098161686 0.210685863765 + +#BASIS SET: +Tl S + 1.505313049428 0.313662634995 + 0.926406638906 -0.722746145099 + 0.195472441877 0.634249265129 + 0.079197571540 0.616447338532 +Tl S + 0.179423367042 0.144781868221 + 0.072061846216 0.890059337687 +Tl P + 1.340445390895 0.090978395301 + 0.866604785093 -0.259543806887 + 0.217123440967 0.362159556277 + 0.091408672799 0.365315709408 +Tl P + 0.591817274684 0.018848049794 + 0.043682496425 1.416103944518 +Tl D + 0.117314837048 0.604178245689 + 0.054459036250 0.886013825833 + +#BASIS SET: +Pb S + 1.341753004241 0.775578610898 + 1.105497689546 -1.202765308935 + 0.211699001507 0.723705670622 + 0.129547855691 0.256761268066 +Pb S + 0.549706034009 0.131886387067 + 0.063284849518 1.510485987615 +Pb P + 1.414803989187 0.152750104080 + 1.024069215145 -0.288877125439 + 0.186029005782 0.437219972839 + 0.082796300988 0.118522273753 +Pb P + 0.331088269580 0.201768239863 + 0.046482916169 3.750488820230 +Pb D + 0.200493671996 0.835023372291 + 0.088262123032 1.095405604994 + +#BASIS SET: +Bi S + 1.598452354960 0.090505383135 + 1.024031934201 -0.505556064927 + 0.211694361874 0.733716856593 + 0.188315325223 0.112026081947 +Bi S + 0.302425752233 0.209510573559 + 0.084994052070 0.597248589405 +Bi P + 1.494708346058 0.395953810643 + 1.254367409879 -0.565931848387 + 0.207175316033 0.638559022440 + 0.075039529988 0.141797873604 +Bi P + 0.199985131181 0.116806624587 + 0.062373215520 3.086007117955 +Bi D + 0.125642967394 1.009438999376 + 0.092809453349 0.739934271101 + +#BASIS SET: +Po S + 1.899315412209 0.398802099258 + 1.330205684309 -0.819810438961 + 0.295449029905 0.530266670623 + 
0.199451922422 0.183432964269 +Po S + 0.315025299523 0.224516411804 + 0.104988411226 0.818764958809 +Po P + 1.611909638782 0.155499395899 + 1.099867763114 -0.399122211489 + 0.206617899935 0.547783019859 + 0.120990132583 0.207180326062 +Po P + 0.471197704174 0.778854060703 + 0.062603532297 1.066397923226 +Po D + 0.165433941928 0.960300261731 + 0.062674049116 0.786318229414 + +#BASIS SET: +At S + 1.924148580749 0.525856661156 + 1.346688705121 -1.067720621131 + 0.359439113226 0.638723774762 + 0.188782116680 0.502953817840 +At S + 0.436733046143 0.086926389554 + 0.086660202651 0.943503262571 +At P + 2.082544353701 0.062574676624 + 1.160295712623 -0.258455276224 + 0.350563463518 0.458257809303 + 0.160220695112 0.515683272949 +At P + 0.191314694381 0.104704037336 + 0.069240934858 0.929284118241 +At D + 0.286195490906 0.174109793764 + 0.168758025833 1.048978368888 + +#BASIS SET: +Rn S + 1.974019782555 0.658758503133 + 1.478356500621 -1.204012936389 + 0.338292719593 0.976772343385 + 0.138050661999 0.215247233371 +Rn S + 0.239238522409 0.078556974413 + 0.115156734440 1.152238798180 +Rn P + 1.866046112655 0.255378767651 + 1.572334840818 -0.377681250068 + 0.317716247096 0.531475806749 + 0.141239873985 0.285072318557 +Rn P + 0.200353951276 0.101799470336 + 0.077190057936 0.881907208554 +Rn D + 0.333065558020 0.245718887909 + 0.214108626479 1.219897201244 + +END diff --git a/gpu4pyscf/drivers/dft_3c_driver.py b/gpu4pyscf/drivers/dft_3c_driver.py new file mode 100644 index 00000000..adf9fe8f --- /dev/null +++ b/gpu4pyscf/drivers/dft_3c_driver.py @@ -0,0 +1,409 @@ +# Copyright 2021-2024 The PySCF Developers. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
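+
+# Example invocation (the --config/--charge/--spin flags are defined in the
+# argparse block at the bottom of this file; sample configs such as
+# dft_b973c_sample.json ship alongside this driver):
+#   python3 dft_3c_driver.py --config dft_b973c_sample.json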
+
+###################################################################
+# This is a customized driver for three composite methods only.
+# It only works for b97-3c, r2scan-3c, and wb97x-3c.
+###################################################################
+
+import os
+import time
+import json
+import pyscf
+import argparse
+import tempfile
+import shutil
+import cupy
+import traceback
+import h5py
+import numpy as np
+from types import MethodType
+from pyscf import lib
+from pyscf import dft
+from pyscf.hessian import thermo
+from pyscf.lib import logger
+from pyscf.dispersion import dftd3, dftd4, gcp
+
+CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
+
+import importlib.metadata
+required_version = "1.3.0"
+installed_version = importlib.metadata.version('pyscf-dispersion')
+# Compare version components numerically; a plain string comparison would
+# reject e.g. "1.10.0" as older than "1.3.0".
+assert tuple(map(int, installed_version.split('.')[:3])) >= \
+    tuple(map(int, required_version.split('.')))
+
+def parse_3c(xc_name):
+    """
+    Return xc, nlc, basis, ecp, (xc_disp, disp), xc_gcp
+    """
+    if xc_name == 'b973c':
+        return 'GGA_XC_B97_3C', 0, 'def2-mtzvp', None, ('b97-3c', 'D3BJ'), 'b973c'
+    elif xc_name == 'r2scan3c':
+        return 'r2scan', 0, 'def2-mtzvpp', None, ('r2scan-3c', 'D4'), 'r2scan3c'
+    elif xc_name == 'wb97x3c':
+        # 'Grimme vDZP' is available in the BSE, but pyscf 2.8 is not able to
+        # parse its ECP properly.
+        # basis = 'Grimme vDZP'
+        # ecp = 'Grimme vDZP'
+        basis = os.path.join(CURRENT_DIR, 'basis_vDZP_NWCHEM.dat')
+        ecp = os.path.join(CURRENT_DIR, 'ecp_vDZP_NWCHEM.dat')
+        return 'wb97x-v', 0, basis, ecp, ('wb97x-3c', 'D4'), None
+    else:
+        raise RuntimeError('Unknown xc functional for parsing 3c')
+
+def get_dispersion(mol, xc, grad=True):
+    if xc == 'b97-3c':
+        d3_model = dftd3.DFTD3Dispersion(mol, xc=xc, atm=True)
+        res = d3_model.get_dispersion(grad=grad)
+    elif xc == 'r2scan-3c':
+        # r2scan-3c uses customized parameters
+        # https://github.com/psi4/psi4/blob/0e54962d629494f4ed142d0499d7faeaf36effdd/psi4/driver/procrouting/dft/mgga_functionals.py#L250
+        d4_model = dftd4.DFTD4Dispersion(mol, xc=xc, atm=True, ga=2.0, gc=1.0)
+        d4_model.set_param(0.0, 0.42, 5.65, s9=2.0)
+        res = d4_model.get_dispersion(grad=grad)
+    elif xc == 'wb97x-3c':
+        d4_model = dftd4.DFTD4Dispersion(mol, xc=xc, atm=True)
+        res = d4_model.get_dispersion(grad=grad)
+    else:
+        raise NotImplementedError
+    return res
+
+def gen_disp_fun(xc_disp, xc_gcp):
+    """
+    Generate a function to calculate the sum of the dispersion and gCP contributions
+    """
+    def get_disp(mf, disp=None, with_3body=None, verbose=None):
+        mol = mf.mol
+        energy = 0.0
+        if xc_disp is not None:
+            res = get_dispersion(mol, xc_disp, grad=False)
+            energy += res.get('energy')
+        mf.scf_summary['dispersion'] = energy
+        if xc_gcp is not None:
+            gcp_model = gcp.GCP(mol, method=xc_gcp)
+            res = gcp_model.get_counterpoise()
+            energy += res['energy']
+        return energy
+    return get_disp
+
+def gen_disp_grad_fun(xc_disp, xc_gcp):
+    """
+    Generate a function to calculate the gradient of the dispersion + gCP contributions
+    """
+    def get_disp_grad(mf_grad, disp=None, with_3body=None, verbose=None):
+        mf = mf_grad.base
+        mol = mf.mol
+        gradient = 0.0
+        if xc_disp is not None:
+            res = get_dispersion(mol, xc_disp, grad=True)
+            gradient += res.get('gradient')
+
+        if xc_gcp is not None:
+            gcp_model = gcp.GCP(mol, method=xc_gcp)
+            res = gcp_model.get_counterpoise(grad=True)
+            gradient += res['gradient']
+        return gradient
+    return get_disp_grad
+
+def gen_disp_hess_fun(xc_disp, xc_gcp):
+    """
+    Generate a function to calculate the Hessian of the dispersion + gCP contributions
+    """
+    def get_disp_hess(mf_hess, disp=None, with_3body=None):
+        mf = mf_hess.base
+        mol = mf.mol
+        natm = mol.natm
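+        # Build the dispersion + gCP Hessian by central finite differences of
+        # the analytical gradient: each Cartesian coordinate is displaced by
+        # +/- eps Bohr and h_disp[i,:,j,:] = (g1 - g2) / (2*eps).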
+        h_disp = np.empty([natm,natm,3,3])
+
+        coords = mf_hess.mol.atom_coords()
+        mol = mol.copy()
+        eps = 1e-5
+        for i in range(natm):
+            for j in range(3):
+                coords[i,j] += eps
+                mol.set_geom_(coords, unit='Bohr')
+                g1 = 0.0
+                if xc_disp is not None:
+                    res = get_dispersion(mol, xc_disp, grad=True)
+                    g1 += res.get('gradient')
+                if xc_gcp is not None:
+                    gcp_model = gcp.GCP(mol, method=xc_gcp)
+                    res = gcp_model.get_counterpoise(grad=True)
+                    g1 += res['gradient']
+
+                coords[i,j] -= 2.0*eps
+                mol.set_geom_(coords, unit='Bohr')
+                g2 = 0.0
+                if xc_disp is not None:
+                    res = get_dispersion(mol, xc_disp, grad=True)
+                    g2 += res.get('gradient')
+                if xc_gcp is not None:
+                    gcp_model = gcp.GCP(mol, method=xc_gcp)
+                    res = gcp_model.get_counterpoise(grad=True)
+                    g2 += res['gradient']
+
+                coords[i,j] += eps
+                h_disp[i,:,j,:] = (g1 - g2)/(2.0*eps)
+        return h_disp
+    return get_disp_hess
+
+def run_dft(mol_name, config, charge=None, spin=0):
+    ''' Perform DFT calculations based on the configuration file.
+    Save the results, timing, and log to an HDF5 file.
+    '''
+    xc = config.get('xc', 'b3lyp')
+    grids = config.get('grids', {'atom_grid': (99,590)})
+    nlcgrids = config.get('nlcgrids', {'atom_grid': (50,194)})
+    verbose = config.get('verbose', 4)
+    scf_conv_tol = config.get('scf_conv_tol', 1e-10)
+    direct_scf_tol = config.get('direct_scf_tol', 1e-14)
+    with_df = config.get('with_df', True)
+    auxbasis = config.get('auxbasis', 'def2-universal-jkfit')
+    with_gpu = config.get('with_gpu', True)
+
+    with_grad = config.get('with_grad', True)
+    with_hess = config.get('with_hess', True)
+    with_thermo = config.get('with_thermo', False)
+    save_density = config.get('save_density', False)
+    input_dir = config.get('input_dir', './')
+
+    default_solvent = {'method': 'iefpcm', 'eps': 78.3553, 'solvent': 'water'}
+    with_solvent = config.get('with_solvent', False)
+    solvent = config.get('solvent', default_solvent)
+
+    pyscf_xc, nlc, basis, ecp, (xc_disp, disp), xc_gcp = parse_3c(xc)
+
+    # I/O
+    fp = tempfile.TemporaryDirectory()
+    local_dir = f'{fp.name}/'
+    logfile = f'{mol_name[:-4]}_pyscf.log'
+    shutil.copyfile(f'{input_dir}/{mol_name}', local_dir+mol_name)
+    cupy.get_default_memory_pool().free_all_blocks()
+    lib.num_threads(8)
+    start_time = time.time()
+    mol = pyscf.M(
+        atom=local_dir+mol_name,
+        basis=basis,
+        ecp=ecp,
+        max_memory=32000,
+        verbose=verbose,
+        charge=charge,
+        spin=spin,
+        output=f'{local_dir}/{logfile}')
+    mol.build()
+
+    mf = dft.KS(mol, xc=pyscf_xc)
+    if 'atom_grid' in grids: mf.grids.atom_grid = grids['atom_grid']
+    if 'level' in grids: mf.grids.level = grids['level']
+    if mf._numint.libxc.is_nlc(mf.xc):
+        if 'atom_grid' in nlcgrids: mf.nlcgrids.atom_grid = nlcgrids['atom_grid']
+        if 'level' in nlcgrids: mf.nlcgrids.level = nlcgrids['level']
+
+    if with_df:
+        mf = mf.density_fit(auxbasis=auxbasis)
+    if with_gpu:
+        mf = mf.to_gpu()
+
+    #### Changes for 3C methods #####
+    # Set up the dispersion correction and gCP
+    mf.nlc = nlc
+    mf.get_dispersion = MethodType(gen_disp_fun(xc_disp, xc_gcp), mf)
+    mf.do_disp = lambda: True
+    #################################
+
+    mf.chkfile = None
+    if with_solvent:
+        if solvent['method'].endswith(('PCM', 'pcm')):
+            mf = mf.PCM()
+            mf.with_solvent.lebedev_order = 29
+            mf.with_solvent.method = solvent['method'].replace('PCM','-PCM')
+            mf.with_solvent.eps = solvent['eps']
+        elif solvent['method'].endswith(('smd', 'SMD')):
+            mf = mf.SMD()
+            mf.with_solvent.lebedev_order = 29
+            mf.with_solvent.method = 'SMD'
+            mf.with_solvent.solvent = solvent['solvent']
+        else:
+            raise NotImplementedError
+
+    mf.direct_scf_tol = direct_scf_tol
+    mf.chkfile = None
+    mf.conv_tol = scf_conv_tol
+    e_tot = mf.kernel()
+
+    if not mf.converged:
+        logger.warn(mf, 'SCF failed to converge')
+
+    scf_time = time.time() - start_time
+    print(f'compute time for energy: {scf_time:.3f} s')
+
+    e1 = mf.scf_summary.get('e1', 0.0)
+    e_coul = mf.scf_summary.get('coul', 0.0)
+    e_xc = mf.scf_summary.get('exc', 0.0)
+    e_disp = mf.scf_summary.get('dispersion', 0.0)
+    e_solvent = mf.scf_summary.get('e_solvent', 0.0)
+
+    data_file = mol_name[:-4] + '_pyscf.h5'
+
+    with h5py.File(f'{local_dir}/{data_file}', 'w') as h5f:
+        h5f.create_dataset('e_tot', data=e_tot)
+        h5f.create_dataset('e1', data=e1)
+        h5f.create_dataset('e_coul', data=e_coul)
+        h5f.create_dataset('e_xc', data=e_xc)
+        h5f.create_dataset('e_disp', data=e_disp)
+        h5f.create_dataset('e_solvent', data=e_solvent)
+        h5f.create_dataset('scf_time', data=scf_time)
+
+        dm = mf.make_rdm1()
+        if isinstance(dm, cupy.ndarray): dm = dm.get()
+        h5f.create_dataset('dm', data=dm)
+
+        if save_density and xc.lower() != 'hf':
+            weights = mf.grids.weights
+            coords = mf.grids.coords
+            dm0 = dm[0] + dm[1] if dm.ndim == 3 else dm
+            rho = mf._numint.get_rho(mf.mol, dm0, mf.grids)
+
+            if isinstance(weights, cupy.ndarray): weights = weights.get()
+            if isinstance(coords, cupy.ndarray): coords = coords.get()
+            if isinstance(rho, cupy.ndarray): rho = rho.get()
+
+            h5f.create_dataset('grids_weights', data=weights)
+            h5f.create_dataset('grids_coords', data=coords)
+            h5f.create_dataset('grids_rho', data=rho)
+
+        if dm.ndim == 3:
+            # open-shell case
+            mo_energy = mf.mo_energy
+            if isinstance(mo_energy, cupy.ndarray): mo_energy = mo_energy.get()
+            mo_energy[0].sort()
+            mo_energy[1].sort()
+            na, nb = mf.nelec
+            h5f.create_dataset('e_lumo_alpha', data=mo_energy[0][na])
+            h5f.create_dataset('e_lumo_beta', data=mo_energy[1][nb])
+            h5f.create_dataset('e_homo_alpha', data=mo_energy[0][na-1])
+            h5f.create_dataset('e_homo_beta', data=mo_energy[1][nb-1])
+        else:
+            # closed-shell case
+            mo_energy = mf.mo_energy
+            if isinstance(mo_energy, cupy.ndarray): mo_energy = mo_energy.get()
+            mo_energy.sort()
+            nocc = mf.mol.nelectron // 2
+            h5f.create_dataset('e_lumo', data=mo_energy[nocc])
+            h5f.create_dataset('e_homo', data=mo_energy[nocc-1])
+
+    ##################### Gradient Calculation ###############################
+    g = None
+    if with_grad:
+        try:
+            start_time = time.time()
+            g = mf.nuc_grad_method()
+            # Override get_dispersion for the 3c methods
+            g.get_dispersion = MethodType(gen_disp_grad_fun(xc_disp, xc_gcp), g)
+            if with_df:
+                g.auxbasis_response = True
+            f = g.kernel()
+            g = None
+            grad_time = time.time() - start_time
+            print(f'compute time for gradient: {grad_time:.3f} s')
+        except Exception as exc:
+            print(traceback.format_exc())
+            print(exc)
+            f = -1
+            grad_time = -1
+
+        with h5py.File(f'{local_dir}/{data_file}', 'a') as h5f:
+            h5f.create_dataset('grad', data=f)
+            h5f.create_dataset('grad_time', data=grad_time)
+
+    #################### Hessian Calculation ###############################
+    h = None
+    if with_hess:
+        try:
+            natm = mol.natm
+            start_time = time.time()
+            h = mf.Hessian()
+            # Override get_dispersion for the 3c methods
+            h.get_dispersion = MethodType(gen_disp_hess_fun(xc_disp, xc_gcp), h)
+            h.auxbasis_response = 2
+            _h_dft = h.kernel()
+            h_dft = _h_dft.transpose([0,2,1,3]).reshape([3*natm, 3*natm])
+            hess_time = time.time() - start_time
+            print(f'compute time for hessian: {hess_time:.3f} s')
+
+            if with_thermo:
+                # harmonic analysis
+                start_time = time.time()
+                normal_mode = thermo.harmonic_analysis(mol, _h_dft)
+
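+                # thermo.thermo() evaluates ideal-gas rigid-rotor/harmonic-
+                # oscillator thermochemistry from the harmonic frequencies at
+                # the given temperature (K) and pressure (Pa).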
+                thermo_dat = thermo.thermo(
+                    mf,                      # GPU4PySCF object
+                    normal_mode['freq_au'],
+                    298.15,                  # room temperature
+                    101325)                  # standard atmosphere
+                thermo_time = time.time() - start_time
+                print(f'compute time for harmonic analysis: {thermo_time:.3f} s')
+
+        except Exception as exc:
+            print(traceback.format_exc())
+            print(exc)
+            h_dft = -1
+            hess_time = -1
+
+        with h5py.File(f'{local_dir}/{data_file}', 'a') as h5f:
+            h5f.create_dataset('hess', data=h_dft)
+            h5f.create_dataset('hess_time', data=hess_time)
+
+            if with_thermo:
+                h5f.create_dataset('freq_au', data=normal_mode['freq_au'])
+                h5f.create_dataset('freq_wavenumber', data=normal_mode['freq_wavenumber'])
+                h5f.create_dataset('E_tot', data=thermo_dat['E_tot'][0])
+                h5f.create_dataset('H_tot', data=thermo_dat['H_tot'][0])
+                h5f.create_dataset('G_tot', data=thermo_dat['G_tot'][0])
+                h5f.create_dataset('E_elec', data=thermo_dat['E_elec'][0])
+                h5f.create_dataset('E_trans', data=thermo_dat['E_trans'][0])
+                h5f.create_dataset('E_rot', data=thermo_dat['E_rot'][0])
+                h5f.create_dataset('E_vib', data=thermo_dat['E_vib'][0])
+                h5f.create_dataset('E_0K', data=thermo_dat['E_0K'][0])
+                h5f.create_dataset('H_elec', data=thermo_dat['H_elec'][0])
+                h5f.create_dataset('H_trans', data=thermo_dat['H_trans'][0])
+                h5f.create_dataset('H_rot', data=thermo_dat['H_rot'][0])
+                h5f.create_dataset('H_vib', data=thermo_dat['H_vib'][0])
+                h5f.create_dataset('G_elec', data=thermo_dat['G_elec'][0])
+                h5f.create_dataset('G_trans', data=thermo_dat['G_trans'][0])
+                h5f.create_dataset('G_rot', data=thermo_dat['G_rot'][0])
+                h5f.create_dataset('G_vib', data=thermo_dat['G_vib'][0])
+
+    # copy the files to the destination folder
+    output_dir = config['output_dir']
+    os.makedirs(output_dir, exist_ok=True)
+
+    shutil.copyfile(f'{local_dir}/{data_file}', f'{output_dir}/{data_file}')
+    shutil.copyfile(f'{local_dir}/{logfile}', f'{output_dir}/{logfile}')
+
+    return mf
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description='Run DFT with GPU4PySCF for molecules')
+    parser.add_argument("--config", type=str, default='example.json')
+    parser.add_argument("--charge", type=int, default=None)
+    parser.add_argument("--spin", type=int, default=0)
+    args = parser.parse_args()
+
+    with open(args.config) as f:
+        config = json.load(f)
+    if isinstance(config, list):
+        config = config[0]
+    for mol_name in config['molecules']:
+        run_dft(mol_name, config, charge=args.charge, spin=args.spin)
diff --git a/gpu4pyscf/drivers/dft_b973c_sample.json b/gpu4pyscf/drivers/dft_b973c_sample.json
new file mode 100644
index 00000000..9085744c
--- /dev/null
+++ b/gpu4pyscf/drivers/dft_b973c_sample.json
@@ -0,0 +1,26 @@
+[{
+    "input_dir": "./",
+    "output_dir": "./",
+    "molecules": [
+        "h2o.xyz",
+        "h2o.xyz",
+        "h2o.xyz",
+        "h2o.xyz",
+        "h2o.xyz"
+    ],
+    "xc": "b973c",
+    "auxbasis": "def2-universal-JFIT",
+    "verbose": 6,
+    "with_solvent": false,
+    "with_thermo": false,
+    "solvent": {
+        "eps": 78.3553,
+        "solvent": "water",
+        "method": "SMD"
+    },
+    "with_gpu": true,
+    "with_df": true,
+    "with_grad": true,
+    "with_hess": true,
+    "save_density": true
+}]
diff --git a/gpu4pyscf/drivers/dft_r2scan3c_sample.json b/gpu4pyscf/drivers/dft_r2scan3c_sample.json
new file mode 100644
index 00000000..3f0eb6aa
--- /dev/null
+++ b/gpu4pyscf/drivers/dft_r2scan3c_sample.json
@@ -0,0 +1,26 @@
+[{
+    "input_dir": "./",
+    "output_dir": "./",
+    "molecules": [
+        "h2o.xyz",
+        "h2o.xyz",
+        "h2o.xyz",
+        "h2o.xyz",
+        "h2o.xyz"
+    ],
+    "xc": 
"r2scan3c", + "auxbasis": "def2-universal-JFIT", + "verbose": 4, + "with_solvent": false, + "with_thermo": false, + "solvent": { + "eps": 78.3553, + "solvent": "water", + "method": "SMD" + }, + "with_gpu": true, + "with_df": true, + "with_grad": true, + "with_hess": true, + "save_density": false +}] diff --git a/gpu4pyscf/drivers/dft_wb97x3c_sample.json b/gpu4pyscf/drivers/dft_wb97x3c_sample.json new file mode 100644 index 00000000..c32e0cc0 --- /dev/null +++ b/gpu4pyscf/drivers/dft_wb97x3c_sample.json @@ -0,0 +1,25 @@ +[{ + "input_dir": "./", + "output_dir": "./", + "molecules": [ + "h2o.xyz", + "h2o.xyz", + "h2o.xyz", + "h2o.xyz", + "h2o.xyz" + ], + "xc": "wb97x3c", + "verbose": 4, + "with_solvent": false, + "with_thermo": false, + "solvent": { + "eps": 78.3553, + "solvent": "water", + "method": "SMD" + }, + "with_gpu": true, + "with_df": true, + "with_grad": true, + "with_hess": false, + "save_density": false +}] diff --git a/gpu4pyscf/drivers/ecp_vDZP_NWCHEM.dat b/gpu4pyscf/drivers/ecp_vDZP_NWCHEM.dat new file mode 100644 index 00000000..228ad57d --- /dev/null +++ b/gpu4pyscf/drivers/ecp_vDZP_NWCHEM.dat @@ -0,0 +1,1214 @@ + +ECP +B nelec 2 +B S +2 4.50610000 23.99296000 +B P +2 5.60000000 -1.30000000 +B D +2 0.08000000 -0.00300000 +B ul +2 1.00000000 0.00000000 +C nelec 2 +C S +2 6.40105200 33.12163800 +C P +2 7.30774700 -1.98625700 +C D +2 5.96179600 -9.45431800 +C ul +2 1.00000000 0.00000000 +N nelec 2 +N S +2 7.97723200 38.53383100 +N P +2 10.18385400 -2.55081000 +N D +2 11.55994700 -2.99554500 +N ul +2 1.00000000 0.00000000 +O nelec 2 +O S +2 10.44567000 50.77106900 +O P +2 18.04517400 -4.90355100 +O D +2 8.16479800 -3.31212400 +O ul +2 1.00000000 0.00000000 +F nelec 2 +F S +2 22.35040000 102.59795200 +2 11.17520000 19.04966300 +F P +2 26.47680000 -15.14396000 +2 13.23840000 2.80292100 +F D +F ul +2 1.00000000 0.00000000 +Ne nelec 2 +Ne S +2 31.86016200 112.52543566 +2 12.36221900 28.30083454 +Ne P +2 21.50803400 -11.12658543 +2 12.91044700 3.38754919 +Ne D +2 0.85038500 -0.18408921 +Ne ul +2 1.00000000 0.00000000 +Na nelec 2 +Na S +2 2.47830001 -14.53866100 +2 3.09900001 31.91120791 +2 3.94710001 -32.32224607 +1 8.21659994 3.14094701 +0 1.50080000 1.87765500 +Na ul +2 1.60680000 -0.00010200 +2 22.52309990 -1.71544300 +1 76.23649979 -1.36191100 +Mg nelec 2 +Mg S +2 2.95159999 -15.62671006 +2 3.73249999 33.53119421 +2 4.81180000 -36.07307196 +1 9.91949999 3.12963399 +0 1.81850000 1.93843301 +Mg ul +2 1.13200000 -0.00005300 +2 27.30570006 -1.93249400 +1 93.30560017 -1.39384700 +Al nelec 10 +Al S +2 2.19822500 20.40981300 +Al P +2 1.60139500 8.98049500 +Al D +2 1.49902600 -1.97041100 +Al ul +2 1.00000000 0.00000000 +Si nelec 10 +Si S +2 2.71362197 26.62331865 +Si P +2 1.96687987 10.92995391 +Si D +2 2.71001600 -4.66941200 +Si ul +2 1.00000000 0.00000000 +P nelec 10 +P S +2 2.94055986 26.53226131 +P P +2 2.22771255 11.49721021 +P D +2 5.66170600 -16.77278000 +P ul +2 1.00000000 0.00000000 +S nelec 10 +S S +2 3.74389164 37.97481900 +S P +2 3.08608744 18.79052931 +S D +2 4.86241400 -7.83796400 +S ul +2 1.00000000 0.00000000 +Cl nelec 10 +Cl S +2 6.39430000 33.13663196 +2 3.19710000 16.27072783 +Cl P +2 5.62070000 24.41699269 +2 2.81030000 7.68304978 +Cl D +2 5.33810000 -8.58764865 +Cl ul +2 1.00000000 0.00000000 +Ar nelec 10 +Ar S +2 10.26172100 68.66778801 +2 3.95272500 24.04276629 +Ar P +2 5.39271400 27.73076331 +2 2.69996700 4.04545904 +Ar D +2 8.08623500 -8.13747696 +2 4.01663200 -1.66452808 +Ar F +2 5.20845900 -3.40009845 +Ar ul +2 1.00000000 0.00000000 +K nelec 10 +K S +2 
2.41030002 33.92499542 +2 2.77449989 -117.52191162 +2 3.33690000 108.25654602 +1 2.22169995 5.80375385 +0 12.19449997 3.21023202 +K P +2 3.15459990 -69.97570801 +2 4.03189993 178.15260315 +2 5.16720009 -136.11946106 +1 11.33360004 4.43228197 +0 1.74329996 4.74835920 +K ul +2 4.99889994 -1.96356797 +2 14.58220005 -15.53031635 +2 44.89559937 -38.57669830 +1 141.48350525 -7.30101395 +Ca nelec 10 +Ca S +2 2.68330002 29.85718918 +2 3.08299994 -117.43736267 +2 3.70659995 102.65641785 +1 2.47650003 10.56573391 +0 13.28890038 3.14651394 +Ca P +2 3.47429991 -77.17020416 +2 4.38199997 180.64172363 +2 5.66209984 -125.80883789 +1 2.20009995 5.88094378 +0 6.12169981 4.76948595 +Ca ul +2 5.92140007 -2.26729298 +2 17.20369911 -17.17546654 +2 53.37039948 -42.60916519 +1 169.98489380 -7.46766806 +Sc nelec 10 +Sc S +2 11.50000000 138.53815200 +2 5.18400000 14.83404210 +Sc P +2 10.93000000 82.45861400 +2 4.58100000 8.56520569 +Sc D +2 13.47000000 -16.12986210 +2 4.37500000 -0.53469012 +Sc ul +2 1.00000000 0.00000000 +Ti nelec 10 +Ti S +2 13.01000000 158.24159300 +2 5.86200000 17.51182390 +Ti P +2 12.46000000 95.23512680 +2 5.21700000 10.04785600 +Ti D +2 15.35000000 -17.56886120 +2 4.98000000 -0.58725612 +Ti ul +2 1.00000000 0.00000000 +V nelec 10 +V S +2 14.49000000 178.44797100 +2 6.52400000 19.83137520 +V P +2 14.30000000 109.52976300 +2 6.02100000 12.57030950 +V D +2 17.48000000 -19.21965700 +2 5.70900000 -0.64277474 +V ul +2 1.00000000 0.00000000 +Cr nelec 10 +Cr S +2 16.39000000 201.57888700 +2 7.40200000 24.20574090 +Cr P +2 16.45000000 125.02277400 +2 6.96200000 16.47906550 +Cr D +2 19.93000000 -20.82742110 +2 6.59800000 -0.83436781 +Cr ul +2 1.00000000 0.00000000 +Mn nelec 10 +Mn S +2 18.52000000 226.43090200 +2 8.37300000 30.35907230 +Mn P +2 18.92000000 142.15470500 +2 8.01700000 21.53650930 +Mn D +2 22.72000000 -22.56811870 +2 7.64000000 -1.20581020 +Mn ul +2 1.00000000 0.00000000 +Fe nelec 10 +Fe S +2 20.93000000 253.74958800 +2 9.44500000 37.92284500 +Fe P +2 21.76000000 161.03681200 +2 9.17800000 27.65129800 +Fe D +2 25.90000000 -24.43127600 +2 8.83500000 -1.43425100 +Fe ul +2 1.00000000 0.00000000 +Co nelec 10 +Co S +2 23.66000000 283.96056600 +2 10.61000000 47.15684590 +Co P +2 25.04000000 182.21223600 +2 10.44000000 35.23335150 +Co D +2 29.54000000 -26.47533270 +2 10.18000000 -1.82578723 +Co ul +2 1.00000000 0.00000000 +Ni nelec 10 +Ni S +2 26.74000000 317.68227200 +2 11.86000000 58.25539100 +Ni P +2 28.80000000 252.47436600 +2 11.79000000 36.08150310 +Ni D +2 33.70000000 -18.52295510 +2 11.66000000 -4.55766810 +Ni ul +2 1.00000000 0.00000000 +Cu nelec 10 +Cu S +2 30.11054300 355.75051200 +2 13.07631000 70.93090600 +Cu P +2 32.69261400 77.96993100 +2 32.77033900 155.92744800 +2 13.75106700 18.02113200 +2 13.32216600 36.09437200 +Cu D +2 38.99651100 -12.34341000 +2 39.53978800 -18.27336200 +2 12.28751100 -0.98470500 +2 11.45930000 -1.31874700 +Cu F +2 6.19010200 -0.22726400 +2 8.11878000 -0.46877300 +Cu ul +2 1.00000000 0.00000000 +Zn nelec 10 +Zn S +2 34.17400100 399.98639900 +2 14.45637100 85.48975000 +Zn P +2 39.88868300 92.38107700 +2 39.65501700 184.77117600 +2 15.29054600 23.00254100 +2 14.90352400 46.05742700 +Zn D +2 43.70829600 -13.69073400 +2 43.69853600 -20.54398000 +2 15.15071800 -1.31615400 +2 15.28244100 -1.83871500 +Zn F +2 8.16001400 -0.37036000 +2 12.22842200 -1.06294300 +Zn ul +2 1.00000000 0.00000000 +Ga nelec 28 +Ga S +2 5.21596000 203.85397200 +Ga P +2 4.30890400 156.10339000 +Ga D +2 0.49635700 1.03164700 +Ga F +2 1.71517000 -10.67373500 +Ga ul +2 1.00000000 0.00000000 
+Ge nelec 28 +Ge S +2 4.81540900 149.24657900 +Ge P +2 4.16951500 132.84433500 +Ge D +2 0.59195800 1.34615400 +Ge F +2 1.79177000 -7.04422300 +Ge ul +2 1.00000000 0.00000000 +As nelec 28 +As S +2 3.61262500 53.96562000 +As P +2 3.90792600 88.94908800 +As D +2 1.92646700 22.42028800 +As F +2 1.77343400 -4.70481500 +As ul +2 1.00000000 0.00000000 +Se nelec 28 +Se S +2 4.23705700 79.66334500 +Se P +2 2.91033400 31.56099300 +Se D +2 2.33570100 30.80461000 +Se F +2 2.25463900 -6.54687500 +Se ul +2 1.00000000 0.00000000 +Br nelec 28 +Br S +2 5.02180000 61.51372100 +2 2.51090000 9.02149300 +Br P +2 4.28140000 53.87586400 +2 2.14070000 4.62940200 +Br D +2 2.88000000 20.84967700 +2 1.44000000 2.96544400 +Br F +2 2.72070000 -8.16149300 +Br ul +2 1.00000000 0.00000000 +Kr nelec 28 +Kr S +2 5.87771800 73.91569390 +2 3.08462200 16.16825080 +Kr P +2 5.16411000 58.51769101 +2 2.35830200 8.25910073 +Kr D +2 3.21536200 33.45822776 +2 1.28500800 0.67725331 +Kr F +2 4.08286900 -15.15869859 +2 1.19396000 -0.17408825 +Kr G +2 3.18077500 -6.83315877 +Kr ul +2 1.00000000 0.00000000 +Rb nelec 28 +Rb S +2 2.29809999 50.81394196 +2 2.66269994 -162.04731750 +2 3.50929999 313.81082153 +2 4.96980000 -309.75451660 +2 6.94840002 216.07606506 +1 17.70389938 20.86063194 +0 25.66029930 3.36120105 +Rb P +2 2.02160001 45.41232300 +2 2.33979988 -145.47238159 +2 3.07839990 283.18420410 +2 4.37570000 -305.10214233 +2 6.15859985 207.65396118 +1 16.77890015 12.15985012 +0 16.61680031 5.39989424 +Rb D +2 1.23380005 31.68275070 +2 1.41939998 -100.62529755 +2 1.83389997 186.52160645 +2 2.54550004 -239.76072693 +2 3.47009993 170.19052124 +1 10.62069988 9.91743755 +0 9.28610039 7.41062880 +Rb ul +2 1.96459997 -1.04400003 +2 5.02349997 -12.26854706 +2 12.31190014 -40.49360657 +2 39.43920136 -92.10794830 +1 116.43070221 -20.25083160 +Sr nelec 28 +Sr S +2 2.44670010 53.58986664 +2 2.86780000 -172.08218384 +2 3.86610007 345.58593750 +2 5.66069984 -351.22171021 +2 8.30790043 257.34286499 +1 23.49519920 12.91709232 +0 21.03380013 6.33449411 +Sr P +2 2.20950007 49.15122604 +2 2.58439994 -158.82582092 +2 3.46840000 320.08462524 +2 5.06430006 -349.10769653 +2 7.37960005 239.32991028 +1 22.22710037 11.86419868 +0 19.91810036 5.33859777 +Sr D +2 1.40730000 32.67572403 +2 1.62419999 -103.22133636 +2 2.11750007 193.27650452 +2 2.97239995 -248.63302612 +2 4.11800003 183.39566040 +1 12.52390003 10.14631939 +0 10.88269997 7.38135815 +Sr ul +2 2.23399997 -1.16187501 +2 5.67100000 -13.37399960 +2 13.88179970 -43.23659134 +2 44.81060028 -100.09903717 +1 132.90260315 -20.51813126 +Y nelec 28 +Y S +2 7.48804900 135.15384400 +2 3.74402500 15.55244100 +Y P +2 6.44537700 87.78499200 +2 3.22268900 11.56406600 +Y D +2 4.65844700 29.70100100 +2 2.32922400 5.53996800 +Y F +2 6.58421200 -19.12219800 +2 3.29210600 -2.43637500 +Y ul +2 1.00000000 0.00000000 +Zr nelec 28 +Zr S +2 8.20000000 150.26759100 +2 4.08972800 18.97621600 +Zr P +2 7.11000000 99.62212400 +2 3.59679800 14.16873300 +Zr D +2 5.35000000 35.04512400 +2 2.49182100 6.11125900 +Zr F +2 7.54000000 -21.09377600 +2 3.77000000 -3.08069400 +Zr ul +2 1.00000000 0.00000000 +Nb nelec 28 +Nb S +2 8.90000000 165.17914300 +2 4.43000000 21.99297400 +Nb P +2 7.77000000 111.79441400 +2 3.96000000 16.63348300 +Nb D +2 6.05000000 38.11224900 +2 2.84000000 8.03916700 +Nb F +2 8.49000000 -22.92955000 +2 4.25000000 -3.66631000 +Nb ul +2 1.00000000 0.00000000 +Mo nelec 28 +Mo S +2 9.71459400 180.10310800 +2 4.68050000 24.99722800 +Mo P +2 8.14213700 123.77275200 +2 4.62598600 19.53022800 +Mo D +2 6.61841500 48.37502200 
+2 3.24875200 8.89205300 +Mo F +2 9.45000000 -24.80517700 +2 4.72000000 -4.15378200 +Mo ul +2 1.00000000 0.00000000 +Tc nelec 28 +Tc S +2 10.42234600 195.15916600 +2 5.03651600 28.09260300 +Tc P +2 8.95044900 135.28456600 +2 4.85443900 21.80650400 +Tc D +2 6.94569700 54.32972900 +2 3.97058500 11.15506800 +Tc F +2 10.40000000 -26.56244700 +2 5.20000000 -4.58568100 +Tc ul +2 1.00000000 0.00000000 +Ru nelec 28 +Ru S +2 11.10526900 209.82297100 +2 5.41474500 30.65472600 +Ru P +2 9.77127100 146.33618200 +2 5.07399100 24.12787700 +Ru D +2 7.67142300 67.51589700 +2 4.13656500 9.87010400 +Ru F +2 11.36000000 -28.34061600 +2 5.68000000 -4.94462900 +Ru ul +2 1.00000000 0.00000000 +Rh nelec 28 +Rh S +2 11.72000000 225.34775400 +2 5.82000000 32.82318900 +Rh P +2 10.42000000 158.70941200 +2 5.45000000 26.44410000 +Rh D +2 8.82000000 62.75862600 +2 3.87000000 10.97871900 +Rh F +2 12.31000000 -30.09345600 +2 6.16000000 -5.21848200 +Rh ul +2 1.00000000 0.00000000 +Pd nelec 28 +Pd S +2 12.43000000 240.22904000 +2 6.17075900 35.17194300 +Pd P +2 11.08000000 170.41727600 +2 5.82955400 28.47213300 +Pd D +2 9.51000000 69.01384500 +2 4.13978100 11.75086200 +Pd F +2 13.27000000 -31.92955400 +2 6.63000000 -5.39821700 +Pd ul +2 1.00000000 0.00000000 +Ag nelec 28 +Ag S +2 13.13000000 255.13936500 +2 6.51000000 36.86612200 +Ag P +2 11.74000000 182.18186900 +2 6.20000000 30.35775100 +Ag D +2 10.21000000 73.71926100 +2 4.38000000 12.50211700 +Ag F +2 14.22000000 -33.68992000 +2 7.11000000 -5.53112000 +Ag ul +2 1.00000000 0.00000000 +Cd nelec 28 +Cd S +2 13.83586900 270.00948300 +2 6.85727000 38.76730800 +Cd P +2 12.40497100 193.82962900 +2 6.56779900 31.89652500 +Cd D +2 10.89692500 79.19364700 +2 4.64116500 13.23082700 +Cd F +2 15.18479600 -35.47662600 +2 7.59239800 -5.61767700 +Cd ul +2 1.00000000 0.00000000 +In nelec 46 +In S +2 1.43509100 29.16521900 +2 0.69580500 -4.19080600 +In P +2 1.44083200 36.99054200 +2 0.70139200 -3.36582000 +In D +2 0.96123600 20.00053100 +In F +2 0.88436900 -6.01909200 +In ul +2 1.00000000 0.00000000 +Sn nelec 46 +Sn S +2 1.96972500 67.92534700 +2 0.97237500 -7.47854600 +Sn P +2 1.99921000 56.60288000 +2 0.99904200 -2.16177600 +Sn D +2 0.50036100 2.57633600 +Sn F +2 1.23088000 -10.10925300 +Sn ul +2 1.00000000 0.00000000 +Sb nelec 46 +Sb S +2 2.49109100 68.42793800 +2 1.34157500 -4.39863100 +Sb P +2 2.14386400 63.96546900 +2 0.58550300 -0.57872600 +Sb D +2 0.79540100 7.80366100 +Sb F +2 1.60925100 -14.51768700 +Sb ul +2 1.00000000 0.00000000 +Te nelec 46 +Te S +2 2.92379400 50.08380500 +2 1.15275400 1.96814000 +Te P +2 2.60308600 119.82070200 +2 0.98544800 -2.03904800 +Te D +2 1.43501900 37.75721400 +Te F +2 1.93927000 -17.86464100 +Te ul +2 1.00000000 0.00000000 +I nelec 46 +I S +2 3.51120000 83.11386300 +2 1.75560000 5.20187600 +I P +2 2.96880000 82.81110900 +2 1.48440000 3.37968200 +I D +2 1.90660000 10.30427700 +2 0.95330000 7.58803200 +I F +2 2.30750000 -21.47793600 +I ul +2 1.00000000 0.00000000 +Xe nelec 46 +Xe S +2 3.94026300 122.76382934 +2 2.27726400 8.30885115 +Xe P +2 3.02837300 68.82300437 +2 1.39431900 3.64674223 +Xe D +2 2.12260500 23.65207854 +2 0.79866900 3.25844113 +Xe F +2 6.16436000 -47.70319876 +2 1.54237400 -6.54113991 +Xe G +2 1.84789200 -7.10585060 +Xe ul +2 1.00000000 0.00000000 +Cs nelec 46 +Cs S +2 1.38530004 42.85466766 +2 1.63240004 -138.00901794 +2 2.20580006 275.99960327 +2 3.22149992 -280.45663452 +2 4.64960003 199.82038879 +1 15.15250015 27.73096657 +0 19.00049973 3.76870608 +Cs P +2 1.25950003 48.66250992 +2 1.44169998 -145.70526123 +2 1.87639999 
264.46368408 +2 2.65750003 -279.85159302 +2 3.63870001 184.35585022 +1 10.65320015 23.30001831 +0 14.68060017 5.76792908 +Cs D +2 0.76810002 34.86072540 +2 0.87459999 -106.79302979 +2 1.10769999 188.23532104 +2 1.48230004 -217.63992310 +2 1.92019999 137.74559021 +1 6.21829987 34.42418671 +0 17.38809967 7.19875193 +Cs ul +2 0.93849999 -0.78916699 +2 2.31629992 -8.42115784 +2 6.00729990 -30.98544312 +2 20.37969971 -95.03477478 +1 59.32889938 -30.07960320 +Ba nelec 46 +Ba S +2 1.51549995 52.18550110 +2 1.80079997 -166.64633179 +2 2.46210003 336.98910522 +2 3.64910007 -346.60510254 +2 5.34859991 229.66429138 +1 17.15789986 20.49417496 +0 16.02389908 6.64949989 +Ba P +2 1.35119998 61.51347351 +2 1.55149996 -171.17402649 +2 2.06870008 303.61636353 +2 3.00740004 -324.12673950 +2 4.23050022 210.71342468 +1 14.21850014 19.11876488 +0 13.10439968 5.84502220 +Ba D +2 0.89740002 34.92659378 +2 1.02090001 -109.23178864 +2 1.28859997 196.23254395 +2 1.73850000 -224.63766479 +2 2.28509998 146.97143555 +1 7.33769989 37.01747894 +0 20.39410019 7.09744883 +Ba ul +2 0.97610003 -0.88013703 +2 2.66910005 -10.01861763 +2 7.10550022 -35.70346451 +2 24.84989929 -114.57715607 +1 75.09850311 -30.99500656 +La nelec 46 +La S +2 3.30990000 91.93217700 +2 1.65500000 -3.78876400 +La P +2 2.83680000 63.75948600 +2 1.41840000 -0.64795800 +La D +2 2.02130000 36.11617300 +2 1.01070000 0.21911400 +La F +2 4.02860000 -36.01001600 +La ul +2 1.00000000 0.00000000 +Ce nelec 46 +Ce S +2 1.89370130 -255.56238300 +2 1.97914860 307.31392800 +0 10.74296970 10.66990170 +Ce P +0 7.75592980 12.22921090 +2 1.81564130 124.94246600 +2 1.67164720 -84.59998680 +Ce D +2 1.70642050 24.94467550 +0 6.48933740 10.28614400 +Ce ul +1 9.20747930 -15.34875610 +1 1.86730120 -5.84323950 +Pr nelec 46 +Pr S +2 2.12955580 -223.64398200 +2 2.22746550 278.13451500 +0 7.28371960 12.62107180 +Pr P +0 7.80928040 12.52563900 +2 1.92977860 121.95278200 +2 1.76478920 -79.40475120 +Pr D +2 1.79797430 26.19266520 +0 6.54805150 9.87391210 +Pr ul +1 9.82972860 -15.44352190 +1 1.98070080 -5.89611280 +Nd nelec 46 +Nd S +2 2.22478560 -219.47084000 +2 2.34459590 280.52892800 +0 11.85798300 11.76232500 +Nd P +0 9.44306450 11.43612170 +2 1.98949080 120.33535600 +2 1.79805450 -75.78196430 +Nd D +2 1.88855050 27.38307130 +0 6.64366550 9.61741050 +Nd ul +1 10.42978840 -15.48300440 +1 2.09229130 -5.94833370 +Pm nelec 46 +Pm S +2 2.27365310 -215.58178700 +2 2.40502950 277.53404200 +0 10.76473100 11.55980120 +Pm P +0 9.85397700 10.33666460 +2 2.07626890 121.46769300 +2 1.86493490 -74.87664400 +Pm D +2 1.97798220 28.48611040 +0 6.67329420 9.28090580 +Pm ul +1 11.10670840 -15.58675570 +1 2.21044790 -6.01203460 +Sm nelec 46 +Sm S +2 2.37776610 -206.06726600 +2 2.52471590 270.34259800 +0 10.26438980 11.44490440 +Sm P +0 10.00542360 10.69510070 +2 2.20124120 121.87722800 +2 1.96809280 -72.78839030 +Sm D +2 2.06720440 29.52394000 +0 6.71288900 9.03717700 +Sm ul +1 11.75812390 -15.65862010 +1 2.32808300 -6.05376500 +Eu nelec 46 +Eu S +2 2.50382840 -196.63773800 +2 2.66926410 264.10344900 +0 10.36738240 11.80698020 +Eu P +0 10.13418310 10.93806010 +2 2.32439940 128.16026900 +2 2.08337490 -76.51532760 +Eu D +2 2.15438940 30.46732640 +0 6.69379730 8.74122690 +Eu ul +1 12.41835390 -15.72447990 +1 2.44755850 -6.09107370 +Gd nelec 46 +Gd S +2 2.70226520 -137.90212500 +2 2.93347740 208.96454000 +0 9.85080960 16.85869360 +Gd P +0 10.25620120 11.13083430 +2 2.45273210 131.65132100 +2 2.19693430 -77.44430080 +Gd D +2 2.24716920 31.46641180 +0 6.43064370 8.34507290 +Gd ul +1 13.09509080 
-15.78672600 +1 2.56973860 -6.12777810 +Tb nelec 46 +Tb S +2 2.61817350 -140.10322400 +2 2.89041800 215.28993500 +0 17.04897130 11.68123370 +Tb P +0 10.12533140 11.12162130 +2 2.56570380 139.65713100 +2 2.30834840 -83.30982210 +Tb D +2 2.32603330 32.12176790 +0 6.59311800 8.22336260 +Tb ul +1 13.78498800 -15.84080810 +1 2.69386690 -6.16678470 +Dy nelec 46 +Dy S +2 2.73908100 -130.00434800 +2 3.04934710 208.63602400 +0 16.57473300 11.48546660 +Dy P +0 10.69436010 11.62238130 +2 2.74055830 125.41578200 +2 2.41123460 -65.82038280 +Dy D +2 2.40961370 32.79772690 +0 6.49588250 7.98023140 +Dy ul +1 14.49058170 -15.89723700 +1 2.82105130 -6.19939400 +Ho nelec 46 +Ho S +2 2.84745680 -110.58301100 +2 3.22421600 192.70173300 +0 16.15618810 11.35468650 +Ho P +0 9.92608330 10.84274330 +2 2.86230720 116.52006000 +2 2.47244920 -55.66109800 +Ho D +2 2.49115510 33.34249060 +0 6.35100910 7.73004920 +Ho ul +1 15.21414080 -15.95165880 +1 2.95124920 -6.23162570 +Er nelec 46 +Er S +2 3.03618530 -150.36700200 +2 3.35253480 236.19775600 +0 16.03924000 11.41646110 +Er P +0 11.04634630 11.89253580 +2 3.01799110 130.54350100 +2 2.64727180 -65.69229880 +Er D +2 2.57008760 33.73180760 +0 6.16564070 7.48868910 +Er ul +1 15.95965310 -16.00435820 +1 3.08505820 -6.26509470 +Tm nelec 46 +Tm S +2 3.18338300 -143.24595300 +2 3.53531470 233.19553800 +0 16.16013920 11.61094440 +Tm P +0 2.46378240 7.39619030 +2 2.72194360 119.57242900 +2 2.49552880 -84.01744530 +Tm D +2 2.64361640 33.84116700 +0 5.89426350 7.23350090 +Tm ul +1 16.72435540 -16.05536170 +1 3.22216200 -6.29832940 +Yb nelec 46 +Yb S +2 3.29988440 -106.50639900 +2 3.77177090 201.14955200 +0 16.63449170 12.01001670 +Yb P +0 2.65544200 7.59659140 +2 2.83854580 119.64934700 +2 2.59880230 -82.70469640 +Yb D +2 2.71026500 34.17132690 +0 6.26424190 7.27486230 +Yb ul +1 17.51231900 -16.10497510 +1 3.36313060 -6.33277960 +Lu nelec 46 +Lu S +2 3.34224800 -71.31717050 +2 4.00509410 169.40139300 +0 16.18095210 11.88611540 +Lu P +0 2.72110670 7.71934060 +2 2.98200450 116.10736300 +2 2.71480080 -78.17862840 +Lu D +2 2.77571170 33.91192520 +0 5.89380130 7.01871010 +Lu ul +1 18.32043650 -16.15200380 +1 3.50797850 -6.36766820 +Hf nelec 60 +Hf S +2 14.76995900 1499.28471100 +2 7.38497900 40.28210100 +Hf P +2 9.84949000 397.73300500 +2 4.92474500 19.31640600 +Hf D +2 6.09675600 101.32980500 +2 3.04837800 5.87343800 +Hf F +2 1.78577000 10.04672300 +Hf G +2 2.63240000 -9.55824400 +Hf ul +2 1.00000000 0.00000000 +Ta nelec 60 +Ta S +2 14.54640800 1345.88064700 +2 7.27320400 36.76680600 +Ta P +2 9.93556500 378.42530100 +2 4.96778200 22.29309100 +Ta D +2 6.34737700 104.88395600 +2 3.17368800 8.75584800 +Ta F +2 2.01788100 12.01796100 +Ta G +2 3.04033000 -11.72893300 +Ta ul +2 1.00000000 0.00000000 +W nelec 60 +W S +2 14.32285600 1192.39588200 +2 7.16142800 32.52293300 +W P +2 10.02164100 359.03196700 +2 5.01082000 24.03038000 +W D +2 6.59799700 108.30134900 +2 3.29899900 10.98252800 +W F +2 2.25888800 14.15257900 +W G +2 3.46411000 -14.05643500 +W ul +2 1.00000000 0.00000000 +Re nelec 60 +Re S +2 14.09930500 1038.95157200 +2 7.04965300 29.56173800 +Re P +2 10.10771700 339.54351000 +2 5.05385800 24.91369600 +Re D +2 6.84861800 111.69965300 +2 3.42430900 12.62432900 +Re F +2 2.50865100 16.44985200 +Re G +2 3.90124500 -16.50112000 +Re ul +2 1.00000000 0.00000000 +Os nelec 60 +Os S +2 13.87575400 885.40571900 +2 6.93787700 25.96704000 +Os P +2 10.19379300 320.08390200 +2 5.09689600 26.14876500 +Os D +2 7.09923800 115.04484300 +2 3.54961900 13.62257500 +Os F +2 2.76707500 18.90945700 +Os G +2 
4.34990500 -19.02759500 +Os ul +2 1.00000000 0.00000000 +Ir nelec 60 +Ir S +2 13.65220300 732.26920000 +2 6.82610100 26.48472100 +Ir P +2 10.27986800 299.48947400 +2 5.13993400 26.46623400 +Ir D +2 7.34985900 124.45759500 +2 3.67492900 14.03599500 +Ir F +2 3.03407200 21.53103100 +Ir G +2 4.80885700 -21.60759700 +Ir ul +2 1.00000000 0.00000000 +Pt nelec 60 +Pt S +2 13.42865100 579.22386100 +2 6.71432600 29.66949100 +Pt P +2 10.36594400 280.86077400 +2 5.18297200 26.74538200 +Pt D +2 7.60047900 120.39644400 +2 3.80024000 15.81092100 +Pt F +2 3.30956900 24.31437600 +Pt G +2 5.27728900 -24.21867500 +Pt ul +2 1.00000000 0.00000000 +Au nelec 60 +Au S +2 13.20510000 426.84667900 +2 6.60255000 37.00708300 +Au P +2 10.45202000 261.19958000 +2 5.22601000 26.96249600 +Au D +2 7.85110000 124.79066600 +2 3.92555000 16.30072600 +Au F +2 4.78982000 30.49008900 +2 2.39491000 5.17107400 +Au ul +2 1.00000000 0.00000000 +Hg nelec 60 +Hg S +2 12.98154900 275.73721200 +2 6.49077400 49.08921200 +Hg P +2 10.53809600 241.54007400 +2 5.26904800 27.39659100 +Hg D +2 8.10172100 127.86700800 +2 4.05086000 16.60831200 +Hg F +2 3.88579100 30.36499600 +Hg G +2 6.24095500 -29.47311800 +Hg ul +2 1.00000000 0.00000000 +Tl nelec 78 +Tl S +2 0.32623800 -1.01649800 +2 1.97754100 51.70795900 +2 10.00000000 73.18668300 +Tl P +2 0.54306300 -2.96267300 +2 1.03214000 19.73043100 +Tl D +2 0.35481700 2.77269000 +2 0.70963300 -3.97943900 +Tl F +2 0.68915600 -4.42678600 +Tl G +2 0.82061700 -12.27054000 +Tl ul +2 1.00000000 0.00000000 +Pb nelec 78 +Pb S +2 0.52916100 -1.87334200 +2 1.45672700 20.86079700 +2 9.99991100 97.58795500 +Pb P +2 0.67811900 -7.76820900 +2 1.24901300 51.71925400 +Pb D +2 0.30744600 1.30076000 +2 0.74493000 2.64082200 +Pb F +2 0.84869900 -5.70605600 +Pb G +2 0.99994100 -7.48418400 +Pb ul +2 1.00000000 0.00000000 +Bi nelec 78 +Bi S +2 0.16115200 -0.16198800 +2 1.50983500 14.03169000 +2 10.00000000 122.04740100 +Bi P +2 0.76049000 -6.18852600 +2 1.42641500 51.04586800 +Bi D +2 0.78022600 20.53580400 +2 0.26007500 -0.13619600 +Bi F +2 0.97360800 -6.41422600 +Bi G +2 1.08819500 -6.65606400 +Bi ul +2 1.00000000 0.00000000 +Po nelec 78 +Po S +2 0.92238600 -4.15930400 +2 1.78191500 33.83035400 +2 10.00000000 146.33910100 +Po P +2 0.72429100 -4.12531100 +2 1.36386000 35.00707800 +Po D +2 0.47697900 1.20651800 +2 0.95395700 13.35612500 +Po F +2 1.07545400 -6.77517400 +Po G +2 1.12209600 -5.51544100 +Po ul +2 1.00000000 0.00000000 +At nelec 78 +At S +2 0.92238600 -5.52846100 +2 1.78191500 39.56886900 +2 10.00000000 170.71138600 +At P +2 0.72429100 -2.29538700 +2 1.36386000 25.49292000 +At D +2 0.63597200 4.86510700 +2 1.27194300 14.57941300 +At F +2 1.15410800 -6.85786700 +At G +2 1.34511500 -7.61303900 +At ul +2 1.00000000 0.00000000 +Rn nelec 78 +Rn S +2 0.92238600 -5.01900500 +2 1.78191500 37.03679000 +2 10.80460100 195.10330800 +Rn P +2 0.72429100 -1.96648100 +2 1.36386000 23.46405900 +Rn D +2 0.76940000 7.48345700 +2 1.53880000 9.36190000 +Rn F +2 1.21389700 -6.76315000 +Rn G +2 1.57646900 -9.91566200 +Rn ul +2 1.00000000 0.00000000 +END diff --git a/gpu4pyscf/drivers/h2o.xyz b/gpu4pyscf/drivers/h2o.xyz index 8c50538d..6072e217 100644 --- a/gpu4pyscf/drivers/h2o.xyz +++ b/gpu4pyscf/drivers/h2o.xyz @@ -1,5 +1,5 @@ 3 -O 99.814000000 100.835000000 101.232000000 -H 99.329200000 99.976800000 101.063000000 -H 99.151600000 101.561000000 101.414000000 +O 0.0000000000 -0.0000000000 0.1174000000 +H -0.7570000000 -0.0000000000 -0.4696000000 +H 0.7570000000 0.0000000000 -0.4696000000 diff --git 
a/gpu4pyscf/drivers/opt_3c_driver.py b/gpu4pyscf/drivers/opt_3c_driver.py new file mode 100644 index 00000000..20a97aa7 --- /dev/null +++ b/gpu4pyscf/drivers/opt_3c_driver.py @@ -0,0 +1,182 @@ +# Copyright 2021-2024 The PySCF Developers. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +################################################################### +# This is a customized driver for three composite methods only +# It only works for b97-3c, r2scan-3c, and wb97x-3c +################################################################### + +import os +import json +import pyscf +import argparse +import tempfile +import shutil +import h5py +from types import MethodType +from pyscf import lib, gto +from pyscf import dft, scf +from pyscf.geomopt.geometric_solver import kernel + +from gpu4pyscf.drivers.dft_3c_driver import ( + parse_3c, gen_disp_fun, gen_disp_grad_fun) + +def opt_mol(mol_name, config, constraints, charge=None, spin=0): + xc = config.get('xc', 'b3lyp') + verbose = config.get('verbose', 4) + scf_conv_tol = config.get('scf_conv_tol', 1e-10) + with_df = config.get('with_df', True) + auxbasis = config.get('auxbasis', None) + with_gpu = config.get('with_gpu', True) + with_solvent = config.get('with_solvent', False) + maxsteps = config.get('maxsteps', 50) + convergence_set = config.get('convergence_set', 'GAU') + + default_solvent = {'method': 'iefpcm', 'eps': 78.3553, 'solvent': 'water'} + solvent = config.get('solvent', default_solvent) + + # I/O + fp = tempfile.TemporaryDirectory() + local_dir = f'{fp.name}/' + logfile = f'{mol_name[:-4]}_pyscf.log' + + shutil.copyfile(config['input_dir']+mol_name, local_dir+mol_name) + if constraints is not None: + shutil.copyfile(config['input_dir']+constraints, local_dir+constraints) + + pyscf_xc, nlc, basis, ecp, (xc_disp, disp), xc_gcp = parse_3c(xc) + + lib.num_threads(8) + mol = pyscf.M( + atom=local_dir+mol_name, + basis=basis, + ecp=ecp, + max_memory=32000, + verbose=verbose, + charge=charge, + spin=spin, + output=f'{local_dir}/{logfile}') + mol.build() + + mf = dft.KS(mol, xc=pyscf_xc) + mf.grids.atom_grid = (99,590) + if mf._numint.libxc.is_nlc(mf.xc): + mf.nlcgrids.atom_grid = (50,194) + mf.disp = disp + if with_df: + pyscf_auxbasis = auxbasis + if auxbasis == "RIJK-def2-tzvp": + pyscf_auxbasis = 'def2-tzvp-jkfit' + mf = mf.density_fit(auxbasis=pyscf_auxbasis) + if with_gpu: + mf = mf.to_gpu() + + #### Changes for 3C methods ##### + # Set up dispersion correction and GCP + mf.nlc = nlc + mf.get_dispersion = MethodType(gen_disp_fun(xc_disp, xc_gcp), mf) + mf.do_disp = lambda: True + ################################# + + mf.chkfile = None + + if with_solvent: + if solvent['method'].endswith(('PCM', 'pcm')): + mf = mf.PCM() + mf.with_solvent.lebedev_order = 29 + mf.with_solvent.method = solvent['method'].replace('PCM','-PCM') + mf.with_solvent.eps = solvent['eps'] + elif solvent['method'].endswith(('smd', 'SMD')): + mf = mf.SMD() +
mf.with_solvent.lebedev_order = 29 + mf.with_solvent.method = 'SMD' + mf.with_solvent.solvent = solvent['solvent'] + else: + raise NotImplementedError + + mf.direct_scf_tol = 1e-14 + mf.chkfile = None + mf.conv_tol = scf_conv_tol + + history = [] + def callback(envs): + result = { + 'energy': envs['energy'], + 'gradients': envs['gradients'], + 'coords': envs['coords'].tolist(), + 'e1': mf.scf_summary.get('e1', 0.0), + 'e_coul': mf.scf_summary.get('coul', 0.0), + 'e_xc': mf.scf_summary.get('exc', 0.0), + 'e_disp': mf.scf_summary.get('dispersion', 0.0) + } + history.append(result) + + grad_scanner = mf.nuc_grad_method().as_scanner() + get_disp = gen_disp_grad_fun(xc_disp, xc_gcp) + grad_scanner.get_dispersion = MethodType(get_disp, grad_scanner) + + geometric_log = f'{mol_name[:-4]}_geometric.log' + import sys + # PySCF forwards geometric log to sys.stderr + with open(f'{local_dir}/{geometric_log}', 'w') as log_file: + sys.stderr = log_file + conv, mol_eq = kernel( + grad_scanner, + maxsteps=maxsteps, + callback=callback, + convergence_set=convergence_set, + constraints=constraints) + sys.stderr = sys.__stderr__ + + # copy the files to destination folder + output_dir = config['output_dir'] + isExist = os.path.exists(output_dir) + if not isExist: + os.makedirs(output_dir) + optimized_xyz = f'{mol_name[:-4]}_opt.xyz' + hist_file = f'{mol_name[:-4]}_hist.h5' + mol_eq.tofile(f'{local_dir}/{optimized_xyz}', format='xyz') + + with h5py.File(f'{local_dir}/{hist_file}', 'w') as h5f: + #json.dump(history, f) + for step, info in enumerate(history): + group = h5f.create_group(f'step_{step}') + for key, array in info.items(): + group.create_dataset(key, data=array) + + shutil.copyfile(f'{local_dir}/{optimized_xyz}', f'{output_dir}/{optimized_xyz}') + shutil.copyfile(f'{local_dir}/{hist_file}', f'{output_dir}/{hist_file}') + shutil.copyfile(f'{local_dir}/{logfile}', f'{output_dir}/{logfile}') + shutil.copyfile(f'{local_dir}/{geometric_log}', f'{output_dir}/{geometric_log}') + if conv: + with open(f'{output_dir}/{mol_name[:-4]}_success.txt', 'w') as file: + file.write("Geometry optimization converged\n") + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='Run DFT with GPU4PySCF for molecules') + parser.add_argument("--config", type=str, default='example.json') + parser.add_argument("--charge", type=int, default=None) + parser.add_argument("--spin", type=int, default=0) + args = parser.parse_args() + + with open(args.config) as f: + config = json.load(f) + if isinstance(config, list): + config = config[0] + for i, mol_name in enumerate(config['molecules']): + constraints = None + if 'constraints' in config and config['constraints']: + assert len(config['constraints']) == len(config['molecules']) + constraints = config['constraints'][i] + opt_mol(mol_name, config, constraints, charge=args.charge, spin=args.spin) diff --git a/gpu4pyscf/drivers/opt_b973c_sample.json b/gpu4pyscf/drivers/opt_b973c_sample.json new file mode 100644 index 00000000..c1e12ef3 --- /dev/null +++ b/gpu4pyscf/drivers/opt_b973c_sample.json @@ -0,0 +1,22 @@ +[{ + "input_dir": "./", + "output_dir": "./", + "molecules": [ + "ethane.xyz", + "ethane.xyz" + ], + "constraints": [ + "constraints.txt", + "constraints.txt" + ], + "xc": "b973c", + "auxbasis": "def2-universal-JFIT", + "verbose": 4, + "with_solvent": false, + "solvent": { + "eps": 78.3553, + "method": "CPCM" + }, + "with_gpu": true, + "with_df": true +}] diff --git a/gpu4pyscf/drivers/opt_r2scan3c_sample.json b/gpu4pyscf/drivers/opt_r2scan3c_sample.json new 
file mode 100644 index 00000000..d793f65a --- /dev/null +++ b/gpu4pyscf/drivers/opt_r2scan3c_sample.json @@ -0,0 +1,22 @@ +[{ + "input_dir": "./", + "output_dir": "./", + "molecules": [ + "ethane.xyz", + "ethane.xyz" + ], + "constraints": [ + "constraints.txt", + "constraints.txt" + ], + "xc": "r2scan3c", + "auxbasis": "def2-universal-JFIT", + "verbose": 4, + "with_solvent": false, + "solvent": { + "eps": 78.3553, + "method": "CPCM" + }, + "with_gpu": true, + "with_df": true +}] diff --git a/gpu4pyscf/drivers/opt_wb97x3c_sample.json b/gpu4pyscf/drivers/opt_wb97x3c_sample.json new file mode 100644 index 00000000..4f4ecb5b --- /dev/null +++ b/gpu4pyscf/drivers/opt_wb97x3c_sample.json @@ -0,0 +1,21 @@ +[{ + "input_dir": "./", + "output_dir": "./", + "molecules": [ + "ethane.xyz", + "ethane.xyz" + ], + "constraints": [ + "constraints.txt", + "constraints.txt" + ], + "xc": "wb97x3c", + "verbose": 4, + "with_solvent": false, + "solvent": { + "eps": 78.3553, + "method": "CPCM" + }, + "with_gpu": true, + "with_df": true +}] diff --git a/gpu4pyscf/grad/rhf.py b/gpu4pyscf/grad/rhf.py index dd374cc3..94399043 100644 --- a/gpu4pyscf/grad/rhf.py +++ b/gpu4pyscf/grad/rhf.py @@ -27,7 +27,7 @@ from gpu4pyscf.scf.hf import KohnShamDFT from gpu4pyscf.lib.cupy_helper import tag_array, contract, condense, sandwich_dot, reduce_to_device from gpu4pyscf.__config__ import props as gpu_specs -from gpu4pyscf.__config__ import _streams, _num_devices +from gpu4pyscf.__config__ import _streams, num_devices from gpu4pyscf.df import int3c2e #TODO: move int3c2e to out of df from gpu4pyscf.lib import logger from gpu4pyscf.scf import jk @@ -127,7 +127,7 @@ def _jk_energy_per_atom(mol, dm, vhfopt=None, if vhfopt is None: # Small group size for load balance group_size = None - if _num_devices > 1: + if num_devices > 1: group_size = jk.GROUP_SIZE vhfopt = _VHFOpt(mol).build(group_size=group_size) @@ -156,13 +156,13 @@ def _jk_energy_per_atom(mol, dm, vhfopt=None, tasks.append((i,j,k,l)) tasks = np.array(tasks) task_list = [] - for device_id in range(_num_devices): - task_list.append(tasks[device_id::_num_devices]) + for device_id in range(num_devices): + task_list.append(tasks[device_id::num_devices]) cp.cuda.get_current_stream().synchronize() futures = [] - with ThreadPoolExecutor(max_workers=_num_devices) as executor: - for device_id in range(_num_devices): + with ThreadPoolExecutor(max_workers=num_devices) as executor: + for device_id in range(num_devices): future = executor.submit( _ejk_ip1_task, mol, dms, vhfopt, task_list[device_id], diff --git a/gpu4pyscf/grad/rks.py b/gpu4pyscf/grad/rks.py index e0d535a4..25f43ef1 100644 --- a/gpu4pyscf/grad/rks.py +++ b/gpu4pyscf/grad/rks.py @@ -28,7 +28,7 @@ from gpu4pyscf.lib.cupy_helper import ( contract, get_avail_mem, add_sparse, tag_array, sandwich_dot, reduce_to_device) from gpu4pyscf.lib import logger -from gpu4pyscf.__config__ import _streams, _num_devices +from gpu4pyscf.__config__ import _streams, num_devices from pyscf import __config__ MIN_BLK_SIZE = getattr(__config__, 'min_grid_blksize', 128*128) @@ -223,8 +223,8 @@ def get_vxc(ni, mol, grids, xc_code, dms, relativity=0, hermi=1, futures = [] cupy.cuda.get_current_stream().synchronize() - with ThreadPoolExecutor(max_workers=_num_devices) as executor: - for device_id in range(_num_devices): + with ThreadPoolExecutor(max_workers=num_devices) as executor: + for device_id in range(num_devices): future = executor.submit( _get_vxc_task, ni, mol, grids, xc_code, dms, mo_coeff, mo_occ, diff --git a/gpu4pyscf/grad/uks.py 
b/gpu4pyscf/grad/uks.py index 90582d73..50e8fd05 100644 --- a/gpu4pyscf/grad/uks.py +++ b/gpu4pyscf/grad/uks.py @@ -29,7 +29,7 @@ from gpu4pyscf.lib.cupy_helper import ( contract, get_avail_mem, add_sparse, tag_array, reduce_to_device) from gpu4pyscf.lib import logger -from gpu4pyscf.__config__ import _streams, _num_devices +from gpu4pyscf.__config__ import _streams, num_devices from gpu4pyscf import __config__ MIN_BLK_SIZE = getattr(__config__, 'min_grid_blksize', 128*128) @@ -230,8 +230,8 @@ def get_vxc(ni, mol, grids, xc_code, dms, relativity=0, hermi=1, futures = [] cupy.cuda.get_current_stream().synchronize() - with ThreadPoolExecutor(max_workers=_num_devices) as executor: - for device_id in range(_num_devices): + with ThreadPoolExecutor(max_workers=num_devices) as executor: + for device_id in range(num_devices): future = executor.submit( _get_vxc_task, ni, mol, grids, xc_code, dms, mo_coeff, mo_occ, diff --git a/gpu4pyscf/gto/int3c1e.py b/gpu4pyscf/gto/int3c1e.py index 8e6ce88c..e445c458 100644 --- a/gpu4pyscf/gto/int3c1e.py +++ b/gpu4pyscf/gto/int3c1e.py @@ -24,7 +24,7 @@ from gpu4pyscf.scf.int4c2e import BasisProdCache from gpu4pyscf.df.int3c2e import sort_mol, _split_l_ctr_groups, get_pairing from gpu4pyscf.gto.mole import basis_seg_contraction -from gpu4pyscf.__config__ import _num_devices, _streams +from gpu4pyscf.__config__ import num_devices, _streams BLKSIZE = 128 @@ -132,7 +132,7 @@ def get_n_hermite_density_of_angular_pair(l): self.density_offset = np.append(0, np.cumsum(n_density_per_angular_pair)).astype(np.int32) self._bpcache = {} - for n in range(_num_devices): + for n in range(num_devices): with cp.cuda.Device(n), _streams[n]: bpcache = ctypes.POINTER(BasisProdCache)() scale_shellpair_diag = 1.0 diff --git a/gpu4pyscf/gto/int3c1e_ipip.py b/gpu4pyscf/gto/int3c1e_ipip.py new file mode 100644 index 00000000..b86abf46 --- /dev/null +++ b/gpu4pyscf/gto/int3c1e_ipip.py @@ -0,0 +1,410 @@ +# Copyright 2021-2024 The PySCF Developers. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import ctypes +import cupy as cp +import numpy as np +from pyscf import lib +from pyscf.gto import ATOM_OF +from pyscf.lib import c_null_ptr +from gpu4pyscf.lib.cupy_helper import load_library, cart2sph, get_avail_mem +from gpu4pyscf.gto.int3c1e import VHFOpt + +libgint = load_library('libgint') + +def get_int3c1e_ipip1_charge_contracted(mol, grids, charge_exponents, charges, intopt): + omega = mol.omega + assert omega >= 0.0, "Short-range one-electron integrals with GPU acceleration are not implemented."
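+ # Both nabla operators act on the bra AO center (the dAdA integrals in the tests). + # The result is a (3, 3, nao, nao) tensor contracted with the grid-point charges; + # the kernel fills only the upper triangle of the 3x3 Cartesian block, and the + # symmetric lower triangle is copied in after the kernel call.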
+ + grids = cp.asarray(grids, order='C') + if charge_exponents is not None: + charge_exponents = cp.asarray(charge_exponents, order='C') + + assert charges.ndim == 1 and charges.shape[0] == grids.shape[0] + charges = cp.asarray(charges).astype(np.float64) + + charges = charges.reshape([-1, 1], order='C') + grids = cp.concatenate([grids, charges], axis=1) + + int1e_charge_contracted = cp.empty([3, 3, mol.nao, mol.nao], order='C') + for cp_ij_id, _ in enumerate(intopt.log_qs): + cpi = intopt.cp_idx[cp_ij_id] + cpj = intopt.cp_jdx[cp_ij_id] + li = intopt.angular[cpi] + lj = intopt.angular[cpj] + + stream = cp.cuda.get_current_stream() + + log_q_ij = intopt.log_qs[cp_ij_id] + + nbins = 1 + bins_locs_ij = np.array([0, len(log_q_ij)], dtype=np.int32) + + i0, i1 = intopt.cart_ao_loc[cpi], intopt.cart_ao_loc[cpi+1] + j0, j1 = intopt.cart_ao_loc[cpj], intopt.cart_ao_loc[cpj+1] + ni = i1 - i0 + nj = j1 - j0 + + ao_offsets = np.array([i0, j0], dtype=np.int32) + strides = np.array([ni, ni*nj], dtype=np.int32) + + charge_exponents_pointer = c_null_ptr() + if charge_exponents is not None: + charge_exponents_pointer = charge_exponents.data.ptr + + ngrids = grids.shape[0] + # n_charge_sum_per_thread = 1 # means every thread processes one pair and one grid + # n_charge_sum_per_thread = ngrids # or a larger number guarantees one thread processes one pair and all grid points + n_charge_sum_per_thread = 100 # This number roughly optimizes kernel performance on a large system + + int1e_angular_slice = cp.zeros([3, 3, j1-j0, i1-i0], order='C') + + err = libgint.GINTfill_int3c1e_ipip1_charge_contracted( + ctypes.cast(stream.ptr, ctypes.c_void_p), + intopt.bpcache, + ctypes.cast(grids.data.ptr, ctypes.c_void_p), + ctypes.cast(charge_exponents_pointer, ctypes.c_void_p), + ctypes.c_int(ngrids), + ctypes.cast(int1e_angular_slice.data.ptr, ctypes.c_void_p), + strides.ctypes.data_as(ctypes.c_void_p), + ao_offsets.ctypes.data_as(ctypes.c_void_p), + bins_locs_ij.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(nbins), + ctypes.c_int(cp_ij_id), + ctypes.c_double(omega), + ctypes.c_int(n_charge_sum_per_thread)) + + if err != 0: + raise RuntimeError('GINTfill_int3c1e_ipip1_charge_contracted failed') + + int1e_angular_slice[1,0] = int1e_angular_slice[0,1] + int1e_angular_slice[2,0] = int1e_angular_slice[0,2] + int1e_angular_slice[2,1] = int1e_angular_slice[1,2] + + i0, i1 = intopt.ao_loc[cpi], intopt.ao_loc[cpi+1] + j0, j1 = intopt.ao_loc[cpj], intopt.ao_loc[cpj+1] + if not mol.cart: + int1e_angular_slice = cart2sph(int1e_angular_slice, axis=2, ang=lj) + int1e_angular_slice = cart2sph(int1e_angular_slice, axis=3, ang=li) + + int1e_charge_contracted[:, :, i0:i1, j0:j1] = int1e_angular_slice.transpose(0,1,3,2) + + return intopt.unsort_orbitals(int1e_charge_contracted, axis=[2,3]) + +def get_int3c1e_ipvip1_charge_contracted(mol, grids, charge_exponents, charges, intopt): + omega = mol.omega + assert omega >= 0.0, "Short-range one-electron integrals with GPU acceleration are not implemented."
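+ # One nabla on the bra center and one on the ket center (the dAdB integrals in the + # tests). Unlike ipip1 above, the 3x3 block couples two different centers and is + # not symmetric, so no post-kernel symmetrization is performed.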
+ + grids = cp.asarray(grids, order='C') + if charge_exponents is not None: + charge_exponents = cp.asarray(charge_exponents, order='C') + + assert charges.ndim == 1 and charges.shape[0] == grids.shape[0] + charges = cp.asarray(charges).astype(np.float64) + + charges = charges.reshape([-1, 1], order='C') + grids = cp.concatenate([grids, charges], axis=1) + + int1e_charge_contracted = cp.empty([3, 3, mol.nao, mol.nao], order='C') + for cp_ij_id, _ in enumerate(intopt.log_qs): + cpi = intopt.cp_idx[cp_ij_id] + cpj = intopt.cp_jdx[cp_ij_id] + li = intopt.angular[cpi] + lj = intopt.angular[cpj] + + stream = cp.cuda.get_current_stream() + + log_q_ij = intopt.log_qs[cp_ij_id] + + nbins = 1 + bins_locs_ij = np.array([0, len(log_q_ij)], dtype=np.int32) + + i0, i1 = intopt.cart_ao_loc[cpi], intopt.cart_ao_loc[cpi+1] + j0, j1 = intopt.cart_ao_loc[cpj], intopt.cart_ao_loc[cpj+1] + ni = i1 - i0 + nj = j1 - j0 + + ao_offsets = np.array([i0, j0], dtype=np.int32) + strides = np.array([ni, ni*nj], dtype=np.int32) + + charge_exponents_pointer = c_null_ptr() + if charge_exponents is not None: + charge_exponents_pointer = charge_exponents.data.ptr + + ngrids = grids.shape[0] + # n_charge_sum_per_thread = 1 # means every thread processes one pair and one grid + # n_charge_sum_per_thread = ngrids # or a larger number guarantees one thread processes one pair and all grid points + n_charge_sum_per_thread = 100 # This number roughly optimizes kernel performance on a large system + + int1e_angular_slice = cp.zeros([3, 3, j1-j0, i1-i0], order='C') + + err = libgint.GINTfill_int3c1e_ipvip1_charge_contracted( + ctypes.cast(stream.ptr, ctypes.c_void_p), + intopt.bpcache, + ctypes.cast(grids.data.ptr, ctypes.c_void_p), + ctypes.cast(charge_exponents_pointer, ctypes.c_void_p), + ctypes.c_int(ngrids), + ctypes.cast(int1e_angular_slice.data.ptr, ctypes.c_void_p), + strides.ctypes.data_as(ctypes.c_void_p), + ao_offsets.ctypes.data_as(ctypes.c_void_p), + bins_locs_ij.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(nbins), + ctypes.c_int(cp_ij_id), + ctypes.c_double(omega), + ctypes.c_int(n_charge_sum_per_thread)) + + if err != 0: + raise RuntimeError('GINTfill_int3c1e_ipvip1_charge_contracted failed') + + i0, i1 = intopt.ao_loc[cpi], intopt.ao_loc[cpi+1] + j0, j1 = intopt.ao_loc[cpj], intopt.ao_loc[cpj+1] + if not mol.cart: + int1e_angular_slice = cart2sph(int1e_angular_slice, axis=2, ang=lj) + int1e_angular_slice = cart2sph(int1e_angular_slice, axis=3, ang=li) + + int1e_charge_contracted[:, :, i0:i1, j0:j1] = int1e_angular_slice.transpose(0,1,3,2) + + return intopt.unsort_orbitals(int1e_charge_contracted, axis=[2,3]) + +def get_int3c1e_ip1ip2_charge_contracted(mol, grids, charge_exponents, charges, intopt): + omega = mol.omega + assert omega >= 0.0, "Short-range one-electron integrals with GPU acceleration are not implemented."
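+ # One nabla on the bra AO center and one on the grid-point (charge) center (the + # dAdC integrals in the tests). As with ipvip1, the 3x3 block is not symmetrized.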
+ + grids = cp.asarray(grids, order='C') + if charge_exponents is not None: + charge_exponents = cp.asarray(charge_exponents, order='C') + + assert charges.ndim == 1 and charges.shape[0] == grids.shape[0] + charges = cp.asarray(charges).astype(np.float64) + + charges = charges.reshape([-1, 1], order='C') + grids = cp.concatenate([grids, charges], axis=1) + + int1e_charge_contracted = cp.empty([3, 3, mol.nao, mol.nao], order='C') + for cp_ij_id, _ in enumerate(intopt.log_qs): + cpi = intopt.cp_idx[cp_ij_id] + cpj = intopt.cp_jdx[cp_ij_id] + li = intopt.angular[cpi] + lj = intopt.angular[cpj] + + stream = cp.cuda.get_current_stream() + + log_q_ij = intopt.log_qs[cp_ij_id] + + nbins = 1 + bins_locs_ij = np.array([0, len(log_q_ij)], dtype=np.int32) + + i0, i1 = intopt.cart_ao_loc[cpi], intopt.cart_ao_loc[cpi+1] + j0, j1 = intopt.cart_ao_loc[cpj], intopt.cart_ao_loc[cpj+1] + ni = i1 - i0 + nj = j1 - j0 + + ao_offsets = np.array([i0, j0], dtype=np.int32) + strides = np.array([ni, ni*nj], dtype=np.int32) + + charge_exponents_pointer = c_null_ptr() + if charge_exponents is not None: + charge_exponents_pointer = charge_exponents.data.ptr + + ngrids = grids.shape[0] + # n_charge_sum_per_thread = 1 # means every thread processes one pair and one grid + # n_charge_sum_per_thread = ngrids # or a larger number guarantees one thread processes one pair and all grid points + n_charge_sum_per_thread = 100 # This number roughly optimizes kernel performance on a large system + + int1e_angular_slice = cp.zeros([3, 3, j1-j0, i1-i0], order='C') + + err = libgint.GINTfill_int3c1e_ip1ip2_charge_contracted( + ctypes.cast(stream.ptr, ctypes.c_void_p), + intopt.bpcache, + ctypes.cast(grids.data.ptr, ctypes.c_void_p), + ctypes.cast(charge_exponents_pointer, ctypes.c_void_p), + ctypes.c_int(ngrids), + ctypes.cast(int1e_angular_slice.data.ptr, ctypes.c_void_p), + strides.ctypes.data_as(ctypes.c_void_p), + ao_offsets.ctypes.data_as(ctypes.c_void_p), + bins_locs_ij.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(nbins), + ctypes.c_int(cp_ij_id), + ctypes.c_double(omega), + ctypes.c_int(n_charge_sum_per_thread)) + + if err != 0: + raise RuntimeError('GINTfill_int3c1e_ip1ip2_charge_contracted failed') + + i0, i1 = intopt.ao_loc[cpi], intopt.ao_loc[cpi+1] + j0, j1 = intopt.ao_loc[cpj], intopt.ao_loc[cpj+1] + if not mol.cart: + int1e_angular_slice = cart2sph(int1e_angular_slice, axis=2, ang=lj) + int1e_angular_slice = cart2sph(int1e_angular_slice, axis=3, ang=li) + + int1e_charge_contracted[:, :, i0:i1, j0:j1] = int1e_angular_slice.transpose(0,1,3,2) + + return intopt.unsort_orbitals(int1e_charge_contracted, axis=[2,3]) + +def get_int3c1e_ipip2_density_contracted(mol, grids, charge_exponents, dm, intopt): + omega = mol.omega + assert omega >= 0.0, "Short-range one-electron integrals with GPU acceleration are not implemented."
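+ # Both nabla operators act on the grid-point center (the dCdC integrals in the + # tests). In contrast to the charge-contracted routines above, the integrals are + # contracted with an AO density matrix, giving one 3x3 block per grid point with + # result shape (3, 3, ngrids).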
+ + nao_cart = intopt._sorted_mol.nao + ngrids = grids.shape[0] + + grids = cp.asarray(grids, order='C') + if charge_exponents is not None: + charge_exponents = cp.asarray(charge_exponents, order='C') + + dm = cp.asarray(dm) + assert dm.ndim == 2 + assert dm.shape[0] == dm.shape[1] and dm.shape[0] == mol.nao + + dm = intopt.sort_orbitals(dm, [0,1]) + if not mol.cart: + cart2sph_transformation_matrix = intopt.cart2sph + # TODO: This part is inefficient (O(N^3)); it should be changed to an O(N^2) algorithm + dm = cart2sph_transformation_matrix @ dm @ cart2sph_transformation_matrix.T + dm = dm.flatten(order='F') # Column major order matches (i + j * n_ao) access pattern in the C function + + dm = cp.asnumpy(dm) + + ao_loc_sorted_order = intopt._sorted_mol.ao_loc_nr(cart = True) + l_ij = intopt.l_ij.T.flatten() + bas_coords = intopt._sorted_mol.atom_coords()[intopt._sorted_mol._bas[:, ATOM_OF]].flatten() + + n_total_hermite_density = intopt.density_offset[-1] + dm_pair_ordered = np.empty(n_total_hermite_density) + libgint.GINTinit_J_density_rys_preprocess(dm.ctypes.data_as(ctypes.c_void_p), + dm_pair_ordered.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(1), ctypes.c_int(nao_cart), + ctypes.c_int(len(intopt.bas_pairs_locs) - 1), + intopt.bas_pair2shls.ctypes.data_as(ctypes.c_void_p), + intopt.bas_pairs_locs.ctypes.data_as(ctypes.c_void_p), + l_ij.ctypes.data_as(ctypes.c_void_p), + intopt.density_offset.ctypes.data_as(ctypes.c_void_p), + ao_loc_sorted_order.ctypes.data_as(ctypes.c_void_p), + bas_coords.ctypes.data_as(ctypes.c_void_p), + ctypes.c_bool(False)) + + dm_pair_ordered = cp.asarray(dm_pair_ordered) + + n_threads_per_block_1d = 16 + n_max_blocks_per_grid_1d = 65535 + n_max_threads_1d = n_threads_per_block_1d * n_max_blocks_per_grid_1d + n_grid_split = int(np.ceil(ngrids / n_max_threads_1d)) + if (n_grid_split > 100): + print(f"Grid dimension = {ngrids} is too large, more than 100 kernels for one-electron integrals will be launched.") + ngrids_per_split = (ngrids + n_grid_split - 1) // n_grid_split + + int3c_density_contracted = cp.zeros([3, 3, ngrids], order='C') + + for p0, p1 in lib.prange(0, ngrids, ngrids_per_split): + for cp_ij_id, _ in enumerate(intopt.log_qs): + stream = cp.cuda.get_current_stream() + + log_q_ij = intopt.log_qs[cp_ij_id] + + nbins = 1 + bins_locs_ij = np.array([0, len(log_q_ij)], dtype=np.int32) + + charge_exponents_pointer = c_null_ptr() + if charge_exponents is not None: + exponents_slice = charge_exponents[p0:p1] + charge_exponents_pointer = exponents_slice.data.ptr + grids_slice = grids[p0:p1] + + # n_pair_sum_per_thread = 1 # means every thread processes one pair and one grid + # n_pair_sum_per_thread = nao_cart # or a larger number guarantees one thread processes one grid and all pairs of the same type + n_pair_sum_per_thread = nao_cart + + err = libgint.GINTfill_int3c1e_ipip2_density_contracted( + ctypes.cast(stream.ptr, ctypes.c_void_p), + intopt.bpcache, + ctypes.cast(grids_slice.data.ptr, ctypes.c_void_p), + ctypes.cast(charge_exponents_pointer, ctypes.c_void_p), + ctypes.c_int(p1-p0), + ctypes.cast(dm_pair_ordered.data.ptr, ctypes.c_void_p), + intopt.density_offset.ctypes.data_as(ctypes.c_void_p), + ctypes.cast(int3c_density_contracted[:, p0:p1].data.ptr, ctypes.c_void_p), + bins_locs_ij.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(nbins), + ctypes.c_int(cp_ij_id), + ctypes.c_double(omega), + ctypes.c_int(n_pair_sum_per_thread)) + + if err != 0: + raise RuntimeError('GINTfill_int3c1e_ipip2_density_contracted failed') + + int3c_density_contracted[1,0] =
int3c_density_contracted[0,1] + int3c_density_contracted[2,0] = int3c_density_contracted[0,2] + int3c_density_contracted[2,1] = int3c_density_contracted[1,2] + + return int3c_density_contracted + +def int1e_grids_ipip1(mol, grids, charge_exponents=None, charges=None, direct_scf_tol=1e-13, intopt=None): + assert grids is not None + assert charges is not None + + if intopt is None: + intopt = VHFOpt(mol) + intopt.build(direct_scf_tol, aosym=False) + else: + assert isinstance(intopt, VHFOpt), \ + f"Please make sure intopt is a {VHFOpt.__module__}.{VHFOpt.__name__} object." + assert hasattr(intopt, "density_offset"), "Please call build() function for VHFOpt object first." + assert not intopt.aosym + + return get_int3c1e_ipip1_charge_contracted(mol, grids, charge_exponents, charges, intopt) + +def int1e_grids_ipvip1(mol, grids, charge_exponents=None, charges=None, direct_scf_tol=1e-13, intopt=None): + assert grids is not None + assert charges is not None + + if intopt is None: + intopt = VHFOpt(mol) + intopt.build(direct_scf_tol, aosym=False) + else: + assert isinstance(intopt, VHFOpt), \ + f"Please make sure intopt is a {VHFOpt.__module__}.{VHFOpt.__name__} object." + assert hasattr(intopt, "density_offset"), "Please call build() function for VHFOpt object first." + assert not intopt.aosym + + return get_int3c1e_ipvip1_charge_contracted(mol, grids, charge_exponents, charges, intopt) + +def int1e_grids_ip1ip2(mol, grids, charge_exponents=None, charges=None, direct_scf_tol=1e-13, intopt=None): + assert grids is not None + assert charges is not None + + if intopt is None: + intopt = VHFOpt(mol) + intopt.build(direct_scf_tol, aosym=False) + else: + assert isinstance(intopt, VHFOpt), \ + f"Please make sure intopt is a {VHFOpt.__module__}.{VHFOpt.__name__} object." + assert hasattr(intopt, "density_offset"), "Please call build() function for VHFOpt object first." + assert not intopt.aosym + + return get_int3c1e_ip1ip2_charge_contracted(mol, grids, charge_exponents, charges, intopt) + +def int1e_grids_ipip2(mol, grids, charge_exponents=None, dm=None, direct_scf_tol=1e-13, intopt=None): + assert grids is not None + assert dm is not None + + if intopt is None: + intopt = VHFOpt(mol) + intopt.build(direct_scf_tol, aosym=False) + else: + assert isinstance(intopt, VHFOpt), \ + f"Please make sure intopt is a {VHFOpt.__module__}.{VHFOpt.__name__} object." + assert hasattr(intopt, "density_offset"), "Please call build() function for VHFOpt object first." + assert not intopt.aosym + + return get_int3c1e_ipip2_density_contracted(mol, grids, charge_exponents, dm, intopt) diff --git a/gpu4pyscf/gto/tests/test_int1e_grids_ip.py b/gpu4pyscf/gto/tests/test_int1e_grids_ip.py index 56f87e4b..de68266b 100644 --- a/gpu4pyscf/gto/tests/test_int1e_grids_ip.py +++ b/gpu4pyscf/gto/tests/test_int1e_grids_ip.py @@ -364,5 +364,5 @@ def test_int1e_grids_ip1_density_contracted(self): cp.testing.assert_allclose(ref_int1e_dA, test_int1e_dA, atol = integral_threshold) if __name__ == "__main__": - print("Full Tests for One Electron Coulomb Integrals") + print("Full Tests for One Electron Coulomb Integrals 1st Derivative") unittest.main() diff --git a/gpu4pyscf/gto/tests/test_int1e_grids_ipip.py b/gpu4pyscf/gto/tests/test_int1e_grids_ipip.py new file mode 100644 index 00000000..18f9fed9 --- /dev/null +++ b/gpu4pyscf/gto/tests/test_int1e_grids_ipip.py @@ -0,0 +1,480 @@ +# Copyright 2024 The GPU4PySCF Authors. All Rights Reserved. 
+# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. + +import unittest +import numpy as np +import cupy as cp +import pyscf +from pyscf import lib, gto, df +from gpu4pyscf.gto.int3c1e_ipip import int1e_grids_ipip1, int1e_grids_ipvip1, int1e_grids_ip1ip2, int1e_grids_ipip2 + +def setUpModule(): + global mol_sph, mol_cart, grid_points, integral_threshold, density_contraction_threshold, charge_contraction_threshold + atom = ''' +O 0.0000 0.7375 -0.0528 +O 0.0000 -0.7375 -0.1528 +H 0.8190 0.8170 0.4220 +H -0.8190 -0.8170 0.4220 +''' + bas='def2-qzvpp' + + mol_sph = pyscf.M(atom=atom, basis=bas, max_memory=32000) + mol_sph.output = '/dev/null' + mol_sph.verbose = 0 + mol_sph.build() + + mol_cart = pyscf.M(atom=atom, basis=bas, max_memory=32000, cart=True) + mol_cart.output = '/dev/null' + mol_cart.verbose = 0 + mol_cart.build() + + xs = np.arange(-2.01, 2.0, 0.5) + ys = np.arange(-2.02, 2.0, 0.5) + zs = np.arange(-2.03, 2.0, 0.5) + grid_points = lib.cartesian_prod([xs, ys, zs]) + + # All of the following thresholds bound the max value of the corresponding matrix / tensor. + integral_threshold = 1e-12 + density_contraction_threshold = 1e-10 + charge_contraction_threshold = 1e-12 + +def tearDownModule(): + global mol_sph, mol_cart, grid_points + mol_sph.stdout.close() + mol_cart.stdout.close() + del mol_sph, mol_cart, grid_points + +class KnownValues(unittest.TestCase): + ''' + Values are compared to PySCF CPU intor() function + ''' + def test_int1e_grids_ipip1_charge_contracted_cart(self): + np.random.seed(12345) + charges = np.random.uniform(-2.0, 2.0, grid_points.shape[0]) + + mol = mol_cart + fakemol = gto.fakemol_for_charges(grid_points) + + int3c2e_ipip1 = mol._add_suffix('int3c2e_ipip1') + cintopt = gto.moleintor.make_cintopt(mol._atm, mol._bas, mol._env, int3c2e_ipip1) + v_nj = df.incore.aux_e2(mol, fakemol, intor=int3c2e_ipip1, aosym='s1', cintopt=cintopt) + ref_int1e_dAdA = np.einsum('dijq,q->dij', v_nj, charges) + ref_int1e_dAdA = ref_int1e_dAdA.reshape(3, 3, mol.nao, mol.nao) + + test_int1e_dAdA = int1e_grids_ipip1(mol, grid_points, charges = charges) + + assert isinstance(test_int1e_dAdA, cp.ndarray) + cp.testing.assert_allclose(ref_int1e_dAdA, test_int1e_dAdA, atol = integral_threshold) + + def test_int1e_grids_ipip1_charge_contracted_sph(self): + np.random.seed(12345) + charges = np.random.uniform(-2.0, 2.0, grid_points.shape[0]) + + mol = mol_sph + fakemol = gto.fakemol_for_charges(grid_points) + + int3c2e_ipip1 = mol._add_suffix('int3c2e_ipip1') + cintopt = gto.moleintor.make_cintopt(mol._atm, mol._bas, mol._env, int3c2e_ipip1) + v_nj = df.incore.aux_e2(mol, fakemol, intor=int3c2e_ipip1, aosym='s1', cintopt=cintopt) + ref_int1e_dAdA = np.einsum('dijq,q->dij', v_nj, charges) + ref_int1e_dAdA = ref_int1e_dAdA.reshape(3, 3, mol.nao, mol.nao) + + test_int1e_dAdA = int1e_grids_ipip1(mol, grid_points, charges = charges) + + assert isinstance(test_int1e_dAdA, cp.ndarray) + cp.testing.assert_allclose(ref_int1e_dAdA, test_int1e_dAdA,
atol = integral_threshold) + + def test_int1e_grids_ipip1_charge_contracted_gaussian_charge(self): + np.random.seed(12345) + charges = np.random.uniform(-2.0, 2.0, grid_points.shape[0]) + charge_exponents = np.random.uniform(0.5, 1.0, grid_points.shape[0]) + + mol = mol_sph + fakemol = gto.fakemol_for_charges(grid_points, expnt=charge_exponents) + + int3c2e_ipip1 = mol._add_suffix('int3c2e_ipip1') + cintopt = gto.moleintor.make_cintopt(mol._atm, mol._bas, mol._env, int3c2e_ipip1) + v_nj = df.incore.aux_e2(mol, fakemol, intor=int3c2e_ipip1, aosym='s1', cintopt=cintopt) + ref_int1e_dAdA = np.einsum('dijq,q->dij', v_nj, charges) + ref_int1e_dAdA = ref_int1e_dAdA.reshape(3, 3, mol.nao, mol.nao) + + test_int1e_dAdA = int1e_grids_ipip1(mol, grid_points, charges = charges, charge_exponents = charge_exponents) + + assert isinstance(test_int1e_dAdA, cp.ndarray) + cp.testing.assert_allclose(ref_int1e_dAdA, test_int1e_dAdA, atol = integral_threshold) + + def test_int1e_grids_ipip1_charge_contracted_omega(self): + np.random.seed(12345) + charges = np.random.uniform(-2.0, 2.0, grid_points.shape[0]) + + omega = 0.8 + mol_sph_omega = mol_sph.copy() + mol_sph_omega.set_range_coulomb(omega) + + mol = mol_sph_omega + fakemol = gto.fakemol_for_charges(grid_points) + + int3c2e_ipip1 = mol._add_suffix('int3c2e_ipip1') + cintopt = gto.moleintor.make_cintopt(mol._atm, mol._bas, mol._env, int3c2e_ipip1) + v_nj = df.incore.aux_e2(mol, fakemol, intor=int3c2e_ipip1, aosym='s1', cintopt=cintopt) + ref_int1e_dAdA = np.einsum('dijq,q->dij', v_nj, charges) + ref_int1e_dAdA = ref_int1e_dAdA.reshape(3, 3, mol.nao, mol.nao) + + test_int1e_dAdA = int1e_grids_ipip1(mol, grid_points, charges = charges) + + assert isinstance(test_int1e_dAdA, cp.ndarray) + cp.testing.assert_allclose(ref_int1e_dAdA, test_int1e_dAdA, atol = integral_threshold) + + def test_int1e_grids_ipip1_charge_contracted_gaussian_charge_omega(self): + np.random.seed(12345) + charges = np.random.uniform(-2.0, 2.0, grid_points.shape[0]) + charge_exponents = np.random.uniform(0.5, 1.0, grid_points.shape[0]) + + omega = 0.8 + mol_sph_omega = mol_sph.copy() + mol_sph_omega.set_range_coulomb(omega) + + mol = mol_sph_omega + fakemol = gto.fakemol_for_charges(grid_points, expnt=charge_exponents) + + int3c2e_ipip1 = mol._add_suffix('int3c2e_ipip1') + cintopt = gto.moleintor.make_cintopt(mol._atm, mol._bas, mol._env, int3c2e_ipip1) + v_nj = df.incore.aux_e2(mol, fakemol, intor=int3c2e_ipip1, aosym='s1', cintopt=cintopt) + ref_int1e_dAdA = np.einsum('dijq,q->dij', v_nj, charges) + ref_int1e_dAdA = ref_int1e_dAdA.reshape(3, 3, mol.nao, mol.nao) + + test_int1e_dAdA = int1e_grids_ipip1(mol, grid_points, charges = charges, charge_exponents = charge_exponents) + + assert isinstance(test_int1e_dAdA, cp.ndarray) + cp.testing.assert_allclose(ref_int1e_dAdA, test_int1e_dAdA, atol = integral_threshold) + + # ^ ipip1 v ipvip1 + + def test_int1e_grids_ipvip1_charge_contracted_cart(self): + np.random.seed(12345) + charges = np.random.uniform(-2.0, 2.0, grid_points.shape[0]) + + mol = mol_cart + fakemol = gto.fakemol_for_charges(grid_points) + + int3c2e_ipvip1 = mol._add_suffix('int3c2e_ipvip1') + cintopt = gto.moleintor.make_cintopt(mol._atm, mol._bas, mol._env, int3c2e_ipvip1) + v_nj = df.incore.aux_e2(mol, fakemol, intor=int3c2e_ipvip1, aosym='s1', cintopt=cintopt) + ref_int1e_dAdB = np.einsum('dijq,q->dij', v_nj, charges) + ref_int1e_dAdB = ref_int1e_dAdB.reshape(3, 3, mol.nao, mol.nao) + + test_int1e_dAdB = int1e_grids_ipvip1(mol, grid_points, charges = charges) + + assert 
isinstance(test_int1e_dAdB, cp.ndarray) + cp.testing.assert_allclose(ref_int1e_dAdB, test_int1e_dAdB, atol = integral_threshold) + + def test_int1e_grids_ipvip1_charge_contracted_sph(self): + np.random.seed(12345) + charges = np.random.uniform(-2.0, 2.0, grid_points.shape[0]) + + mol = mol_sph + fakemol = gto.fakemol_for_charges(grid_points) + + int3c2e_ipvip1 = mol._add_suffix('int3c2e_ipvip1') + cintopt = gto.moleintor.make_cintopt(mol._atm, mol._bas, mol._env, int3c2e_ipvip1) + v_nj = df.incore.aux_e2(mol, fakemol, intor=int3c2e_ipvip1, aosym='s1', cintopt=cintopt) + ref_int1e_dAdB = np.einsum('dijq,q->dij', v_nj, charges) + ref_int1e_dAdB = ref_int1e_dAdB.reshape(3, 3, mol.nao, mol.nao) + + test_int1e_dAdB = int1e_grids_ipvip1(mol, grid_points, charges = charges) + + assert isinstance(test_int1e_dAdB, cp.ndarray) + cp.testing.assert_allclose(ref_int1e_dAdB, test_int1e_dAdB, atol = integral_threshold) + + def test_int1e_grids_ipvip1_charge_contracted_gaussian_charge(self): + np.random.seed(12345) + charges = np.random.uniform(-2.0, 2.0, grid_points.shape[0]) + charge_exponents = np.random.uniform(0.5, 1.0, grid_points.shape[0]) + + mol = mol_sph + fakemol = gto.fakemol_for_charges(grid_points, expnt=charge_exponents) + + int3c2e_ipvip1 = mol._add_suffix('int3c2e_ipvip1') + cintopt = gto.moleintor.make_cintopt(mol._atm, mol._bas, mol._env, int3c2e_ipvip1) + v_nj = df.incore.aux_e2(mol, fakemol, intor=int3c2e_ipvip1, aosym='s1', cintopt=cintopt) + ref_int1e_dAdB = np.einsum('dijq,q->dij', v_nj, charges) + ref_int1e_dAdB = ref_int1e_dAdB.reshape(3, 3, mol.nao, mol.nao) + + test_int1e_dAdB = int1e_grids_ipvip1(mol, grid_points, charges = charges, charge_exponents = charge_exponents) + + assert isinstance(test_int1e_dAdB, cp.ndarray) + cp.testing.assert_allclose(ref_int1e_dAdB, test_int1e_dAdB, atol = integral_threshold) + + def test_int1e_grids_ipvip1_charge_contracted_omega(self): + np.random.seed(12345) + charges = np.random.uniform(-2.0, 2.0, grid_points.shape[0]) + + omega = 0.8 + mol_sph_omega = mol_sph.copy() + mol_sph_omega.set_range_coulomb(omega) + + mol = mol_sph_omega + fakemol = gto.fakemol_for_charges(grid_points) + + int3c2e_ipvip1 = mol._add_suffix('int3c2e_ipvip1') + cintopt = gto.moleintor.make_cintopt(mol._atm, mol._bas, mol._env, int3c2e_ipvip1) + v_nj = df.incore.aux_e2(mol, fakemol, intor=int3c2e_ipvip1, aosym='s1', cintopt=cintopt) + ref_int1e_dAdB = np.einsum('dijq,q->dij', v_nj, charges) + ref_int1e_dAdB = ref_int1e_dAdB.reshape(3, 3, mol.nao, mol.nao) + + test_int1e_dAdB = int1e_grids_ipvip1(mol, grid_points, charges = charges) + + assert isinstance(test_int1e_dAdB, cp.ndarray) + cp.testing.assert_allclose(ref_int1e_dAdB, test_int1e_dAdB, atol = integral_threshold) + + def test_int1e_grids_ipvip1_charge_contracted_gaussian_charge_omega(self): + np.random.seed(12345) + charges = np.random.uniform(-2.0, 2.0, grid_points.shape[0]) + charge_exponents = np.random.uniform(0.5, 1.0, grid_points.shape[0]) + + omega = 0.8 + mol_sph_omega = mol_sph.copy() + mol_sph_omega.set_range_coulomb(omega) + + mol = mol_sph_omega + fakemol = gto.fakemol_for_charges(grid_points, expnt=charge_exponents) + + int3c2e_ipvip1 = mol._add_suffix('int3c2e_ipvip1') + cintopt = gto.moleintor.make_cintopt(mol._atm, mol._bas, mol._env, int3c2e_ipvip1) + v_nj = df.incore.aux_e2(mol, fakemol, intor=int3c2e_ipvip1, aosym='s1', cintopt=cintopt) + ref_int1e_dAdB = np.einsum('dijq,q->dij', v_nj, charges) + ref_int1e_dAdB = ref_int1e_dAdB.reshape(3, 3, mol.nao, mol.nao) + + test_int1e_dAdB = 
int1e_grids_ipvip1(mol, grid_points, charges = charges, charge_exponents = charge_exponents) + + assert isinstance(test_int1e_dAdB, cp.ndarray) + cp.testing.assert_allclose(ref_int1e_dAdB, test_int1e_dAdB, atol = integral_threshold) + + # ^ ipvip1 v ip1ip2 + + def test_int1e_grids_ip1ip2_charge_contracted_cart(self): + np.random.seed(12345) + charges = np.random.uniform(-2.0, 2.0, grid_points.shape[0]) + + mol = mol_cart + fakemol = gto.fakemol_for_charges(grid_points) + + int3c2e_ip1ip2 = mol._add_suffix('int3c2e_ip1ip2') + cintopt = gto.moleintor.make_cintopt(mol._atm, mol._bas, mol._env, int3c2e_ip1ip2) + v_nj = df.incore.aux_e2(mol, fakemol, intor=int3c2e_ip1ip2, aosym='s1', cintopt=cintopt) + ref_int1e_dAdC = np.einsum('dijq,q->dij', v_nj, charges) + ref_int1e_dAdC = ref_int1e_dAdC.reshape(3, 3, mol.nao, mol.nao) + + test_int1e_dAdC = int1e_grids_ip1ip2(mol, grid_points, charges = charges) + + assert isinstance(test_int1e_dAdC, cp.ndarray) + cp.testing.assert_allclose(ref_int1e_dAdC, test_int1e_dAdC, atol = integral_threshold) + + def test_int1e_grids_ip1ip2_charge_contracted_sph(self): + np.random.seed(12345) + charges = np.random.uniform(-2.0, 2.0, grid_points.shape[0]) + + mol = mol_sph + fakemol = gto.fakemol_for_charges(grid_points) + + int3c2e_ip1ip2 = mol._add_suffix('int3c2e_ip1ip2') + cintopt = gto.moleintor.make_cintopt(mol._atm, mol._bas, mol._env, int3c2e_ip1ip2) + v_nj = df.incore.aux_e2(mol, fakemol, intor=int3c2e_ip1ip2, aosym='s1', cintopt=cintopt) + ref_int1e_dAdC = np.einsum('dijq,q->dij', v_nj, charges) + ref_int1e_dAdC = ref_int1e_dAdC.reshape(3, 3, mol.nao, mol.nao) + + test_int1e_dAdC = int1e_grids_ip1ip2(mol, grid_points, charges = charges) + + assert isinstance(test_int1e_dAdC, cp.ndarray) + cp.testing.assert_allclose(ref_int1e_dAdC, test_int1e_dAdC, atol = integral_threshold) + + def test_int1e_grids_ip1ip2_charge_contracted_gaussian_charge(self): + np.random.seed(12345) + charges = np.random.uniform(-2.0, 2.0, grid_points.shape[0]) + charge_exponents = np.random.uniform(0.5, 1.0, grid_points.shape[0]) + + mol = mol_sph + fakemol = gto.fakemol_for_charges(grid_points, expnt=charge_exponents) + + int3c2e_ip1ip2 = mol._add_suffix('int3c2e_ip1ip2') + cintopt = gto.moleintor.make_cintopt(mol._atm, mol._bas, mol._env, int3c2e_ip1ip2) + v_nj = df.incore.aux_e2(mol, fakemol, intor=int3c2e_ip1ip2, aosym='s1', cintopt=cintopt) + ref_int1e_dAdC = np.einsum('dijq,q->dij', v_nj, charges) + ref_int1e_dAdC = ref_int1e_dAdC.reshape(3, 3, mol.nao, mol.nao) + + test_int1e_dAdC = int1e_grids_ip1ip2(mol, grid_points, charges = charges, charge_exponents = charge_exponents) + + assert isinstance(test_int1e_dAdC, cp.ndarray) + cp.testing.assert_allclose(ref_int1e_dAdC, test_int1e_dAdC, atol = integral_threshold) + + def test_int1e_grids_ip1ip2_charge_contracted_omega(self): + np.random.seed(12345) + charges = np.random.uniform(-2.0, 2.0, grid_points.shape[0]) + + omega = 0.8 + mol_sph_omega = mol_sph.copy() + mol_sph_omega.set_range_coulomb(omega) + + mol = mol_sph_omega + fakemol = gto.fakemol_for_charges(grid_points) + + int3c2e_ip1ip2 = mol._add_suffix('int3c2e_ip1ip2') + cintopt = gto.moleintor.make_cintopt(mol._atm, mol._bas, mol._env, int3c2e_ip1ip2) + v_nj = df.incore.aux_e2(mol, fakemol, intor=int3c2e_ip1ip2, aosym='s1', cintopt=cintopt) + ref_int1e_dAdC = np.einsum('dijq,q->dij', v_nj, charges) + ref_int1e_dAdC = ref_int1e_dAdC.reshape(3, 3, mol.nao, mol.nao) + + test_int1e_dAdC = int1e_grids_ip1ip2(mol, grid_points, charges = charges) + + assert 
isinstance(test_int1e_dAdC, cp.ndarray) + cp.testing.assert_allclose(ref_int1e_dAdC, test_int1e_dAdC, atol = integral_threshold) + + def test_int1e_grids_ip1ip2_charge_contracted_gaussian_charge_omega(self): + np.random.seed(12345) + charges = np.random.uniform(-2.0, 2.0, grid_points.shape[0]) + charge_exponents = np.random.uniform(0.5, 1.0, grid_points.shape[0]) + + omega = 0.8 + mol_sph_omega = mol_sph.copy() + mol_sph_omega.set_range_coulomb(omega) + + mol = mol_sph_omega + fakemol = gto.fakemol_for_charges(grid_points, expnt=charge_exponents) + + int3c2e_ip1ip2 = mol._add_suffix('int3c2e_ip1ip2') + cintopt = gto.moleintor.make_cintopt(mol._atm, mol._bas, mol._env, int3c2e_ip1ip2) + v_nj = df.incore.aux_e2(mol, fakemol, intor=int3c2e_ip1ip2, aosym='s1', cintopt=cintopt) + ref_int1e_dAdC = np.einsum('dijq,q->dij', v_nj, charges) + ref_int1e_dAdC = ref_int1e_dAdC.reshape(3, 3, mol.nao, mol.nao) + + test_int1e_dAdC = int1e_grids_ip1ip2(mol, grid_points, charges = charges, charge_exponents = charge_exponents) + + assert isinstance(test_int1e_dAdC, cp.ndarray) + cp.testing.assert_allclose(ref_int1e_dAdC, test_int1e_dAdC, atol = integral_threshold) + + # ^ ip1ip2 v ipip2 + + def test_int1e_grids_ipip2_charge_contracted_cart(self): + np.random.seed(12345) + dm = np.random.uniform(-2.0, 2.0, (mol_cart.nao, mol_cart.nao)) + + mol = mol_cart + fakemol = gto.fakemol_for_charges(grid_points) + + # Note: we cannot compute ipip2 (dCdC) directly due to numerical problems: + # PySCF treats a point charge as a sharp Gaussian, and we cannot take the 2nd derivative of it. + int3c2e_ip1ip2 = mol._add_suffix('int3c2e_ip1ip2') + cintopt = gto.moleintor.make_cintopt(mol._atm, mol._bas, mol._env, int3c2e_ip1ip2) + v_nj = df.incore.aux_e2(mol, fakemol, intor=int3c2e_ip1ip2, aosym='s1', cintopt=cintopt) + v_nj = -v_nj - v_nj.transpose(0, 2, 1, 3) # dCdC = -dAdC - dBdC + ref_int1e_dCdC = np.einsum('dijq,ij->dq', v_nj, dm) + ref_int1e_dCdC = ref_int1e_dCdC.reshape(3, 3, grid_points.shape[0]) + + test_int1e_dCdC = int1e_grids_ipip2(mol, grid_points, dm = dm) + + assert isinstance(test_int1e_dCdC, cp.ndarray) + cp.testing.assert_allclose(ref_int1e_dCdC, test_int1e_dCdC, atol = integral_threshold) + + def test_int1e_grids_ipip2_charge_contracted_sph(self): + np.random.seed(12345) + dm = np.random.uniform(-2.0, 2.0, (mol_sph.nao, mol_sph.nao)) + + mol = mol_sph + fakemol = gto.fakemol_for_charges(grid_points) + + # Note: we cannot compute ipip2 (dCdC) directly due to numerical problems: + # PySCF treats a point charge as a sharp Gaussian, and we cannot take the 2nd derivative of it.
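+ # Translational invariance of <i(r-A)| 1/|r-C| |j(r-B)> gives (dA + dB + dC) f = 0, + # hence dCdC = -dAdC - dBdC, where dBdC follows from dAdC by transposing the two + # AO indices (the transpose below).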
+ int3c2e_ip1ip2 = mol._add_suffix('int3c2e_ip1ip2') + cintopt = gto.moleintor.make_cintopt(mol._atm, mol._bas, mol._env, int3c2e_ip1ip2) + v_nj = df.incore.aux_e2(mol, fakemol, intor=int3c2e_ip1ip2, aosym='s1', cintopt=cintopt) + v_nj = -v_nj - v_nj.transpose(0, 2, 1, 3) # dCdC = -dAdC - dBdC + ref_int1e_dCdC = np.einsum('dijq,ij->dq', v_nj, dm) + ref_int1e_dCdC = ref_int1e_dCdC.reshape(3, 3, grid_points.shape[0]) + + test_int1e_dCdC = int1e_grids_ipip2(mol, grid_points, dm = dm) + + assert isinstance(test_int1e_dCdC, cp.ndarray) + cp.testing.assert_allclose(ref_int1e_dCdC, test_int1e_dCdC, atol = integral_threshold) + + def test_int1e_grids_ipip2_charge_contracted_gaussian_charge(self): + np.random.seed(12345) + dm = np.random.uniform(-2.0, 2.0, (mol_sph.nao, mol_sph.nao)) + charge_exponents = np.random.uniform(0.5, 1.0, grid_points.shape[0]) + + mol = mol_sph + fakemol = gto.fakemol_for_charges(grid_points, expnt=charge_exponents) + + int3c2e_ipip2 = mol._add_suffix('int3c2e_ipip2') + cintopt = gto.moleintor.make_cintopt(mol._atm, mol._bas, mol._env, int3c2e_ipip2) + v_nj = df.incore.aux_e2(mol, fakemol, intor=int3c2e_ipip2, aosym='s1', cintopt=cintopt) + ref_int1e_dCdC = np.einsum('dijq,ij->dq', v_nj, dm) + ref_int1e_dCdC = ref_int1e_dCdC.reshape(3, 3, grid_points.shape[0]) + + test_int1e_dCdC = int1e_grids_ipip2(mol, grid_points, dm = dm, charge_exponents = charge_exponents) + + assert isinstance(test_int1e_dCdC, cp.ndarray) + cp.testing.assert_allclose(ref_int1e_dCdC, test_int1e_dCdC, atol = integral_threshold) + + def test_int1e_grids_ipip2_charge_contracted_omega(self): + np.random.seed(12345) + dm = np.random.uniform(-2.0, 2.0, (mol_sph.nao, mol_sph.nao)) + + omega = 0.8 + mol_sph_omega = mol_sph.copy() + mol_sph_omega.set_range_coulomb(omega) + + mol = mol_sph_omega + fakemol = gto.fakemol_for_charges(grid_points) + + # Note: we cannot compute ipip2 (dCdC) directly due to numerical problems: + # PySCF treats a point charge as a sharp Gaussian, and we cannot take the 2nd derivative of it.
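+ # Same translational-invariance workaround as in the cart/sph tests above; only the + # Gaussian-charge test can call int3c2e_ipip2 directly, since a finite-width Gaussian + # charge is smooth enough to differentiate twice.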
+ int3c2e_ip1ip2 = mol._add_suffix('int3c2e_ip1ip2') + cintopt = gto.moleintor.make_cintopt(mol._atm, mol._bas, mol._env, int3c2e_ip1ip2) + v_nj = df.incore.aux_e2(mol, fakemol, intor=int3c2e_ip1ip2, aosym='s1', cintopt=cintopt) + v_nj = -v_nj - v_nj.transpose(0, 2, 1, 3) # dCdC = -dAdC - dBdC + ref_int1e_dCdC = np.einsum('dijq,ij->dq', v_nj, dm) + ref_int1e_dCdC = ref_int1e_dCdC.reshape(3, 3, grid_points.shape[0]) + + test_int1e_dCdC = int1e_grids_ipip2(mol, grid_points, dm = dm) + + assert isinstance(test_int1e_dCdC, cp.ndarray) + cp.testing.assert_allclose(ref_int1e_dCdC, test_int1e_dCdC, atol = integral_threshold) + + def test_int1e_grids_ipip2_charge_contracted_gaussian_charge_omega(self): + np.random.seed(12345) + dm = np.random.uniform(-2.0, 2.0, (mol_sph.nao, mol_sph.nao)) + charge_exponents = np.random.uniform(0.5, 1.0, grid_points.shape[0]) + + omega = 0.8 + mol_sph_omega = mol_sph.copy() + mol_sph_omega.set_range_coulomb(omega) + + mol = mol_sph_omega + fakemol = gto.fakemol_for_charges(grid_points, expnt=charge_exponents) + + int3c2e_ipip2 = mol._add_suffix('int3c2e_ipip2') + cintopt = gto.moleintor.make_cintopt(mol._atm, mol._bas, mol._env, int3c2e_ipip2) + v_nj = df.incore.aux_e2(mol, fakemol, intor=int3c2e_ipip2, aosym='s1', cintopt=cintopt) + ref_int1e_dCdC = np.einsum('dijq,ij->dq', v_nj, dm) + ref_int1e_dCdC = ref_int1e_dCdC.reshape(3, 3, grid_points.shape[0]) + + test_int1e_dCdC = int1e_grids_ipip2(mol, grid_points, dm = dm, charge_exponents = charge_exponents) + + assert isinstance(test_int1e_dCdC, cp.ndarray) + cp.testing.assert_allclose(ref_int1e_dCdC, test_int1e_dCdC, atol = integral_threshold) + +if __name__ == "__main__": + print("Full Tests for One Electron Coulomb Integrals 2nd Derivative") + unittest.main() diff --git a/gpu4pyscf/hessian/jk.py b/gpu4pyscf/hessian/jk.py index 65edff6b..5ed768c2 100644 --- a/gpu4pyscf/hessian/jk.py +++ b/gpu4pyscf/hessian/jk.py @@ -33,7 +33,7 @@ reduce_to_device, contract) from gpu4pyscf.__config__ import props as gpu_specs -from gpu4pyscf.__config__ import _streams, _num_devices +from gpu4pyscf.__config__ import _streams, num_devices from gpu4pyscf.lib import logger @@ -174,7 +174,7 @@ def get_jk(mol, dm, mo_coeff, mo_occ, hermi=0, vhfopt=None, if vhfopt is None: # Small group size for load balance group_size = None - if _num_devices > 1: + if num_devices > 1: group_size = jk.GROUP_SIZE vhfopt = _VHFOpt(mol).build(group_size=group_size) @@ -202,13 +202,13 @@ def get_jk(mol, dm, mo_coeff, mo_occ, hermi=0, vhfopt=None, tasks.append((i,j,k,l)) tasks = np.array(tasks) task_list = [] - for device_id in range(_num_devices): - task_list.append(tasks[device_id::_num_devices]) + for device_id in range(num_devices): + task_list.append(tasks[device_id::num_devices]) cp.cuda.get_current_stream().synchronize() futures = [] - with ThreadPoolExecutor(max_workers=_num_devices) as executor: - for device_id in range(_num_devices): + with ThreadPoolExecutor(max_workers=num_devices) as executor: + for device_id in range(num_devices): future = executor.submit( _jk_task, mol, dms, mo_coeff, mo_occ, vhfopt, task_list[device_id], hermi=hermi, diff --git a/gpu4pyscf/hessian/rhf.py b/gpu4pyscf/hessian/rhf.py index 775a6e98..b39aab8e 100644 --- a/gpu4pyscf/hessian/rhf.py +++ b/gpu4pyscf/hessian/rhf.py @@ -32,7 +32,7 @@ contract, tag_array, sandwich_dot, transpose_sum, get_avail_mem, condense, krylov) from gpu4pyscf.__config__ import props as gpu_specs -from gpu4pyscf.__config__ import _streams, _num_devices +from gpu4pyscf.__config__ import _streams, 
num_devices from gpu4pyscf.lib import logger from gpu4pyscf.scf.jk import ( LMAX, QUEUE_DEPTH, SHM_SIZE, THREADS, GROUP_SIZE, libvhf_rys, _VHFOpt, @@ -271,7 +271,7 @@ def _partial_ejk_ip2(mol, dm, vhfopt=None, j_factor=1., k_factor=1., verbose=Non if vhfopt is None: # Small group size for load balance group_size = None - if _num_devices > 1: + if num_devices > 1: group_size = GROUP_SIZE vhfopt = _VHFOpt(mol).build(group_size=group_size) @@ -296,13 +296,13 @@ def _partial_ejk_ip2(mol, dm, vhfopt=None, j_factor=1., k_factor=1., verbose=Non tasks.append((i,j,k,l)) tasks = np.array(tasks) task_list = [] - for device_id in range(_num_devices): - task_list.append(tasks[device_id::_num_devices]) + for device_id in range(num_devices): + task_list.append(tasks[device_id::num_devices]) cp.cuda.get_current_stream().synchronize() futures = [] - with ThreadPoolExecutor(max_workers=_num_devices) as executor: - for device_id in range(_num_devices): + with ThreadPoolExecutor(max_workers=num_devices) as executor: + for device_id in range(num_devices): future = executor.submit( _ejk_ip2_task, mol, dms, vhfopt, task_list[device_id], @@ -494,7 +494,7 @@ def _get_jk_ip1(mol, dm, with_j=True, with_k=True, atoms_slice=None, verbose=Non vhfopt.tile = 1 # Small group size for load balance group_size = None - if _num_devices > 1: + if num_devices > 1: group_size = GROUP_SIZE vhfopt.build(group_size=group_size) @@ -532,13 +532,13 @@ def _get_jk_ip1(mol, dm, with_j=True, with_k=True, atoms_slice=None, verbose=Non tasks.append((i,j,k,l)) tasks = np.array(tasks) task_list = [] - for device_id in range(_num_devices): - task_list.append(tasks[device_id::_num_devices]) + for device_id in range(num_devices): + task_list.append(tasks[device_id::num_devices]) cp.cuda.get_current_stream().synchronize() futures = [] - with ThreadPoolExecutor(max_workers=_num_devices) as executor: - for device_id in range(_num_devices): + with ThreadPoolExecutor(max_workers=num_devices) as executor: + for device_id in range(num_devices): future = executor.submit( _build_jk_ip1_task, mol, dms, vhfopt, task_list[device_id], atoms_slice, @@ -908,7 +908,7 @@ def _get_jk_mo(hessobj, mol, dms, mo_coeff, mo_occ, with mol.with_range_coulomb(omega): # Small group size for load balance group_size = None - if _num_devices > 1: + if num_devices > 1: group_size = GROUP_SIZE vhfopt = _VHFOpt(mol, mf.direct_scf_tol).build(group_size=group_size) mf._opt_gpu[omega] = vhfopt diff --git a/gpu4pyscf/hessian/rks.py b/gpu4pyscf/hessian/rks.py index a1c01079..c12ef0e2 100644 --- a/gpu4pyscf/hessian/rks.py +++ b/gpu4pyscf/hessian/rks.py @@ -30,7 +30,7 @@ from gpu4pyscf.lib.cupy_helper import (contract, add_sparse, get_avail_mem, reduce_to_device, transpose_sum) from gpu4pyscf.lib import logger -from gpu4pyscf.__config__ import _streams, _num_devices +from gpu4pyscf.__config__ import _streams, num_devices from gpu4pyscf.hessian import jk def partial_hess_elec(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, @@ -49,7 +49,7 @@ def partial_hess_elec(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, dm0 = cupy.dot(mocc, mocc.T) * 2 if mf.do_nlc(): - raise NotImplementedError + raise NotImplementedError("2nd derivative of NLC is not implemented.") omega, alpha, hyb = ni.rsh_and_hybrid_coeff(mf.xc, spin=mol.spin) with_k = ni.libxc.is_hybrid_xc(mf.xc) @@ -524,8 +524,8 @@ def _get_vxc_deriv2(hessobj, mo_coeff, mo_occ, max_memory): futures = [] cupy.cuda.get_current_stream().synchronize() - with ThreadPoolExecutor(max_workers=_num_devices) as executor: - for device_id in 
range(_num_devices): + with ThreadPoolExecutor(max_workers=num_devices) as executor: + for device_id in range(num_devices): future = executor.submit( _get_vxc_deriv2_task, hessobj, grids, mo_coeff, mo_occ, max_memory, @@ -550,7 +550,6 @@ def _get_vxc_deriv1_task(hessobj, grids, mo_coeff, mo_occ, max_memory, device_id ngrids_glob = grids.coords.shape[0] grid_start, grid_end = numint.gen_grid_range(ngrids_glob, device_id) - with cupy.cuda.Device(device_id), _streams[device_id]: mo_occ = cupy.asarray(mo_occ) mo_coeff = cupy.asarray(mo_coeff) @@ -688,8 +687,8 @@ def _get_vxc_deriv1(hessobj, mo_coeff, mo_occ, max_memory): futures = [] cupy.cuda.get_current_stream().synchronize() - with ThreadPoolExecutor(max_workers=_num_devices) as executor: - for device_id in range(_num_devices): + with ThreadPoolExecutor(max_workers=num_devices) as executor: + for device_id in range(num_devices): future = executor.submit( _get_vxc_deriv1_task, hessobj, grids, mo_coeff, mo_occ, max_memory, @@ -796,8 +795,8 @@ def nr_rks_fxc_mo(ni, mol, grids, xc_code, dm0=None, dms=None, mo_coeff=None, re futures = [] cupy.cuda.get_current_stream().synchronize() - with ThreadPoolExecutor(max_workers=_num_devices) as executor: - for device_id in range(_num_devices): + with ThreadPoolExecutor(max_workers=num_devices) as executor: + for device_id in range(num_devices): future = executor.submit( _nr_rks_fxc_mo_task, ni, mol, grids, xc_code, fxc, mo_coeff, mo1, mocc, diff --git a/gpu4pyscf/hessian/uks.py b/gpu4pyscf/hessian/uks.py index 0c2995f8..620331e0 100644 --- a/gpu4pyscf/hessian/uks.py +++ b/gpu4pyscf/hessian/uks.py @@ -47,8 +47,8 @@ def partial_hess_elec(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, dm0b = moccb.dot(moccb.T) dm0 = cp.asarray((dm0a, dm0b)) - if mf.nlc != '': - raise NotImplementedError + if mf.do_nlc(): + raise NotImplementedError("2nd derivative of NLC is not implemented.") omega, alpha, hyb = ni.rsh_and_hybrid_coeff(mf.xc, spin=mol.spin) with_k = ni.libxc.is_hybrid_xc(mf.xc) diff --git a/gpu4pyscf/lib/cupy_helper.py b/gpu4pyscf/lib/cupy_helper.py index fe197c71..4c62d8db 100644 --- a/gpu4pyscf/lib/cupy_helper.py +++ b/gpu4pyscf/lib/cupy_helper.py @@ -23,8 +23,8 @@ from gpu4pyscf.gto import mole from gpu4pyscf.lib.cutensor import contract from gpu4pyscf.lib.cusolver import eigh, cholesky #NOQA -from gpu4pyscf.lib.memcpy import copy_array #NOQA -from gpu4pyscf.__config__ import _streams, _num_devices, _p2p_access +from gpu4pyscf.lib.memcpy import copy_array, p2p_transfer #NOQA +from gpu4pyscf.__config__ import _streams, num_devices, _p2p_access LMAX_ON_GPU = 7 DSOLVE_LINDEP = 1e-13 @@ -81,23 +81,6 @@ def get_avail_mem(): mem_avail = cupy.cuda.runtime.memGetInfo()[0] return mem_avail + total_mem - used_mem -def p2p_transfer(a, b): - ''' If the direct P2P data transfer is not available, transfer data via CPU memory - ''' - if a.device == b.device: - a[:] = b - elif _p2p_access: - a[:] = b - ''' - elif a.strides == b.strides and a.flags.c_contiguous and a.dtype == b.dtype: - # cupy supports a direct copy from different devices without p2p. 
See also - # https://github.com/cupy/cupy/blob/v13.3.0/cupy/_core/_routines_indexing.pyx#L48 - # https://github.com/cupy/cupy/blob/v13.3.0/cupy/_core/_routines_indexing.pyx#L1015 - a[:] = b - ''' - else: - copy_array(b, a) - def concatenate(array_list): ''' Concatenate axis=0 only ''' @@ -126,8 +109,8 @@ def reduce_to_device(array_list, inplace=False): ''' Reduce a list of ndarray in different devices to device 0 TODO: reduce memory footprint, improve throughput ''' - assert len(array_list) == _num_devices - if _num_devices == 1: + assert len(array_list) == num_devices + if num_devices == 1: return array_list[0] out_shape = array_list[0].shape diff --git a/gpu4pyscf/lib/cusolver.py b/gpu4pyscf/lib/cusolver.py index 5c8d2dd6..393d7d96 100644 --- a/gpu4pyscf/lib/cusolver.py +++ b/gpu4pyscf/lib/cusolver.py @@ -16,11 +16,13 @@ import numpy as np import cupy import ctypes +from ctypes.util import find_library from cupy_backends.cuda.libs import cusolver from cupy_backends.cuda.libs import cublas from cupy.cuda import device -libcusolver = ctypes.CDLL('libcusolver.so') +libcusolver = find_library('cusolver') +libcusolver = ctypes.CDLL(libcusolver) CUSOLVER_EIG_TYPE_1 = 1 CUSOLVER_EIG_TYPE_2 = 2 diff --git a/gpu4pyscf/lib/cutensor.py b/gpu4pyscf/lib/cutensor.py index 0599e39a..034076ab 100644 --- a/gpu4pyscf/lib/cutensor.py +++ b/gpu4pyscf/lib/cutensor.py @@ -103,10 +103,11 @@ def contraction( ws = cupy.empty(ws_size, dtype=np.int8) out = c - alpha = np.asarray(alpha) - beta = np.asarray(beta) + alpha = np.asarray(alpha, dtype=dtype) + beta = np.asarray(beta, dtype=dtype) - cutensor_backend.contract(cutensor._get_handle().ptr, plan.ptr, + handler = cutensor._get_handle() + cutensor_backend.contract(handler.ptr, plan.ptr, alpha.ctypes.data, a.data.ptr, b.data.ptr, beta.ctypes.data, c.data.ptr, out.data.ptr, ws.data.ptr, ws_size) @@ -114,13 +115,10 @@ def contraction( return out import os -if 'CONTRACT_ENGINE' in os.environ: - contract_engine = os.environ['CONTRACT_ENGINE'] -else: - contract_engine = None - +contract_engine = None if cutensor is None: contract_engine = 'cupy' # default contraction engine +contract_engine = os.environ.get('CONTRACT_ENGINE', contract_engine) # override the 'contract' function if einsum is customized or cutensor is not found if contract_engine is not None: @@ -139,10 +137,15 @@ def contraction( warnings.warn(f'using {contract_engine} as the tensor contraction engine.') def contract(pattern, a, b, alpha=1.0, beta=0.0, out=None): if out is None: - return cupy.asarray(einsum(pattern, a, b), order='C') + out = einsum(pattern, a, b) + out *= alpha + elif beta == 0.: + out[:] = einsum(pattern, a, b) + out *= alpha else: - out[:] = alpha*einsum(pattern, a, b) + beta*out - return cupy.asarray(out, order='C') + out *= beta + out += alpha*einsum(pattern, a, b) + return cupy.asarray(out, order='C') else: def contract(pattern, a, b, alpha=1.0, beta=0.0, out=None): ''' diff --git a/gpu4pyscf/lib/gint/CMakeLists.txt b/gpu4pyscf/lib/gint/CMakeLists.txt index 464efed6..7647c2c3 100644 --- a/gpu4pyscf/lib/gint/CMakeLists.txt +++ b/gpu4pyscf/lib/gint/CMakeLists.txt @@ -26,6 +26,7 @@ add_library(gint SHARED nr_fill_ao_ints.cu nr_fill_ao_int3c1e.cu nr_fill_ao_int3c1e_ip.cu + nr_fill_ao_int3c1e_ipip.cu nr_fill_ao_int3c2e.cu nr_fill_ao_int3c2e_ip1.cu nr_fill_ao_int3c2e_ip2.cu diff --git a/gpu4pyscf/lib/gint/g3c1e_ipip.cu b/gpu4pyscf/lib/gint/g3c1e_ipip.cu new file mode 100644 index 00000000..87ebb270 --- /dev/null +++ b/gpu4pyscf/lib/gint/g3c1e_ipip.cu @@ -0,0 +1,635 @@ +/* + * Copyright 
2021-2024 The PySCF Developers. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "gint.h" + +template <int NROOTS> +__device__ +static void GINTwrite_int3c1e_ipip1_charge_contracted(const double* g, double* local_output, const double minus_two_a, const double prefactor, const int i_l, const int j_l) +{ + const int *idx = c_idx; + const int *idy = c_idx + TOT_NF; + const int *idz = c_idx + TOT_NF * 2; + + const int g_size = NROOTS * (i_l + 2 + 1) * (j_l + 1); + const double* __restrict__ gx = g; + const double* __restrict__ gy = g + g_size; + const double* __restrict__ gz = g + g_size * 2; + + const int n_density_elements_i = (i_l + 1) * (i_l + 2) / 2; + const int n_density_elements_j = (j_l + 1) * (j_l + 2) / 2; + const int n_density_elements_ij = n_density_elements_i * n_density_elements_j; + for (int j = 0; j < n_density_elements_j; j++) { + for (int i = 0; i < n_density_elements_i; i++) { + const int loc_j = c_l_locs[j_l] + j; + const int loc_i = c_l_locs[i_l] + i; + const int ix = idx[loc_i]; + const int iy = idy[loc_i]; + const int iz = idz[loc_i]; + const int jx = idx[loc_j]; + const int jy = idy[loc_j]; + const int jz = idz[loc_j]; + const int gx_offset = ix + jx * (i_l + 2 + 1); + const int gy_offset = iy + jy * (i_l + 2 + 1); + const int gz_offset = iz + jz * (i_l + 2 + 1); + + double d2eri_dAxdAx = 0; + double d2eri_dAxdAy = 0; + double d2eri_dAxdAz = 0; + double d2eri_dAydAy = 0; + double d2eri_dAydAz = 0; + double d2eri_dAzdAz = 0; +#pragma unroll + for (int i_root = 0; i_root < NROOTS; i_root++) { + const double gx_minus_2 = (ix >= 2 ? gx[(gx_offset - 2) * NROOTS + i_root] : 0); + const double gy_minus_2 = (iy >= 2 ? gy[(gy_offset - 2) * NROOTS + i_root] : 0); + const double gz_minus_2 = (iz >= 2 ? gz[(gz_offset - 2) * NROOTS + i_root] : 0); + const double gx_minus_1 = (ix >= 1 ? gx[(gx_offset - 1) * NROOTS + i_root] : 0); + const double gy_minus_1 = (iy >= 1 ? gy[(gy_offset - 1) * NROOTS + i_root] : 0); + const double gz_minus_1 = (iz >= 1 ?
gz[(gz_offset - 1) * NROOTS + i_root] : 0); + const double gx_0 = gx[gx_offset * NROOTS + i_root]; + const double gy_0 = gy[gy_offset * NROOTS + i_root]; + const double gz_0 = gz[gz_offset * NROOTS + i_root]; + const double gx_1 = gx[(gx_offset + 1) * NROOTS + i_root]; + const double gy_1 = gy[(gy_offset + 1) * NROOTS + i_root]; + const double gz_1 = gz[(gz_offset + 1) * NROOTS + i_root]; + const double gx_2 = gx[(gx_offset + 2) * NROOTS + i_root]; + const double gy_2 = gy[(gy_offset + 2) * NROOTS + i_root]; + const double gz_2 = gz[(gz_offset + 2) * NROOTS + i_root]; + const double dgx_dAx = ix * gx_minus_1 + minus_two_a * gx_1; + const double dgy_dAy = iy * gy_minus_1 + minus_two_a * gy_1; + const double dgz_dAz = iz * gz_minus_1 + minus_two_a * gz_1; + const double d2gx_dAx2 = ix * (ix - 1) * gx_minus_2 + minus_two_a * (2 * ix + 1) * gx_0 + minus_two_a * minus_two_a * gx_2; + const double d2gy_dAy2 = iy * (iy - 1) * gy_minus_2 + minus_two_a * (2 * iy + 1) * gy_0 + minus_two_a * minus_two_a * gy_2; + const double d2gz_dAz2 = iz * (iz - 1) * gz_minus_2 + minus_two_a * (2 * iz + 1) * gz_0 + minus_two_a * minus_two_a * gz_2; + d2eri_dAxdAx += d2gx_dAx2 * gy_0 * gz_0; + d2eri_dAxdAy += dgx_dAx * dgy_dAy * gz_0; + d2eri_dAxdAz += dgx_dAx * gy_0 * dgz_dAz; + d2eri_dAydAy += gx_0 * d2gy_dAy2 * gz_0; + d2eri_dAydAz += gx_0 * dgy_dAy * dgz_dAz; + d2eri_dAzdAz += gx_0 * gy_0 * d2gz_dAz2; + } + local_output[i + j * n_density_elements_i + 0 * n_density_elements_ij] += d2eri_dAxdAx * prefactor; + local_output[i + j * n_density_elements_i + 1 * n_density_elements_ij] += d2eri_dAxdAy * prefactor; + local_output[i + j * n_density_elements_i + 2 * n_density_elements_ij] += d2eri_dAxdAz * prefactor; + local_output[i + j * n_density_elements_i + 3 * n_density_elements_ij] += d2eri_dAydAy * prefactor; + local_output[i + j * n_density_elements_i + 4 * n_density_elements_ij] += d2eri_dAydAz * prefactor; + local_output[i + j * n_density_elements_i + 5 * n_density_elements_ij] += d2eri_dAzdAz * prefactor; + } + } +}
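(Illustration, not part of the patch.) The per-direction second derivative above uses the Gaussian recurrence d2g_i/dA2 = i(i-1)*g_{i-2} - 2a(2i+1)*g_i + 4a^2*g_{i+2} for g_i(x) = (x-A)^i * exp(-a(x-A)^2), which is exactly what the d2gx_dAx2 lines compute with minus_two_a = -2a (the ix >= 1 / ix >= 2 guards mask the undefined lower terms). A standalone NumPy sketch checking the recurrence against a finite difference:

```python
import numpy as np

def g(i, x, A, a):
    # 1D Cartesian Gaussian factor: (x - A)**i * exp(-a*(x - A)**2)
    return (x - A)**i * np.exp(-a*(x - A)**2)

i, x, A, a = 3, 0.7, 0.2, 1.5   # i >= 2, so no terms are masked out
m2a = -2.0*a                    # the kernel's minus_two_a
d2 = i*(i - 1)*g(i - 2, x, A, a) + m2a*(2*i + 1)*g(i, x, A, a) + m2a*m2a*g(i + 2, x, A, a)
h = 1e-4
fd = (g(i, x, A + h, a) - 2.0*g(i, x, A, a) + g(i, x, A - h, a)) / h**2
assert abs(d2 - fd) < 1e-4
```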
+ +template <int NROOTS, int GSIZE_INT3C_1E> +__global__ +static void GINTfill_int3c1e_ipip1_charge_contracted_kernel_general(double* output, const BasisProdOffsets offsets, const int i_l, const int j_l, const int nprim_ij, + const int stride_j, const int stride_ij, const int ao_offsets_i, const int ao_offsets_j, + const double omega, const double* grid_points, const double* charge_exponents) +{ + const int ntasks_ij = offsets.ntasks_ij; + const int ngrids = offsets.ntasks_kl; + const int task_ij = blockIdx.x * blockDim.x + threadIdx.x; + if (task_ij >= ntasks_ij) { + return; + } + + const int bas_ij = offsets.bas_ij + task_ij; + const int prim_ij = offsets.primitive_ij + task_ij * nprim_ij; + const int* bas_pair2bra = c_bpcache.bas_pair2bra; + const int* bas_pair2ket = c_bpcache.bas_pair2ket; + const int ish = bas_pair2bra[bas_ij]; + const int jsh = bas_pair2ket[bas_ij]; + const double* __restrict__ a_exponents = c_bpcache.a1; + + constexpr int l_sum_max = (NROOTS - 1) * 2 + 1; + constexpr int l_i_max_density_elements = (l_sum_max + 1) / 2; + constexpr int l_j_max_density_elements = l_sum_max - l_i_max_density_elements; + double output_cache[(l_i_max_density_elements + 1) * (l_i_max_density_elements + 2) / 2 + * (l_j_max_density_elements + 1) * (l_j_max_density_elements + 2) / 2 + * 6] { 0.0 }; + + for (int task_grid = blockIdx.y * blockDim.y + threadIdx.y; task_grid < ngrids; task_grid += gridDim.y * blockDim.y) { + const double* grid_point = grid_points + task_grid * 4; + const double charge = grid_point[3]; + const double charge_exponent = (charge_exponents != NULL) ? charge_exponents[task_grid] : 0.0; + + double g[GSIZE_INT3C_1E]; + + for (int ij = prim_ij; ij < prim_ij+nprim_ij; ++ij) { + GINT_g1e<NROOTS>(g, grid_point, ish, jsh, ij, i_l + 2, j_l, charge_exponent, omega); + const double minus_two_a = -2.0 * a_exponents[ij]; + GINTwrite_int3c1e_ipip1_charge_contracted<NROOTS>(g, output_cache, minus_two_a, charge, i_l, j_l); + } + } + + const int* ao_loc = c_bpcache.ao_loc; + + const int i0 = ao_loc[ish] - ao_offsets_i; + const int j0 = ao_loc[jsh] - ao_offsets_j; + const int n_density_elements_i = (i_l + 1) * (i_l + 2) / 2; + const int n_density_elements_j = (j_l + 1) * (j_l + 2) / 2; + const int n_density_elements_ij = n_density_elements_i * n_density_elements_j; + for (int j = 0; j < n_density_elements_j; j++) { + for (int i = 0; i < n_density_elements_i; i++) { + const double d2eri_dAxdAx = output_cache[i + j * n_density_elements_i + 0 * n_density_elements_ij]; + const double d2eri_dAxdAy = output_cache[i + j * n_density_elements_i + 1 * n_density_elements_ij]; + const double d2eri_dAxdAz = output_cache[i + j * n_density_elements_i + 2 * n_density_elements_ij]; + const double d2eri_dAydAy = output_cache[i + j * n_density_elements_i + 3 * n_density_elements_ij]; + const double d2eri_dAydAz = output_cache[i + j * n_density_elements_i + 4 * n_density_elements_ij]; + const double d2eri_dAzdAz = output_cache[i + j * n_density_elements_i + 5 * n_density_elements_ij]; + atomicAdd(output + ((i + i0) + (j + j0) * stride_j + 0 * stride_ij), d2eri_dAxdAx); + atomicAdd(output + ((i + i0) + (j + j0) * stride_j + 1 * stride_ij), d2eri_dAxdAy); + atomicAdd(output + ((i + i0) + (j + j0) * stride_j + 2 * stride_ij), d2eri_dAxdAz); + atomicAdd(output + ((i + i0) + (j + j0) * stride_j + 4 * stride_ij), d2eri_dAydAy); + atomicAdd(output + ((i + i0) + (j + j0) * stride_j + 5 * stride_ij), d2eri_dAydAz); + atomicAdd(output + ((i + i0) + (j + j0) * stride_j + 8 * stride_ij), d2eri_dAzdAz); + } + } +} + +template <int NROOTS> +__device__ +static void GINTwrite_int3c1e_ipvip1_charge_contracted(const double* g, double* local_output, const double minus_two_a, const double minus_two_b, const double prefactor, const int i_l, const int j_l) +{ + const int *idx = c_idx; + const int *idy = c_idx + TOT_NF; + const int *idz = c_idx + TOT_NF * 2; + + const int g_size = NROOTS * (i_l + 1 + 1) * (j_l + 1 + 1); + const double* __restrict__ gx = g; + const double* __restrict__ gy = g + g_size; + const double* __restrict__ gz = g + g_size * 2; + + const int n_density_elements_i = (i_l + 1) * (i_l + 2) / 2; + const int n_density_elements_j = (j_l + 1) * (j_l + 2) / 2; + const int n_density_elements_ij = n_density_elements_i * n_density_elements_j; + for (int j = 0; j < n_density_elements_j; j++) { + for (int i = 0; i < n_density_elements_i; i++) { + const int loc_j = c_l_locs[j_l] + j; + const int loc_i = c_l_locs[i_l] + i; + const int ix = idx[loc_i]; + const int iy = idy[loc_i]; + const int iz = idz[loc_i]; + const int jx = idx[loc_j]; + const int jy = idy[loc_j]; + const int jz = idz[loc_j]; + const int j_offset = i_l + 1 + 1; + + double d2eri_dAxdBx = 0; + double d2eri_dAxdBy = 0; + double d2eri_dAxdBz = 0; + double d2eri_dAydBx = 0; + double d2eri_dAydBy = 0; + double d2eri_dAydBz = 0; + double d2eri_dAzdBx = 0; + double d2eri_dAzdBy = 0; + double d2eri_dAzdBz = 0; +#pragma unroll + for (int i_root = 0; i_root < NROOTS; i_root++) { + const double gx_i_minus_1_j_minus_1 = ix * jx * (ix >= 1 && jx >= 1 ?
gx[(ix - 1 + (jx - 1) * j_offset) * NROOTS + i_root] : 0); + const double gy_i_minus_1_j_minus_1 = iy * jy * (iy >= 1 && jy >= 1 ? gy[(iy - 1 + (jy - 1) * j_offset) * NROOTS + i_root] : 0); + const double gz_i_minus_1_j_minus_1 = iz * jz * (iz >= 1 && jz >= 1 ? gz[(iz - 1 + (jz - 1) * j_offset) * NROOTS + i_root] : 0); + const double gx_i_minus_1_j_1 = ix * minus_two_b * (ix >= 1 ? gx[(ix - 1 + (jx + 1) * j_offset) * NROOTS + i_root] : 0); + const double gy_i_minus_1_j_1 = iy * minus_two_b * (iy >= 1 ? gy[(iy - 1 + (jy + 1) * j_offset) * NROOTS + i_root] : 0); + const double gz_i_minus_1_j_1 = iz * minus_two_b * (iz >= 1 ? gz[(iz - 1 + (jz + 1) * j_offset) * NROOTS + i_root] : 0); + const double gx_i_1_j_minus_1 = jx * minus_two_a * (jx >= 1 ? gx[(ix + 1 + (jx - 1) * j_offset) * NROOTS + i_root] : 0); + const double gy_i_1_j_minus_1 = jy * minus_two_a * (jy >= 1 ? gy[(iy + 1 + (jy - 1) * j_offset) * NROOTS + i_root] : 0); + const double gz_i_1_j_minus_1 = jz * minus_two_a * (jz >= 1 ? gz[(iz + 1 + (jz - 1) * j_offset) * NROOTS + i_root] : 0); + const double gx_i_1_j_1 = minus_two_a * minus_two_b * gx[(ix + 1 + (jx + 1) * j_offset) * NROOTS + i_root]; + const double gy_i_1_j_1 = minus_two_a * minus_two_b * gy[(iy + 1 + (jy + 1) * j_offset) * NROOTS + i_root]; + const double gz_i_1_j_1 = minus_two_a * minus_two_b * gz[(iz + 1 + (jz + 1) * j_offset) * NROOTS + i_root]; + const double gx_0 = gx[(ix + jx * j_offset) * NROOTS + i_root]; + const double gy_0 = gy[(iy + jy * j_offset) * NROOTS + i_root]; + const double gz_0 = gz[(iz + jz * j_offset) * NROOTS + i_root]; + const double gx_i_1_j_0 = minus_two_a * gx[(ix + 1 + jx * j_offset) * NROOTS + i_root]; + const double gy_i_1_j_0 = minus_two_a * gy[(iy + 1 + jy * j_offset) * NROOTS + i_root]; + const double gz_i_1_j_0 = minus_two_a * gz[(iz + 1 + jz * j_offset) * NROOTS + i_root]; + const double gx_i_minus_1_j_0 = ix * (ix >= 1 ? gx[(ix - 1 + jx * j_offset) * NROOTS + i_root] : 0); + const double gy_i_minus_1_j_0 = iy * (iy >= 1 ? gy[(iy - 1 + jy * j_offset) * NROOTS + i_root] : 0); + const double gz_i_minus_1_j_0 = iz * (iz >= 1 ? gz[(iz - 1 + jz * j_offset) * NROOTS + i_root] : 0); + const double gx_i_0_j_1 = minus_two_b * gx[(ix + (jx + 1) * j_offset) * NROOTS + i_root]; + const double gy_i_0_j_1 = minus_two_b * gy[(iy + (jy + 1) * j_offset) * NROOTS + i_root]; + const double gz_i_0_j_1 = minus_two_b * gz[(iz + (jz + 1) * j_offset) * NROOTS + i_root]; + const double gx_i_0_j_minus_1 = jx * (jx >= 1 ? gx[(ix + (jx - 1) * j_offset) * NROOTS + i_root] : 0); + const double gy_i_0_j_minus_1 = jy * (jy >= 1 ? gy[(iy + (jy - 1) * j_offset) * NROOTS + i_root] : 0); + const double gz_i_0_j_minus_1 = jz * (jz >= 1 ? 
gz[(iz + (jz - 1) * j_offset) * NROOTS + i_root] : 0); + + d2eri_dAxdBx += (gx_i_minus_1_j_minus_1 + gx_i_minus_1_j_1 + gx_i_1_j_minus_1 + gx_i_1_j_1) * gy_0 * gz_0; + d2eri_dAxdBy += (gx_i_minus_1_j_0 + gx_i_1_j_0) * (gy_i_0_j_minus_1 + gy_i_0_j_1) * gz_0; + d2eri_dAxdBz += (gx_i_minus_1_j_0 + gx_i_1_j_0) * gy_0 * (gz_i_0_j_minus_1 + gz_i_0_j_1); + d2eri_dAydBx += (gx_i_0_j_minus_1 + gx_i_0_j_1) * (gy_i_minus_1_j_0 + gy_i_1_j_0) * gz_0; + d2eri_dAydBy += gx_0 * (gy_i_minus_1_j_minus_1 + gy_i_minus_1_j_1 + gy_i_1_j_minus_1 + gy_i_1_j_1) * gz_0; + d2eri_dAydBz += gx_0 * (gy_i_minus_1_j_0 + gy_i_1_j_0) * (gz_i_0_j_minus_1 + gz_i_0_j_1); + d2eri_dAzdBx += (gx_i_0_j_minus_1 + gx_i_0_j_1) * gy_0 * (gz_i_minus_1_j_0 + gz_i_1_j_0); + d2eri_dAzdBy += gx_0 * (gy_i_0_j_minus_1 + gy_i_0_j_1) * (gz_i_minus_1_j_0 + gz_i_1_j_0); + d2eri_dAzdBz += gx_0 * gy_0 * (gz_i_minus_1_j_minus_1 + gz_i_minus_1_j_1 + gz_i_1_j_minus_1 + gz_i_1_j_1); + } + local_output[i + j * n_density_elements_i + 0 * n_density_elements_ij] += d2eri_dAxdBx * prefactor; + local_output[i + j * n_density_elements_i + 1 * n_density_elements_ij] += d2eri_dAxdBy * prefactor; + local_output[i + j * n_density_elements_i + 2 * n_density_elements_ij] += d2eri_dAxdBz * prefactor; + local_output[i + j * n_density_elements_i + 3 * n_density_elements_ij] += d2eri_dAydBx * prefactor; + local_output[i + j * n_density_elements_i + 4 * n_density_elements_ij] += d2eri_dAydBy * prefactor; + local_output[i + j * n_density_elements_i + 5 * n_density_elements_ij] += d2eri_dAydBz * prefactor; + local_output[i + j * n_density_elements_i + 6 * n_density_elements_ij] += d2eri_dAzdBx * prefactor; + local_output[i + j * n_density_elements_i + 7 * n_density_elements_ij] += d2eri_dAzdBy * prefactor; + local_output[i + j * n_density_elements_i + 8 * n_density_elements_ij] += d2eri_dAzdBz * prefactor; + } + } +}
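(Illustration, not part of the patch.) The mixed bra-ket block above follows from the product rule: each of d/dA and d/dB expands a 1D factor into two terms (lower the power, or raise it with -2a / -2b), so d2/dAdB produces the four g-combinations accumulated per direction. A toy NumPy check of that four-term expansion:

```python
import numpy as np

def g(i, x, A, a):
    # 1D Cartesian Gaussian factor centered at A
    return (x - A)**i * np.exp(-a*(x - A)**2)

i, j = 2, 1
x, A, B, a, b = 0.3, -0.1, 0.4, 1.2, 0.8
m2a, m2b = -2.0*a, -2.0*b
d2 = (i*j     * g(i-1, x, A, a) * g(j-1, x, B, b)    # (i-1, j-1) term
    + i*m2b   * g(i-1, x, A, a) * g(j+1, x, B, b)    # (i-1, j+1) term
    + j*m2a   * g(i+1, x, A, a) * g(j-1, x, B, b)    # (i+1, j-1) term
    + m2a*m2b * g(i+1, x, A, a) * g(j+1, x, B, b))   # (i+1, j+1) term
h = 1e-4
fd = (g(i, x, A+h, a)*g(j, x, B+h, b) - g(i, x, A+h, a)*g(j, x, B-h, b)
    - g(i, x, A-h, a)*g(j, x, B+h, b) + g(i, x, A-h, a)*g(j, x, B-h, b)) / (4*h*h)
assert abs(d2 - fd) < 1e-4
```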
+ +template <int NROOTS, int GSIZE_INT3C_1E> +__global__ +static void GINTfill_int3c1e_ipvip1_charge_contracted_kernel_general(double* output, const BasisProdOffsets offsets, const int i_l, const int j_l, const int nprim_ij, + const int stride_j, const int stride_ij, const int ao_offsets_i, const int ao_offsets_j, + const double omega, const double* grid_points, const double* charge_exponents) +{ + const int ntasks_ij = offsets.ntasks_ij; + const int ngrids = offsets.ntasks_kl; + const int task_ij = blockIdx.x * blockDim.x + threadIdx.x; + if (task_ij >= ntasks_ij) { + return; + } + + const int bas_ij = offsets.bas_ij + task_ij; + const int prim_ij = offsets.primitive_ij + task_ij * nprim_ij; + const int* bas_pair2bra = c_bpcache.bas_pair2bra; + const int* bas_pair2ket = c_bpcache.bas_pair2ket; + const int ish = bas_pair2bra[bas_ij]; + const int jsh = bas_pair2ket[bas_ij]; + const double* __restrict__ a_exponents = c_bpcache.a1; + const double* __restrict__ b_exponents = c_bpcache.a2; + + constexpr int l_sum_max = (NROOTS - 1) * 2 + 1; + constexpr int l_i_max_density_elements = (l_sum_max + 1) / 2; + constexpr int l_j_max_density_elements = l_sum_max - l_i_max_density_elements; + double output_cache[(l_i_max_density_elements + 1) * (l_i_max_density_elements + 2) / 2 + * (l_j_max_density_elements + 1) * (l_j_max_density_elements + 2) / 2 + * 9] { 0.0 }; + + for (int task_grid = blockIdx.y * blockDim.y + threadIdx.y; task_grid < ngrids; task_grid += gridDim.y * blockDim.y) { + const double* grid_point = grid_points + task_grid * 4; + const double charge = grid_point[3]; + const double charge_exponent = (charge_exponents != NULL) ? charge_exponents[task_grid] : 0.0; + + double g[GSIZE_INT3C_1E]; + + for (int ij = prim_ij; ij < prim_ij+nprim_ij; ++ij) { + GINT_g1e<NROOTS>(g, grid_point, ish, jsh, ij, i_l + 1, j_l + 1, charge_exponent, omega); + const double minus_two_a = -2.0 * a_exponents[ij]; + const double minus_two_b = -2.0 * b_exponents[ij]; + GINTwrite_int3c1e_ipvip1_charge_contracted<NROOTS>(g, output_cache, minus_two_a, minus_two_b, charge, i_l, j_l); + } + } + + const int* ao_loc = c_bpcache.ao_loc; + + const int i0 = ao_loc[ish] - ao_offsets_i; + const int j0 = ao_loc[jsh] - ao_offsets_j; + const int n_density_elements_i = (i_l + 1) * (i_l + 2) / 2; + const int n_density_elements_j = (j_l + 1) * (j_l + 2) / 2; + const int n_density_elements_ij = n_density_elements_i * n_density_elements_j; + for (int j = 0; j < n_density_elements_j; j++) { + for (int i = 0; i < n_density_elements_i; i++) { + const double d2eri_dAxdBx = output_cache[i + j * n_density_elements_i + 0 * n_density_elements_ij]; + const double d2eri_dAxdBy = output_cache[i + j * n_density_elements_i + 1 * n_density_elements_ij]; + const double d2eri_dAxdBz = output_cache[i + j * n_density_elements_i + 2 * n_density_elements_ij]; + const double d2eri_dAydBx = output_cache[i + j * n_density_elements_i + 3 * n_density_elements_ij]; + const double d2eri_dAydBy = output_cache[i + j * n_density_elements_i + 4 * n_density_elements_ij]; + const double d2eri_dAydBz = output_cache[i + j * n_density_elements_i + 5 * n_density_elements_ij]; + const double d2eri_dAzdBx = output_cache[i + j * n_density_elements_i + 6 * n_density_elements_ij]; + const double d2eri_dAzdBy = output_cache[i + j * n_density_elements_i + 7 * n_density_elements_ij]; + const double d2eri_dAzdBz = output_cache[i + j * n_density_elements_i + 8 * n_density_elements_ij]; + atomicAdd(output + ((i + i0) + (j + j0) * stride_j + 0 * stride_ij), d2eri_dAxdBx); + atomicAdd(output + ((i + i0) + (j + j0) * stride_j + 1 * stride_ij), d2eri_dAxdBy); + atomicAdd(output + ((i + i0) + (j + j0) * stride_j + 2 * stride_ij), d2eri_dAxdBz); + atomicAdd(output + ((i + i0) + (j + j0) * stride_j + 3 * stride_ij), d2eri_dAydBx); + atomicAdd(output + ((i + i0) + (j + j0) * stride_j + 4 * stride_ij), d2eri_dAydBy); + atomicAdd(output + ((i + i0) + (j + j0) * stride_j + 5 * stride_ij), d2eri_dAydBz); + atomicAdd(output + ((i + i0) + (j + j0) * stride_j + 6 * stride_ij), d2eri_dAzdBx); + atomicAdd(output + ((i + i0) + (j + j0) * stride_j + 7 * stride_ij), d2eri_dAzdBy); + atomicAdd(output + ((i + i0) + (j + j0) * stride_j + 8 * stride_ij), d2eri_dAzdBz); + } + } +} + +template <int NROOTS> +__device__ +static void GINTwrite_int3c1e_ip1ip2_charge_contracted(const double* g, double* local_output, const double minus_two_a, const double* u2, const double* AC, const double prefactor, const int i_l, const int j_l) +{ + const int *idx = c_idx; + const int *idy = c_idx + TOT_NF; + const int *idz = c_idx + TOT_NF * 2; + + const int g_size = NROOTS * (i_l + 2 + 1) * (j_l + 1); + const double* __restrict__ gx = g; + const double* __restrict__ gy = g + g_size; + const double* __restrict__ gz = g + g_size * 2; + + const double ACx = AC[0]; + const double ACy = AC[1]; + const double ACz = AC[2]; + + const int n_density_elements_i = (i_l + 1) * (i_l + 2) / 2; + const int n_density_elements_j = (j_l + 1) * (j_l + 2) / 2; + const int n_density_elements_ij = n_density_elements_i * n_density_elements_j; + for (int j = 0; j < n_density_elements_j; j++) { + for (int i = 0; i < n_density_elements_i; i++) { + const int loc_j = c_l_locs[j_l] + j; + const int loc_i =
c_l_locs[i_l] + i; + const int ix = idx[loc_i]; + const int iy = idy[loc_i]; + const int iz = idz[loc_i]; + const int jx = idx[loc_j]; + const int jy = idy[loc_j]; + const int jz = idz[loc_j]; + const int gx_offset = ix + jx * (i_l + 2 + 1); + const int gy_offset = iy + jy * (i_l + 2 + 1); + const int gz_offset = iz + jz * (i_l + 2 + 1); + + double d2eri_dAxdCx = 0; + double d2eri_dAxdCy = 0; + double d2eri_dAxdCz = 0; + double d2eri_dAydCx = 0; + double d2eri_dAydCy = 0; + double d2eri_dAydCz = 0; + double d2eri_dAzdCx = 0; + double d2eri_dAzdCy = 0; + double d2eri_dAzdCz = 0; +#pragma unroll + for (int i_root = 0; i_root < NROOTS; i_root++) { + const double gx_minus_1 = (ix >= 1 ? gx[(gx_offset - 1) * NROOTS + i_root] : 0); + const double gy_minus_1 = (iy >= 1 ? gy[(gy_offset - 1) * NROOTS + i_root] : 0); + const double gz_minus_1 = (iz >= 1 ? gz[(gz_offset - 1) * NROOTS + i_root] : 0); + const double gx_0 = gx[gx_offset * NROOTS + i_root]; + const double gy_0 = gy[gy_offset * NROOTS + i_root]; + const double gz_0 = gz[gz_offset * NROOTS + i_root]; + const double gx_1 = gx[(gx_offset + 1) * NROOTS + i_root]; + const double gy_1 = gy[(gy_offset + 1) * NROOTS + i_root]; + const double gz_1 = gz[(gz_offset + 1) * NROOTS + i_root]; + const double gx_2 = gx[(gx_offset + 2) * NROOTS + i_root]; + const double gy_2 = gy[(gy_offset + 2) * NROOTS + i_root]; + const double gz_2 = gz[(gz_offset + 2) * NROOTS + i_root]; + + const double two_u2 = 2.0 * u2[i_root]; + const double dgx_dAx = ix * gx_minus_1 + minus_two_a * gx_1; + const double dgy_dAy = iy * gy_minus_1 + minus_two_a * gy_1; + const double dgz_dAz = iz * gz_minus_1 + minus_two_a * gz_1; + const double dgx_dCx = two_u2 * (ACx * gx_0 + gx_1); + const double dgy_dCy = two_u2 * (ACy * gy_0 + gy_1); + const double dgz_dCz = two_u2 * (ACz * gz_0 + gz_1); + const double d2gx_dAxdCx = two_u2 * (ix * ACx * gx_minus_1 + ix * gx_0 + minus_two_a * ACx * gx_1 + minus_two_a * gx_2); + const double d2gy_dAydCy = two_u2 * (iy * ACy * gy_minus_1 + iy * gy_0 + minus_two_a * ACy * gy_1 + minus_two_a * gy_2); + const double d2gz_dAzdCz = two_u2 * (iz * ACz * gz_minus_1 + iz * gz_0 + minus_two_a * ACz * gz_1 + minus_two_a * gz_2); + + d2eri_dAxdCx += - d2gx_dAxdCx * gy_0 * gz_0; + d2eri_dAxdCy += - dgx_dAx * dgy_dCy * gz_0; + d2eri_dAxdCz += - dgx_dAx * gy_0 * dgz_dCz; + d2eri_dAydCx += - dgx_dCx * dgy_dAy * gz_0; + d2eri_dAydCy += - gx_0 * d2gy_dAydCy * gz_0; + d2eri_dAydCz += - gx_0 * dgy_dAy * dgz_dCz; + d2eri_dAzdCx += - dgx_dCx * gy_0 * dgz_dAz; + d2eri_dAzdCy += - gx_0 * dgy_dCy * dgz_dAz; + d2eri_dAzdCz += - gx_0 * gy_0 * d2gz_dAzdCz; + } + local_output[i + j * n_density_elements_i + 0 * n_density_elements_ij] += d2eri_dAxdCx * prefactor; + local_output[i + j * n_density_elements_i + 1 * n_density_elements_ij] += d2eri_dAxdCy * prefactor; + local_output[i + j * n_density_elements_i + 2 * n_density_elements_ij] += d2eri_dAxdCz * prefactor; + local_output[i + j * n_density_elements_i + 3 * n_density_elements_ij] += d2eri_dAydCx * prefactor; + local_output[i + j * n_density_elements_i + 4 * n_density_elements_ij] += d2eri_dAydCy * prefactor; + local_output[i + j * n_density_elements_i + 5 * n_density_elements_ij] += d2eri_dAydCz * prefactor; + local_output[i + j * n_density_elements_i + 6 * n_density_elements_ij] += d2eri_dAzdCx * prefactor; + local_output[i + j * n_density_elements_i + 7 * n_density_elements_ij] += d2eri_dAzdCy * prefactor; + local_output[i + j * n_density_elements_i + 8 * n_density_elements_ij] += d2eri_dAzdCz * prefactor; + } + } +} + 
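(Illustration, not part of the patch.) Because the 3-center integral depends only on coordinate differences, the centers satisfy d/dA + d/dB + d/dC = 0, so the A-C block produced by the ip1ip2 kernel below can be cross-checked against the A-A and A-B blocks. A 1D toy, with a Gaussian "charge" at C standing in for the Coulomb kernel and dense-grid quadrature standing in for the analytic integral (both are assumptions of the sketch):

```python
import numpy as np

x, dx = np.linspace(-10.0, 10.0, 20001, retstep=True)

def S(A, B, C, i=1, j=2, a=0.9, b=1.1, c=0.7):
    # toy 1D analogue of the 3-center integral: bra * ket * charge distribution at C
    w = (x - A)**i * np.exp(-a*(x - A)**2) \
        * (x - B)**j * np.exp(-b*(x - B)**2) * np.exp(-c*(x - C)**2)
    return w.sum() * dx

A, B, C, h = 0.1, -0.2, 0.3, 1e-4
d_AC = (S(A+h, B, C+h) - S(A+h, B, C-h) - S(A-h, B, C+h) + S(A-h, B, C-h)) / (4*h*h)
d_AA = (S(A+h, B, C) - 2*S(A, B, C) + S(A-h, B, C)) / (h*h)
d_AB = (S(A+h, B+h, C) - S(A+h, B-h, C) - S(A-h, B+h, C) + S(A-h, B-h, C)) / (4*h*h)
assert abs(d_AC + d_AA + d_AB) < 1e-4   # translational invariance: S_AC = -(S_AA + S_AB)
```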
+template <int NROOTS, int GSIZE_INT3C_1E> +__global__ +static void GINTfill_int3c1e_ip1ip2_charge_contracted_kernel_general(double* output, const BasisProdOffsets offsets, const int i_l, const int j_l, const int nprim_ij, + const int stride_j, const int stride_ij, const int ao_offsets_i, const int ao_offsets_j, + const double omega, const double* grid_points, const double* charge_exponents) +{ + const int ntasks_ij = offsets.ntasks_ij; + const int ngrids = offsets.ntasks_kl; + const int task_ij = blockIdx.x * blockDim.x + threadIdx.x; + if (task_ij >= ntasks_ij) { + return; + } + + const int bas_ij = offsets.bas_ij + task_ij; + const int prim_ij = offsets.primitive_ij + task_ij * nprim_ij; + const int* bas_pair2bra = c_bpcache.bas_pair2bra; + const int* bas_pair2ket = c_bpcache.bas_pair2ket; + const int ish = bas_pair2bra[bas_ij]; + const int jsh = bas_pair2ket[bas_ij]; + const double* __restrict__ a_exponents = c_bpcache.a1; + + const int nbas = c_bpcache.nbas; + const double* __restrict__ bas_x = c_bpcache.bas_coords; + const double* __restrict__ bas_y = bas_x + nbas; + const double* __restrict__ bas_z = bas_y + nbas; + const double Ax = bas_x[ish]; + const double Ay = bas_y[ish]; + const double Az = bas_z[ish]; + + constexpr int l_sum_max = (NROOTS - 1) * 2 + 1; + constexpr int l_i_max_density_elements = (l_sum_max + 1) / 2; + constexpr int l_j_max_density_elements = l_sum_max - l_i_max_density_elements; + double output_cache[(l_i_max_density_elements + 1) * (l_i_max_density_elements + 2) / 2 + * (l_j_max_density_elements + 1) * (l_j_max_density_elements + 2) / 2 + * 9] { 0.0 }; + + for (int task_grid = blockIdx.y * blockDim.y + threadIdx.y; task_grid < ngrids; task_grid += gridDim.y * blockDim.y) { + const double* grid_point = grid_points + task_grid * 4; + const double Cx = grid_point[0]; + const double Cy = grid_point[1]; + const double Cz = grid_point[2]; + const double charge = grid_point[3]; + const double charge_exponent = (charge_exponents != NULL) ? charge_exponents[task_grid] : 0.0; + + const double AC[3] { Ax - Cx, Ay - Cy, Az - Cz }; + + double g[GSIZE_INT3C_1E]; + double u2[NROOTS]; + + for (int ij = prim_ij; ij < prim_ij+nprim_ij; ++ij) { + GINT_g1e_save_u2<NROOTS>(g, u2, grid_point, ish, jsh, ij, i_l + 2, j_l, charge_exponent, omega); + const double minus_two_a = -2.0 * a_exponents[ij]; + GINTwrite_int3c1e_ip1ip2_charge_contracted<NROOTS>(g, output_cache, minus_two_a, u2, AC, charge, i_l, j_l); + } + } + + const int* ao_loc = c_bpcache.ao_loc; + + const int i0 = ao_loc[ish] - ao_offsets_i; + const int j0 = ao_loc[jsh] - ao_offsets_j; + const int n_density_elements_i = (i_l + 1) * (i_l + 2) / 2; + const int n_density_elements_j = (j_l + 1) * (j_l + 2) / 2; + const int n_density_elements_ij = n_density_elements_i * n_density_elements_j; + for (int j = 0; j < n_density_elements_j; j++) { + for (int i = 0; i < n_density_elements_i; i++) { + const double d2eri_dAxdCx = output_cache[i + j * n_density_elements_i + 0 * n_density_elements_ij]; + const double d2eri_dAxdCy = output_cache[i + j * n_density_elements_i + 1 * n_density_elements_ij]; + const double d2eri_dAxdCz = output_cache[i + j * n_density_elements_i + 2 * n_density_elements_ij]; + const double d2eri_dAydCx = output_cache[i + j * n_density_elements_i + 3 * n_density_elements_ij]; + const double d2eri_dAydCy = output_cache[i + j * n_density_elements_i + 4 * n_density_elements_ij]; + const double d2eri_dAydCz = output_cache[i + j * n_density_elements_i + 5 * n_density_elements_ij]; + const double d2eri_dAzdCx = output_cache[i + j * n_density_elements_i + 6 * n_density_elements_ij]; + const double d2eri_dAzdCy = output_cache[i + j * n_density_elements_i + 7 * n_density_elements_ij]; + const double d2eri_dAzdCz = output_cache[i + j * n_density_elements_i + 8 * n_density_elements_ij]; + atomicAdd(output + ((i + i0) + (j + j0) * stride_j + 0 * stride_ij), d2eri_dAxdCx); + atomicAdd(output + ((i + i0) + (j + j0) * stride_j + 1 * stride_ij), d2eri_dAxdCy); + atomicAdd(output + ((i + i0) + (j + j0) * stride_j + 2 * stride_ij), d2eri_dAxdCz); + atomicAdd(output + ((i + i0) + (j + j0) * stride_j + 3 * stride_ij), d2eri_dAydCx); + atomicAdd(output + ((i + i0) + (j + j0) * stride_j + 4 * stride_ij), d2eri_dAydCy); + atomicAdd(output + ((i + i0) + (j + j0) * stride_j + 5 * stride_ij), d2eri_dAydCz); + atomicAdd(output + ((i + i0) + (j + j0) * stride_j + 6 * stride_ij), d2eri_dAzdCx); + atomicAdd(output + ((i + i0) + (j + j0) * stride_j + 7 * stride_ij), d2eri_dAzdCy); + atomicAdd(output + ((i + i0) + (j + j0) * stride_j + 8 * stride_ij), d2eri_dAzdCz); + } + } +}
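Note that the ipip1 kernel earlier stores only the six unique components of the symmetric A-A block at flat offsets 0, 1, 2, 4, 5, 8 of a row-major 3x3 tile (the ipip2 grid-side kernel below does the same per grid point), while the non-symmetric A-B and A-C blocks need all nine. A host-side consumer would restore the lower triangle, e.g. (sketch with made-up values):

```python
import numpy as np

comps = np.array([1.0, 0.2, 0.3, 2.0, 0.4, 3.0])   # xx, xy, xz, yy, yz, zz (made up)
block = np.zeros(9)
block[[0, 1, 2, 4, 5, 8]] = comps                  # the flat offsets used by the atomicAdds
block = block.reshape(3, 3)
block = np.triu(block) + np.triu(block, 1).T       # restore the lower triangle
assert np.allclose(block, block.T)
```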
+ +template <int L_SUM> +__global__ +static void GINTfill_int3c1e_ipip2_density_contracted_kernel_general(double* output, const double* density, const HermiteDensityOffsets hermite_density_offsets, + const BasisProdOffsets offsets, const int nprim_ij, + const double omega, const double* grid_points, const double* charge_exponents) +{ + constexpr int NROOTS = (L_SUM + 2) / 2 + 1; + + const int ntasks_ij = offsets.ntasks_ij; + const int ngrids = offsets.ntasks_kl; + const int task_grid = blockIdx.y * blockDim.y + threadIdx.y; + if (task_grid >= ngrids) { + return; + } + + const double* grid_point = grid_points + task_grid * 3; + const double Cx = grid_point[0]; + const double Cy = grid_point[1]; + const double Cz = grid_point[2]; + const double charge_exponent = (charge_exponents != NULL) ?
charge_exponents[task_grid] : 0.0; + + double d2eri_dCxdCx_pair_sum = 0.0; + double d2eri_dCxdCy_pair_sum = 0.0; + double d2eri_dCxdCz_pair_sum = 0.0; + double d2eri_dCydCy_pair_sum = 0.0; + double d2eri_dCydCz_pair_sum = 0.0; + double d2eri_dCzdCz_pair_sum = 0.0; + for (int task_ij = blockIdx.x * blockDim.x + threadIdx.x; task_ij < ntasks_ij; task_ij += gridDim.x * blockDim.x) { + + const int bas_ij = offsets.bas_ij + task_ij; + const int prim_ij = offsets.primitive_ij + task_ij * nprim_ij; + const int* bas_pair2bra = c_bpcache.bas_pair2bra; + // const int* bas_pair2ket = c_bpcache.bas_pair2ket; + const int ish = bas_pair2bra[bas_ij]; + // const int jsh = bas_pair2ket[bas_ij]; + const int nbas = c_bpcache.nbas; + const double* __restrict__ bas_x = c_bpcache.bas_coords; + const double* __restrict__ bas_y = bas_x + nbas; + const double* __restrict__ bas_z = bas_y + nbas; + const double Ax = bas_x[ish]; + const double Ay = bas_y[ish]; + const double Az = bas_z[ish]; + + const double ACx = Ax - Cx; + const double ACy = Ay - Cy; + const double ACz = Az - Cz; + + double D_hermite[(L_SUM + 1) * (L_SUM + 2) * (L_SUM + 3) / 6]; +#pragma unroll + for (int i_t = 0; i_t < (L_SUM + 1) * (L_SUM + 2) * (L_SUM + 3) / 6; i_t++) { + D_hermite[i_t] = density[bas_ij - hermite_density_offsets.pair_offset_of_angular_pair + hermite_density_offsets.density_offset_of_angular_pair + i_t * hermite_density_offsets.n_pair_of_angular_pair]; + } + + double d2eri_dCxdCx = 0.0; + double d2eri_dCxdCy = 0.0; + double d2eri_dCxdCz = 0.0; + double d2eri_dCydCy = 0.0; + double d2eri_dCydCz = 0.0; + double d2eri_dCzdCz = 0.0; + for (int ij = prim_ij; ij < prim_ij+nprim_ij; ++ij) { + double g[NROOTS * (L_SUM + 2 + 1) * 3]; + double u2[NROOTS]; + GINT_g1e_without_hrr_save_u2(g, u2, Cx, Cy, Cz, ish, ij, charge_exponent, omega); + + const double* __restrict__ gx = g; + const double* __restrict__ gy = g + NROOTS * (L_SUM + 2 + 1); + const double* __restrict__ gz = g + NROOTS * (L_SUM + 2 + 1) * 2; + +#pragma unroll + for (int i_x = 0, i_t = 0; i_x <= L_SUM; i_x++) { +#pragma unroll + for (int i_y = 0; i_x + i_y <= L_SUM; i_y++) { +#pragma unroll + for (int i_z = 0; i_x + i_y + i_z <= L_SUM; i_z++, i_t++) { + double d2eri_dCxdCx_per_hermite = 0.0; + double d2eri_dCxdCy_per_hermite = 0.0; + double d2eri_dCxdCz_per_hermite = 0.0; + double d2eri_dCydCy_per_hermite = 0.0; + double d2eri_dCydCz_per_hermite = 0.0; + double d2eri_dCzdCz_per_hermite = 0.0; +#pragma unroll + for (int i_root = 0; i_root < NROOTS; i_root++) { + const double gx_0 = gx[i_root + NROOTS * i_x]; + const double gy_0 = gy[i_root + NROOTS * i_y]; + const double gz_0 = gz[i_root + NROOTS * i_z]; + const double gx_1 = gx[i_root + NROOTS * (i_x + 1)]; + const double gy_1 = gy[i_root + NROOTS * (i_y + 1)]; + const double gz_1 = gz[i_root + NROOTS * (i_z + 1)]; + const double gx_2 = gx[i_root + NROOTS * (i_x + 2)]; + const double gy_2 = gy[i_root + NROOTS * (i_y + 2)]; + const double gz_2 = gz[i_root + NROOTS * (i_z + 2)]; + const double two_u2 = 2.0 * u2[i_root]; + const double dgx_dCx = two_u2 * (gx_1 + ACx * gx_0); + const double dgy_dCy = two_u2 * (gy_1 + ACy * gy_0); + const double dgz_dCz = two_u2 * (gz_1 + ACz * gz_0); + const double d2gx_dCx2 = two_u2 * (-gx_0 + two_u2 * (gx_2 + ACx * gx_1 * 2 + ACx * ACx * gx_0)); + const double d2gy_dCy2 = two_u2 * (-gy_0 + two_u2 * (gy_2 + ACy * gy_1 * 2 + ACy * ACy * gy_0)); + const double d2gz_dCz2 = two_u2 * (-gz_0 + two_u2 * (gz_2 + ACz * gz_1 * 2 + ACz * ACz * gz_0)); + d2eri_dCxdCx_per_hermite += d2gx_dCx2 * gy_0 * gz_0; + 
d2eri_dCxdCy_per_hermite += dgx_dCx * dgy_dCy * gz_0; + d2eri_dCxdCz_per_hermite += dgx_dCx * gy_0 * dgz_dCz; + d2eri_dCydCy_per_hermite += gx_0 * d2gy_dCy2 * gz_0; + d2eri_dCydCz_per_hermite += gx_0 * dgy_dCy * dgz_dCz; + d2eri_dCzdCz_per_hermite += gx_0 * gy_0 * d2gz_dCz2; + } + const double D_t = D_hermite[i_t]; + d2eri_dCxdCx += d2eri_dCxdCx_per_hermite * D_t; + d2eri_dCxdCy += d2eri_dCxdCy_per_hermite * D_t; + d2eri_dCxdCz += d2eri_dCxdCz_per_hermite * D_t; + d2eri_dCydCy += d2eri_dCydCy_per_hermite * D_t; + d2eri_dCydCz += d2eri_dCydCz_per_hermite * D_t; + d2eri_dCzdCz += d2eri_dCzdCz_per_hermite * D_t; + } + } + } + } + d2eri_dCxdCx_pair_sum += d2eri_dCxdCx; + d2eri_dCxdCy_pair_sum += d2eri_dCxdCy; + d2eri_dCxdCz_pair_sum += d2eri_dCxdCz; + d2eri_dCydCy_pair_sum += d2eri_dCydCy; + d2eri_dCydCz_pair_sum += d2eri_dCydCz; + d2eri_dCzdCz_pair_sum += d2eri_dCzdCz; + } + atomicAdd(output + task_grid + ngrids * 0, d2eri_dCxdCx_pair_sum); + atomicAdd(output + task_grid + ngrids * 1, d2eri_dCxdCy_pair_sum); + atomicAdd(output + task_grid + ngrids * 2, d2eri_dCxdCz_pair_sum); + atomicAdd(output + task_grid + ngrids * 4, d2eri_dCydCy_pair_sum); + atomicAdd(output + task_grid + ngrids * 5, d2eri_dCydCz_pair_sum); + atomicAdd(output + task_grid + ngrids * 8, d2eri_dCzdCz_pair_sum); +} diff --git a/gpu4pyscf/lib/gint/nr_fill_ao_int3c1e_ipip.cu b/gpu4pyscf/lib/gint/nr_fill_ao_int3c1e_ipip.cu new file mode 100644 index 00000000..4f3a3dee --- /dev/null +++ b/gpu4pyscf/lib/gint/nr_fill_ao_int3c1e_ipip.cu @@ -0,0 +1,361 @@ +/* + * Copyright 2021-2024 The PySCF Developers. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <assert.h> +#include <cuda_runtime.h> + +#include "gint.h" +#include "gint1e.h" +#include "cuda_alloc.cuh" +#include "cint2e.cuh" + +#include "rys_roots.cu" +#include "g1e.cu" +#include "g3c1e_ipip.cu" + +static int GINTfill_int3c1e_ipip1_charge_contracted_tasks(double* output, const BasisProdOffsets offsets, const int i_l, const int j_l, const int nprim_ij, + const int stride_j, const int stride_ij, const int ao_offsets_i, const int ao_offsets_j, + const double omega, const double* grid_points, const double* charge_exponents, + const int n_charge_sum_per_thread, const cudaStream_t stream) +{ + const int ntasks_ij = offsets.ntasks_ij; + const int ngrids = (offsets.ntasks_kl + n_charge_sum_per_thread - 1) / n_charge_sum_per_thread; + + const dim3 threads(THREADSX, THREADSY); + const dim3 blocks((ntasks_ij+THREADSX-1)/THREADSX, (ngrids+THREADSY-1)/THREADSY); + const int nrys_roots = (i_l + j_l + 2) / 2 + 1; + switch (nrys_roots) { + case 2: GINTfill_int3c1e_ipip1_charge_contracted_kernel_general<2, GSIZE5_INT3C_1E> <<<blocks, threads, 0, stream>>>(output, offsets, i_l, j_l, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, omega, grid_points, charge_exponents); break; + case 3: GINTfill_int3c1e_ipip1_charge_contracted_kernel_general<3, GSIZE6_INT3C_1E> <<<blocks, threads, 0, stream>>>(output, offsets, i_l, j_l, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, omega, grid_points, charge_exponents); break; + case 4: GINTfill_int3c1e_ipip1_charge_contracted_kernel_general<4, GSIZE4_INT3C_1E> <<<blocks, threads, 0, stream>>>(output, offsets, i_l, j_l, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, omega, grid_points, charge_exponents); break; + case 5: GINTfill_int3c1e_ipip1_charge_contracted_kernel_general<5, GSIZE5_INT3C_1E> <<<blocks, threads, 0, stream>>>(output, offsets, i_l, j_l, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, omega, grid_points, charge_exponents); break; + case 6: GINTfill_int3c1e_ipip1_charge_contracted_kernel_general<6, GSIZE6_INT3C_1E> <<<blocks, threads, 0, stream>>>(output, offsets, i_l, j_l, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, omega, grid_points, charge_exponents); break; + default: + fprintf(stderr, "nrys_roots = %d out of range\n", nrys_roots); + return 1; + } + + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) { + fprintf(stderr, "CUDA Error in %s: %s\n", __func__, cudaGetErrorString(err)); + return 1; + } + return 0; +}
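Each dispatcher above maps the run-time root count onto a compiled template instance and launches a 2D grid, x over shell-pair tasks and y over grid points, with every thread serially accumulating n_charge_sum_per_thread grid charges before touching the atomics. A sketch of the same launch arithmetic (the THREADSX/THREADSY values below are assumptions; the real ones come from gint.h):

```python
import math

THREADSX, THREADSY = 16, 16   # assumed values for illustration
ntasks_ij, ntasks_kl, n_charge_sum_per_thread = 5000, 100000, 10

# each thread sums n_charge_sum_per_thread grid charges, shrinking the y-dimension
ngrids = (ntasks_kl + n_charge_sum_per_thread - 1) // n_charge_sum_per_thread
blocks = (math.ceil(ntasks_ij / THREADSX), math.ceil(ngrids / THREADSY))
print(blocks)   # -> (313, 625)
```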
+ +static int GINTfill_int3c1e_ipvip1_charge_contracted_tasks(double* output, const BasisProdOffsets offsets, const int i_l, const int j_l, const int nprim_ij, + const int stride_j, const int stride_ij, const int ao_offsets_i, const int ao_offsets_j, + const double omega, const double* grid_points, const double* charge_exponents, + const int n_charge_sum_per_thread, const cudaStream_t stream) +{ + const int ntasks_ij = offsets.ntasks_ij; + const int ngrids = (offsets.ntasks_kl + n_charge_sum_per_thread - 1) / n_charge_sum_per_thread; + + const dim3 threads(THREADSX, THREADSY); + const dim3 blocks((ntasks_ij+THREADSX-1)/THREADSX, (ngrids+THREADSY-1)/THREADSY); + const int nrys_roots = (i_l + j_l + 2) / 2 + 1; + switch (nrys_roots) { + case 2: GINTfill_int3c1e_ipvip1_charge_contracted_kernel_general<2, GSIZE5_INT3C_1E> <<<blocks, threads, 0, stream>>>(output, offsets, i_l, j_l, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, omega, grid_points, charge_exponents); break; + case 3: GINTfill_int3c1e_ipvip1_charge_contracted_kernel_general<3, GSIZE6_INT3C_1E> <<<blocks, threads, 0, stream>>>(output, offsets, i_l, j_l, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, omega, grid_points, charge_exponents); break; + case 4: GINTfill_int3c1e_ipvip1_charge_contracted_kernel_general<4, GSIZE4_INT3C_1E> <<<blocks, threads, 0, stream>>>(output, offsets, i_l, j_l, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, omega, grid_points, charge_exponents); break; + case 5: GINTfill_int3c1e_ipvip1_charge_contracted_kernel_general<5, GSIZE5_INT3C_1E> <<<blocks, threads, 0, stream>>>(output, offsets, i_l, j_l, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, omega, grid_points, charge_exponents); break; + case 6: GINTfill_int3c1e_ipvip1_charge_contracted_kernel_general<6, GSIZE6_INT3C_1E> <<<blocks, threads, 0, stream>>>(output, offsets, i_l, j_l, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, omega, grid_points, charge_exponents); break; + default: + fprintf(stderr, "nrys_roots = %d out of range\n", nrys_roots); + return 1; + } + + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) { + fprintf(stderr, "CUDA Error in %s: %s\n", __func__, cudaGetErrorString(err)); + return 1; + } + return 0; +} + +static int GINTfill_int3c1e_ip1ip2_charge_contracted_tasks(double* output, const BasisProdOffsets offsets, const int i_l, const int j_l, const int nprim_ij, + const int stride_j, const int stride_ij, const int ao_offsets_i, const int ao_offsets_j, + const double omega, const double* grid_points, const double* charge_exponents, + const int n_charge_sum_per_thread, const cudaStream_t stream) +{ + const int ntasks_ij = offsets.ntasks_ij; + const int ngrids = (offsets.ntasks_kl + n_charge_sum_per_thread - 1) / n_charge_sum_per_thread; + + const dim3 threads(THREADSX, THREADSY); + const dim3 blocks((ntasks_ij+THREADSX-1)/THREADSX, (ngrids+THREADSY-1)/THREADSY); + const int nrys_roots = (i_l + j_l + 2) / 2 + 1; + switch (nrys_roots) { + case 2: GINTfill_int3c1e_ip1ip2_charge_contracted_kernel_general<2, GSIZE5_INT3C_1E> <<<blocks, threads, 0, stream>>>(output, offsets, i_l, j_l, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, omega, grid_points, charge_exponents); break; + case 3: GINTfill_int3c1e_ip1ip2_charge_contracted_kernel_general<3, GSIZE6_INT3C_1E> <<<blocks, threads, 0, stream>>>(output, offsets, i_l, j_l, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, omega, grid_points, charge_exponents); break; + case 4: GINTfill_int3c1e_ip1ip2_charge_contracted_kernel_general<4, GSIZE4_INT3C_1E> <<<blocks, threads, 0, stream>>>(output, offsets, i_l, j_l, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, omega, grid_points, charge_exponents); break; + case 5: GINTfill_int3c1e_ip1ip2_charge_contracted_kernel_general<5, GSIZE5_INT3C_1E> <<<blocks, threads, 0, stream>>>(output, offsets, i_l, j_l, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, omega, grid_points, charge_exponents); break; + case 6: GINTfill_int3c1e_ip1ip2_charge_contracted_kernel_general<6, GSIZE6_INT3C_1E> <<<blocks, threads, 0, stream>>>(output, offsets, i_l, j_l, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, omega, grid_points, charge_exponents); break; + default: + fprintf(stderr, "nrys_roots = %d out of range\n", nrys_roots); + return 1; + } + + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) { + fprintf(stderr, "CUDA Error in %s: %s\n", __func__, cudaGetErrorString(err)); + return 1; + } + return 0; +}
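The two contraction modes differ only in which index is folded in on the GPU: the charge-contracted entry points return an AO-pair tensor summed over grid charges, while the density-contracted entry point below returns one value per grid point summed over shell pairs. Schematically, with a dense stand-in array (hypothetical shapes, for illustration only):

```python
import numpy as np

rng = np.random.default_rng(0)
nao, ngrids = 4, 7
eri = rng.random((nao, nao, ngrids))     # dense stand-in for one derivative component
q = rng.random(ngrids)                   # grid charges
D = rng.random((nao, nao))               # AO density matrix
v_ao = np.einsum('ijg,g->ij', eri, q)    # "charge contracted": (nao, nao) per component
v_grid = np.einsum('ijg,ij->g', eri, D)  # "density contracted": (ngrids,) per component
# both routes contract to the same scalar once the remaining index is folded in
assert np.isclose(v_ao.ravel() @ D.ravel(), v_grid @ q)
```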
+ +static int GINTfill_int3c1e_ipip2_density_contracted_tasks(double* output, const double* density, const HermiteDensityOffsets hermite_density_offsets, + const BasisProdOffsets offsets, const int i_l, const int j_l, const int nprim_ij, + const double omega, const double* grid_points, const double* charge_exponents, + const int n_pair_sum_per_thread, const cudaStream_t stream) +{ + const int ntasks_ij = (offsets.ntasks_ij + n_pair_sum_per_thread - 1) / n_pair_sum_per_thread; + const int ngrids = offsets.ntasks_kl; + + const dim3 threads(THREADSX, THREADSY); + const dim3 blocks((ntasks_ij+THREADSX-1)/THREADSX, (ngrids+THREADSY-1)/THREADSY); + switch (i_l + j_l) { + case 0: GINTfill_int3c1e_ipip2_density_contracted_kernel_general< 0> <<<blocks, threads, 0, stream>>>(output, density, hermite_density_offsets, offsets, nprim_ij, omega, grid_points, charge_exponents); break; + case 1: GINTfill_int3c1e_ipip2_density_contracted_kernel_general< 1> <<<blocks, threads, 0, stream>>>(output, density, hermite_density_offsets, offsets, nprim_ij, omega, grid_points, charge_exponents); break; + case 2: GINTfill_int3c1e_ipip2_density_contracted_kernel_general< 2> <<<blocks, threads, 0, stream>>>(output, density, hermite_density_offsets, offsets, nprim_ij, omega, grid_points, charge_exponents); break; + case 3: GINTfill_int3c1e_ipip2_density_contracted_kernel_general< 3> <<<blocks, threads, 0, stream>>>(output, density, hermite_density_offsets, offsets, nprim_ij, omega, grid_points, charge_exponents); break; + case 4: GINTfill_int3c1e_ipip2_density_contracted_kernel_general< 4> <<<blocks, threads, 0, stream>>>(output, density, hermite_density_offsets, offsets, nprim_ij, omega, grid_points, charge_exponents); break; + case 5: GINTfill_int3c1e_ipip2_density_contracted_kernel_general< 5> <<<blocks, threads, 0, stream>>>(output, density, hermite_density_offsets, offsets, nprim_ij, omega, grid_points, charge_exponents); break; + case 6: GINTfill_int3c1e_ipip2_density_contracted_kernel_general< 6> <<<blocks, threads, 0, stream>>>(output, density, hermite_density_offsets, offsets, nprim_ij, omega, grid_points, charge_exponents); break; + case 7: GINTfill_int3c1e_ipip2_density_contracted_kernel_general< 7> <<<blocks, threads, 0, stream>>>(output, density, hermite_density_offsets, offsets, nprim_ij, omega, grid_points, charge_exponents); break; + case 8: GINTfill_int3c1e_ipip2_density_contracted_kernel_general< 8> <<<blocks, threads, 0, stream>>>(output, density, hermite_density_offsets, offsets, nprim_ij, omega, grid_points, charge_exponents); break; + // Up to g + g = 8 now + default: + fprintf(stderr, "i_l + j_l = %d out of range\n", i_l + j_l); + return 1; + } + + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) { + fprintf(stderr, "CUDA Error in %s: %s\n", __func__, cudaGetErrorString(err)); + return 1; + } + return 0; +} + +extern "C" { +int GINTfill_int3c1e_ipip1_charge_contracted(const cudaStream_t stream, const BasisProdCache* bpcache, + const double* grid_points, const double* charge_exponents, const int ngrids, + double* integral_charge_contracted, + const int* strides, const int* ao_offsets, + const int* bins_locs_ij, const int nbins, + const int cp_ij_id, const double omega, const int n_charge_sum_per_thread) +{ + const ContractionProdType *cp_ij = bpcache->cptype + cp_ij_id; + const int i_l = cp_ij->l_bra; + const int j_l = cp_ij->l_ket; + const int nrys_roots = (i_l + j_l + 2) / 2 + 1; + const int nprim_ij = cp_ij->nprim_12; + + if (nrys_roots > MAX_NROOTS_INT3C_1E + 2) { + fprintf(stderr, "nrys_roots = %d too high\n", nrys_roots); + return 2; + } + + checkCudaErrors(cudaMemcpyToSymbol(c_bpcache, bpcache, sizeof(BasisProdCache))); + + const int* bas_pairs_locs = bpcache->bas_pairs_locs; + const int* primitive_pairs_locs = bpcache->primitive_pairs_locs; + for (int ij_bin = 0; ij_bin < nbins; ij_bin++) { + const int bas_ij0 = bins_locs_ij[ij_bin]; + const int bas_ij1 = bins_locs_ij[ij_bin + 1]; + const int ntasks_ij = bas_ij1 - bas_ij0; + if (ntasks_ij <= 0) { + continue; + } + + BasisProdOffsets offsets; + offsets.ntasks_ij = ntasks_ij; + offsets.ntasks_kl = ngrids; + offsets.bas_ij = bas_pairs_locs[cp_ij_id] + bas_ij0; + offsets.bas_kl = -1; + offsets.primitive_ij = primitive_pairs_locs[cp_ij_id] + bas_ij0
* nprim_ij; + offsets.primitive_kl = -1; + + const int err = GINTfill_int3c1e_ipip1_charge_contracted_tasks(integral_charge_contracted, offsets, i_l, j_l, nprim_ij, + strides[0], strides[1], ao_offsets[0], ao_offsets[1], + omega, grid_points, charge_exponents, n_charge_sum_per_thread, stream); + + if (err != 0) { + return err; + } + } + + return 0; +} + +int GINTfill_int3c1e_ipvip1_charge_contracted(const cudaStream_t stream, const BasisProdCache* bpcache, + const double* grid_points, const double* charge_exponents, const int ngrids, + double* integral_charge_contracted, + const int* strides, const int* ao_offsets, + const int* bins_locs_ij, const int nbins, + const int cp_ij_id, const double omega, const int n_charge_sum_per_thread) +{ + const ContractionProdType *cp_ij = bpcache->cptype + cp_ij_id; + const int i_l = cp_ij->l_bra; + const int j_l = cp_ij->l_ket; + const int nrys_roots = (i_l + j_l + 2) / 2 + 1; + const int nprim_ij = cp_ij->nprim_12; + + if (nrys_roots > MAX_NROOTS_INT3C_1E + 2) { + fprintf(stderr, "nrys_roots = %d too high\n", nrys_roots); + return 2; + } + + checkCudaErrors(cudaMemcpyToSymbol(c_bpcache, bpcache, sizeof(BasisProdCache))); + + const int* bas_pairs_locs = bpcache->bas_pairs_locs; + const int* primitive_pairs_locs = bpcache->primitive_pairs_locs; + for (int ij_bin = 0; ij_bin < nbins; ij_bin++) { + const int bas_ij0 = bins_locs_ij[ij_bin]; + const int bas_ij1 = bins_locs_ij[ij_bin + 1]; + const int ntasks_ij = bas_ij1 - bas_ij0; + if (ntasks_ij <= 0) { + continue; + } + + BasisProdOffsets offsets; + offsets.ntasks_ij = ntasks_ij; + offsets.ntasks_kl = ngrids; + offsets.bas_ij = bas_pairs_locs[cp_ij_id] + bas_ij0; + offsets.bas_kl = -1; + offsets.primitive_ij = primitive_pairs_locs[cp_ij_id] + bas_ij0 * nprim_ij; + offsets.primitive_kl = -1; + + const int err = GINTfill_int3c1e_ipvip1_charge_contracted_tasks(integral_charge_contracted, offsets, i_l, j_l, nprim_ij, + strides[0], strides[1], ao_offsets[0], ao_offsets[1], + omega, grid_points, charge_exponents, n_charge_sum_per_thread, stream); + + if (err != 0) { + return err; + } + } + + return 0; +} + +int GINTfill_int3c1e_ip1ip2_charge_contracted(const cudaStream_t stream, const BasisProdCache* bpcache, + const double* grid_points, const double* charge_exponents, const int ngrids, + double* integral_charge_contracted, + const int* strides, const int* ao_offsets, + const int* bins_locs_ij, const int nbins, + const int cp_ij_id, const double omega, const int n_charge_sum_per_thread) +{ + const ContractionProdType *cp_ij = bpcache->cptype + cp_ij_id; + const int i_l = cp_ij->l_bra; + const int j_l = cp_ij->l_ket; + const int nrys_roots = (i_l + j_l + 2) / 2 + 1; + const int nprim_ij = cp_ij->nprim_12; + + if (nrys_roots > MAX_NROOTS_INT3C_1E + 2) { + fprintf(stderr, "nrys_roots = %d too high\n", nrys_roots); + return 2; + } + + checkCudaErrors(cudaMemcpyToSymbol(c_bpcache, bpcache, sizeof(BasisProdCache))); + + const int* bas_pairs_locs = bpcache->bas_pairs_locs; + const int* primitive_pairs_locs = bpcache->primitive_pairs_locs; + for (int ij_bin = 0; ij_bin < nbins; ij_bin++) { + const int bas_ij0 = bins_locs_ij[ij_bin]; + const int bas_ij1 = bins_locs_ij[ij_bin + 1]; + const int ntasks_ij = bas_ij1 - bas_ij0; + if (ntasks_ij <= 0) { + continue; + } + + BasisProdOffsets offsets; + offsets.ntasks_ij = ntasks_ij; + offsets.ntasks_kl = ngrids; + offsets.bas_ij = bas_pairs_locs[cp_ij_id] + bas_ij0; + offsets.bas_kl = -1; + offsets.primitive_ij = primitive_pairs_locs[cp_ij_id] + bas_ij0 * nprim_ij; + 
offsets.primitive_kl = -1; + + const int err = GINTfill_int3c1e_ip1ip2_charge_contracted_tasks(integral_charge_contracted, offsets, i_l, j_l, nprim_ij, + strides[0], strides[1], ao_offsets[0], ao_offsets[1], + omega, grid_points, charge_exponents, n_charge_sum_per_thread, stream); + + if (err != 0) { + return err; + } + } + + return 0; +} + +int GINTfill_int3c1e_ipip2_density_contracted(const cudaStream_t stream, const BasisProdCache* bpcache, + const double* grid_points, const double* charge_exponents, const int ngrids, + const double* dm_pair_ordered, const int* density_offset, + double* integral_density_contracted, + const int* bins_locs_ij, const int nbins, + const int cp_ij_id, const double omega, const int n_pair_sum_per_thread) +{ + const ContractionProdType *cp_ij = bpcache->cptype + cp_ij_id; + const int i_l = cp_ij->l_bra; + const int j_l = cp_ij->l_ket; + const int nrys_roots = (i_l + j_l + 2) / 2 + 1; + const int nprim_ij = cp_ij->nprim_12; + + if (nrys_roots > MAX_NROOTS_INT3C_1E + 2) { + fprintf(stderr, "nrys_roots = %d too high\n", nrys_roots); + return 2; + } + + checkCudaErrors(cudaMemcpyToSymbol(c_bpcache, bpcache, sizeof(BasisProdCache))); + + const int* bas_pairs_locs = bpcache->bas_pairs_locs; + const int* primitive_pairs_locs = bpcache->primitive_pairs_locs; + for (int ij_bin = 0; ij_bin < nbins; ij_bin++) { + const int bas_ij0 = bins_locs_ij[ij_bin]; + const int bas_ij1 = bins_locs_ij[ij_bin + 1]; + const int ntasks_ij = bas_ij1 - bas_ij0; + if (ntasks_ij <= 0) { + continue; + } + + BasisProdOffsets offsets; + offsets.ntasks_ij = ntasks_ij; + offsets.ntasks_kl = ngrids; + offsets.bas_ij = bas_pairs_locs[cp_ij_id] + bas_ij0; + offsets.bas_kl = -1; + offsets.primitive_ij = primitive_pairs_locs[cp_ij_id] + bas_ij0 * nprim_ij; + offsets.primitive_kl = -1; + + HermiteDensityOffsets hermite_density_offsets; + hermite_density_offsets.density_offset_of_angular_pair = density_offset[cp_ij_id]; + hermite_density_offsets.pair_offset_of_angular_pair = bas_pairs_locs[cp_ij_id]; + hermite_density_offsets.n_pair_of_angular_pair = bas_pairs_locs[cp_ij_id + 1] - bas_pairs_locs[cp_ij_id]; + + const int err = GINTfill_int3c1e_ipip2_density_contracted_tasks(integral_density_contracted, dm_pair_ordered, hermite_density_offsets, + offsets, i_l, j_l, nprim_ij, + omega, grid_points, charge_exponents, n_pair_sum_per_thread, stream); + + if (err != 0) { + return err; + } + } + + return 0; +} +} diff --git a/gpu4pyscf/lib/gvhf-rys/cart2xyz.c b/gpu4pyscf/lib/gvhf-rys/cart2xyz.c index ee564cf9..ba10aca6 100644 --- a/gpu4pyscf/lib/gvhf-rys/cart2xyz.c +++ b/gpu4pyscf/lib/gvhf-rys/cart2xyz.c @@ -3,6 +3,9 @@ #include #include "vhf.cuh" +// up to l=7 +#define L_SLOTS 8 + static int _LEN_CART0[] = { 0, 1, 3, 6, 10, 15, 21, 28, 36, 45, 55, 66, 78, 91, 105, 120, 136 }; @@ -32,9 +35,9 @@ static void _get_dm_to_dm_xyz_coeff(double* pcx, double* rij, int lmax) { int lmax1 = lmax + 1; int l, lx; - double rx_pow[LMAX1]; - double ry_pow[LMAX1]; - double rz_pow[LMAX1]; + double rx_pow[L_SLOTS]; + double ry_pow[L_SLOTS]; + double rz_pow[L_SLOTS]; rx_pow[0] = 1.0; ry_pow[0] = 1.0; @@ -67,7 +70,7 @@ static void _dm_to_dm_xyz(double* dm_xyz, double* dm, int nao, int li, int lj, d int lij = li + lj; int l1 = lij + 1; int l1l1 = l1 * l1; - double pcx[LMAX1*LMAX1*3]; + double pcx[L_SLOTS*L_SLOTS*3]; double *pcy = pcx + lj1 * lj1; double *pcz = pcy + lj1 * lj1; _get_dm_to_dm_xyz_coeff(pcx, rij, lj); @@ -116,7 +119,7 @@ static void _dm_xyz_to_dm(double* dm_xyz, double* dm, int nao, int li, int lj, d int lj1 = lj + 1; 
int l1 = li + lj + 1; int l1l1 = l1 * l1; - double pcx[LMAX1*LMAX1*3]; + double pcx[L_SLOTS*L_SLOTS*3]; double *pcy = pcx + lj1 * lj1; double *pcz = pcy + lj1 * lj1; _get_dm_to_dm_xyz_coeff(pcx, rij, lj); @@ -152,7 +155,7 @@ void transform_cart_to_xyz(double *dm_xyz, double *dm, int *ao_loc, int *pair_lo int *bas, int nbas, double *env) { int nao = ao_loc[nbas]; - double cache[(LMAX*2+1)*(LMAX*2+1)*(LMAX*2+1)]; + double cache[L_SLOTS*L_SLOTS*L_SLOTS*8]; for (int ish = 0; ish < nbas; ish++) { int i0 = ao_loc[ish]; int li = bas[ish*BAS_SLOTS+ANG_OF]; @@ -182,7 +185,7 @@ void transform_xyz_to_cart(double *vj, double *vj_xyz, int *ao_loc, int *pair_lo int *bas, int nbas, double *env) { int nao = ao_loc[nbas]; - double cache[(LMAX*2+1)*(LMAX*2+1)*(LMAX*2+1)]; + double cache[L_SLOTS*L_SLOTS*L_SLOTS*8]; for (int ish = 0; ish < nbas; ish++) { int i0 = ao_loc[ish]; int li = bas[ish*BAS_SLOTS+ANG_OF]; diff --git a/gpu4pyscf/lib/gvhf-rys/create_tasks.cu b/gpu4pyscf/lib/gvhf-rys/create_tasks.cu index ae8ef8ad..262f9de0 100644 --- a/gpu4pyscf/lib/gvhf-rys/create_tasks.cu +++ b/gpu4pyscf/lib/gvhf-rys/create_tasks.cu @@ -97,39 +97,35 @@ static int _fill_jk_tasks(ShellQuartet *shl_quartet_idx, } // https://developer.nvidia.com/gpugems/gpugems3/part-vi-gpu-computing/chapter-39-parallel-prefix-sum-scan-cuda - extern __shared__ int thread_offsets[]; - thread_offsets[t_id] = count; + extern __shared__ int cum_count[]; + cum_count[t_id] = count; // Up-sweep phase for (int stride = 1; stride < threads; stride *= 2) { __syncthreads(); int index = (t_id + 1) * stride * 2 - 1; if (index < threads) { - thread_offsets[index] += thread_offsets[index-stride]; + cum_count[index] += cum_count[index-stride]; } } __syncthreads(); - if (t_id == threads-1) { thread_offsets[threads-1] = 0; } // Down-sweep phase - for (int stride = threads/2; stride > 0; stride /= 2) { + for (int stride = threads/4; stride > 0; stride /= 2) { __syncthreads(); int index = (t_id + 1) * stride * 2 - 1; - if (index < threads) { - int temp = thread_offsets[index - stride]; - thread_offsets[index - stride] = thread_offsets[index]; - thread_offsets[index] += temp; + if (index + stride < threads) { + cum_count[index + stride] += cum_count[index]; } } __syncthreads(); - __shared__ int ntasks; - if (t_id == threads-1) { - ntasks = thread_offsets[threads-1] + count; - } - __syncthreads(); + int ntasks = cum_count[threads-1]; if (ntasks == 0) { return ntasks; } - int offset = thread_offsets[t_id]; + int offset = 0; + if (t_id > 0) { + offset = cum_count[t_id-1]; + } for (int t_kl_id = t_kl0+t_id; t_kl_id < t_kl1; t_kl_id += threads) { int tile_kl = tile_kl_mapping[t_kl_id]; if (tile_q_ij + tile_q_cond[tile_kl] < cutoff) { @@ -311,7 +307,7 @@ static int _fill_sr_jk_tasks(ShellQuartet *shl_quartet_idx, float ypq = yij - ykl; float zpq = zij - zkl; float rr = xpq*xpq + ypq*ypq + zpq*zpq; - float theta_rr = logf(rr + 1e-30f) + theta * rr; + float theta_rr = logf(rr + 1.f) + theta * rr; d_cutoff = skl_cutoff - s_estimator[bas_kl] + theta_rr; if (d_cutoff > 0) { continue; @@ -332,39 +328,35 @@ static int _fill_sr_jk_tasks(ShellQuartet *shl_quartet_idx, } // https://developer.nvidia.com/gpugems/gpugems3/part-vi-gpu-computing/chapter-39-parallel-prefix-sum-scan-cuda - extern __shared__ int thread_offsets[]; - thread_offsets[t_id] = count; + extern __shared__ int cum_count[]; + cum_count[t_id] = count; // Up-sweep phase for (int stride = 1; stride < threads; stride *= 2) { __syncthreads(); int index = (t_id + 1) * stride * 2 - 1; if (index < threads) { - 
thread_offsets[index] += thread_offsets[index-stride]; + cum_count[index] += cum_count[index-stride]; } } __syncthreads(); - if (t_id == threads-1) { thread_offsets[threads-1] = 0; } // Down-sweep phase - for (int stride = threads/2; stride > 0; stride /= 2) { + for (int stride = threads/4; stride > 0; stride /= 2) { __syncthreads(); int index = (t_id + 1) * stride * 2 - 1; - if (index < threads) { - int temp = thread_offsets[index - stride]; - thread_offsets[index - stride] = thread_offsets[index]; - thread_offsets[index] += temp; + if (index + stride < threads) { + cum_count[index + stride] += cum_count[index]; } } __syncthreads(); - __shared__ int ntasks; - if (t_id == threads-1) { - ntasks = thread_offsets[threads-1] + count; - } - __syncthreads(); + int ntasks = cum_count[threads-1]; if (ntasks == 0) { return ntasks; } - int offset = thread_offsets[t_id]; + int offset = 0; + if (t_id > 0) { + offset = cum_count[t_id-1]; + } for (int t_kl_id = t_kl0+t_id; t_kl_id < t_kl1; t_kl_id += threads) { int tile_kl = tile_kl_mapping[t_kl_id]; if (tile_q_ij + tile_q_cond[tile_kl] < cutoff) { @@ -457,7 +449,7 @@ static int _fill_sr_jk_tasks(ShellQuartet *shl_quartet_idx, float ypq = yij - ykl; float zpq = zij - zkl; float rr = xpq*xpq + ypq*ypq + zpq*zpq; - float theta_rr = logf(rr + 1e-30f) + theta * rr; + float theta_rr = logf(rr + 1.f) + theta * rr; d_cutoff = skl_cutoff - s_estimator[bas_kl] + theta_rr; if (d_cutoff > 0) { continue; diff --git a/gpu4pyscf/lib/gvhf-rys/create_tasks_ip1.cu b/gpu4pyscf/lib/gvhf-rys/create_tasks_ip1.cu index 83803180..6ec7132e 100644 --- a/gpu4pyscf/lib/gvhf-rys/create_tasks_ip1.cu +++ b/gpu4pyscf/lib/gvhf-rys/create_tasks_ip1.cu @@ -93,40 +93,35 @@ static int _fill_ejk_tasks(ShellQuartet *shl_quartet_idx, } } - // https://developer.nvidia.com/gpugems/gpugems3/part-vi-gpu-computing/chapter-39-parallel-prefix-sum-scan-cuda - extern __shared__ int thread_offsets[]; - thread_offsets[t_id] = count; + extern __shared__ int cum_count[]; + cum_count[t_id] = count; // Up-sweep phase for (int stride = 1; stride < threads; stride *= 2) { __syncthreads(); int index = (t_id + 1) * stride * 2 - 1; if (index < threads) { - thread_offsets[index] += thread_offsets[index-stride]; + cum_count[index] += cum_count[index-stride]; } } __syncthreads(); - if (t_id == threads-1) { thread_offsets[threads-1] = 0; } // Down-sweep phase - for (int stride = threads/2; stride > 0; stride /= 2) { + for (int stride = threads/4; stride > 0; stride /= 2) { __syncthreads(); int index = (t_id + 1) * stride * 2 - 1; - if (index < threads) { - int temp = thread_offsets[index - stride]; - thread_offsets[index - stride] = thread_offsets[index]; - thread_offsets[index] += temp; + if (index + stride < threads) { + cum_count[index + stride] += cum_count[index]; } } __syncthreads(); - __shared__ int ntasks; - if (t_id == threads-1) { - ntasks = thread_offsets[threads-1] + count; - } - __syncthreads(); + int ntasks = cum_count[threads-1]; if (ntasks == 0) { return ntasks; } - int offset = thread_offsets[t_id]; + int offset = 0; + if (t_id > 0) { + offset = cum_count[t_id-1]; + } for (int t_kl_id = t_kl0+t_id; t_kl_id < t_kl1; t_kl_id += threads) { int tile_kl = tile_kl_mapping[t_kl_id]; if (tile_q_ij + tile_q_cond[tile_kl] < cutoff) { @@ -317,40 +312,35 @@ static int _fill_sr_ejk_tasks(ShellQuartet *shl_quartet_idx, } } - // https://developer.nvidia.com/gpugems/gpugems3/part-vi-gpu-computing/chapter-39-parallel-prefix-sum-scan-cuda - extern __shared__ int thread_offsets[]; - thread_offsets[t_id] = count; + 
extern __shared__ int cum_count[]; + cum_count[t_id] = count; // Up-sweep phase for (int stride = 1; stride < threads; stride *= 2) { __syncthreads(); int index = (t_id + 1) * stride * 2 - 1; if (index < threads) { - thread_offsets[index] += thread_offsets[index-stride]; + cum_count[index] += cum_count[index-stride]; } } __syncthreads(); - if (t_id == threads-1) { thread_offsets[threads-1] = 0; } // Down-sweep phase - for (int stride = threads/2; stride > 0; stride /= 2) { + for (int stride = threads/4; stride > 0; stride /= 2) { __syncthreads(); int index = (t_id + 1) * stride * 2 - 1; - if (index < threads) { - int temp = thread_offsets[index - stride]; - thread_offsets[index - stride] = thread_offsets[index]; - thread_offsets[index] += temp; + if (index + stride < threads) { + cum_count[index + stride] += cum_count[index]; } } __syncthreads(); - __shared__ int ntasks; - if (t_id == threads-1) { - ntasks = thread_offsets[threads-1] + count; - } - __syncthreads(); + int ntasks = cum_count[threads-1]; if (ntasks == 0) { return ntasks; } - int offset = thread_offsets[t_id]; + int offset = 0; + if (t_id > 0) { + offset = cum_count[t_id-1]; + } for (int t_kl_id = t_kl0+t_id; t_kl_id < t_kl1; t_kl_id += threads) { int tile_kl = tile_kl_mapping[t_kl_id]; if (tile_q_ij + tile_q_cond[tile_kl] < cutoff) { @@ -504,40 +494,35 @@ static int _fill_jk_tasks_s2kl(ShellQuartet *shl_quartet_idx, } } - // https://developer.nvidia.com/gpugems/gpugems3/part-vi-gpu-computing/chapter-39-parallel-prefix-sum-scan-cuda - extern __shared__ int thread_offsets[]; - thread_offsets[t_id] = count; + extern __shared__ int cum_count[]; + cum_count[t_id] = count; // Up-sweep phase for (int stride = 1; stride < threads; stride *= 2) { __syncthreads(); int index = (t_id + 1) * stride * 2 - 1; if (index < threads) { - thread_offsets[index] += thread_offsets[index-stride]; + cum_count[index] += cum_count[index-stride]; } } __syncthreads(); - if (t_id == threads-1) { thread_offsets[threads-1] = 0; } // Down-sweep phase - for (int stride = threads/2; stride > 0; stride /= 2) { + for (int stride = threads/4; stride > 0; stride /= 2) { __syncthreads(); int index = (t_id + 1) * stride * 2 - 1; - if (index < threads) { - int temp = thread_offsets[index - stride]; - thread_offsets[index - stride] = thread_offsets[index]; - thread_offsets[index] += temp; + if (index + stride < threads) { + cum_count[index + stride] += cum_count[index]; } } __syncthreads(); - __shared__ int ntasks; - if (t_id == threads-1) { - ntasks = thread_offsets[threads-1] + count; - } - __syncthreads(); + int ntasks = cum_count[threads-1]; if (ntasks == 0) { return ntasks; } - int offset = thread_offsets[t_id]; + int offset = 0; + if (t_id > 0) { + offset = cum_count[t_id-1]; + } ShellQuartet sq = {(uint16_t)ish, (uint16_t)jsh}; for (int t_kl_id = t_kl0+t_id; t_kl_id < t_kl1; t_kl_id += threads) { int bas_kl = pair_kl_mapping[t_kl_id]; @@ -562,156 +547,3 @@ static int _fill_jk_tasks_s2kl(ShellQuartet *shl_quartet_idx, } return ntasks; } - -__device__ -static int _fill_ejk_tasks_tmp(ShellQuartet *shl_quartet_idx, - RysIntEnvVars envs, JKMatrix jk, BoundsInfo bounds, - int batch_ij, int batch_kl) -{ - int nbas = envs.nbas; - int *tile_ij_mapping = bounds.tile_ij_mapping; - int *tile_kl_mapping = bounds.tile_kl_mapping; - float *q_cond = bounds.q_cond; - float *tile_q_cond = bounds.tile_q_cond; - float *dm_cond = bounds.dm_cond; - float cutoff = bounds.cutoff; - int t_id = threadIdx.y * blockDim.x + threadIdx.x; - int t_kl0 = batch_kl * TILES_IN_BATCH; - int t_kl1 
= MIN(t_kl0 + TILES_IN_BATCH, bounds.ntile_kl_pairs); - int threads = blockDim.x * blockDim.y; - - int tile_ij = tile_ij_mapping[batch_ij]; - int nbas_tiles = nbas / TILE; - int tile_i = tile_ij / nbas_tiles; - int tile_j = tile_ij % nbas_tiles; - int ish0 = tile_i * TILE; - int jsh0 = tile_j * TILE; - int ish1 = ish0 + TILE; - int jsh1 = jsh0 + TILE; - int do_j = jk.vj != NULL; - int do_k = jk.vk != NULL; - - int count = 0; - float tile_q_ij = tile_q_cond[tile_ij]; - for (int t_kl_id = t_kl0+t_id; t_kl_id < t_kl1; t_kl_id += threads) { - int tile_kl = tile_kl_mapping[t_kl_id]; - if (tile_q_ij + tile_q_cond[tile_kl] < cutoff) { - break; - } - int tile_k = tile_kl / nbas_tiles; - int tile_l = tile_kl % nbas_tiles; - int ksh0 = tile_k * TILE; - int lsh0 = tile_l * TILE; - int ksh1 = ksh0 + TILE; - int lsh1 = lsh0 + TILE; - for (int ish = ish0; ish < ish1; ++ish) { - for (int jsh = jsh0; jsh < MIN(ish+1, jsh1); ++jsh) { - float q_ij = q_cond [ish*nbas+jsh]; - float d_ij = dm_cond[ish*nbas+jsh]; - int bas_ij = ish * nbas + jsh; - for (int ksh = ksh0; ksh < MIN(ish+1, ksh1); ++ksh) { - float d_ik = dm_cond[ish*nbas+ksh]; - float d_jk = dm_cond[jsh*nbas+ksh]; - for (int lsh = lsh0; lsh < MIN(ksh+1, lsh1); ++lsh) { - int bas_kl = ksh * nbas + lsh; - if (bas_ij < bas_kl) { - continue; - } - float q_ijkl = q_ij + q_cond[ksh*nbas+lsh]; - if (q_ijkl < cutoff) { - continue; - } - float d_cutoff = cutoff - q_ijkl; - if ((do_k && (d_ik+dm_cond[jsh*nbas+lsh] > d_cutoff || - d_jk+dm_cond[ish*nbas+lsh] > d_cutoff)) || - (do_j && d_ij+dm_cond[ksh*nbas+lsh] > d_cutoff)) { - ++count; - } - } - } - } - } - } - - // https://developer.nvidia.com/gpugems/gpugems3/part-vi-gpu-computing/chapter-39-parallel-prefix-sum-scan-cuda - extern __shared__ int thread_offsets[]; - thread_offsets[t_id] = count; - // Up-sweep phase - for (int stride = 1; stride < threads; stride *= 2) { - __syncthreads(); - int index = (t_id + 1) * stride * 2 - 1; - if (index < threads) { - thread_offsets[index] += thread_offsets[index-stride]; - } - } - __syncthreads(); - if (t_id == threads-1) { thread_offsets[threads-1] = 0; } - // Down-sweep phase - for (int stride = threads/2; stride > 0; stride /= 2) { - __syncthreads(); - int index = (t_id + 1) * stride * 2 - 1; - if (index < threads) { - int temp = thread_offsets[index - stride]; - thread_offsets[index - stride] = thread_offsets[index]; - thread_offsets[index] += temp; - } - } - __syncthreads(); - __shared__ int ntasks; - if (t_id == threads-1) { - ntasks = thread_offsets[threads-1] + count; - } - __syncthreads(); - if (ntasks == 0) { - return ntasks; - } - - int offset = thread_offsets[t_id]; - for (int t_kl_id = t_kl0+t_id; t_kl_id < t_kl1; t_kl_id += threads) { - int tile_kl = tile_kl_mapping[t_kl_id]; - if (tile_q_ij + tile_q_cond[tile_kl] < cutoff) { - break; - } - int tile_k = tile_kl / nbas_tiles; - int tile_l = tile_kl % nbas_tiles; - int ksh0 = tile_k * TILE; - int lsh0 = tile_l * TILE; - int ksh1 = ksh0 + TILE; - int lsh1 = lsh0 + TILE; - ShellQuartet sq; - for (int ish = ish0; ish < ish1; ++ish) { - for (int jsh = jsh0; jsh < MIN(ish+1, jsh1); ++jsh) { - float q_ij = q_cond [ish*nbas+jsh]; - float d_ij = dm_cond[ish*nbas+jsh]; - int bas_ij = ish * nbas + jsh; - sq.i = ish; - sq.j = jsh; - for (int ksh = ksh0; ksh < MIN(ish+1, ksh1); ++ksh) { - float d_ik = dm_cond[ish*nbas+ksh]; - float d_jk = dm_cond[jsh*nbas+ksh]; - for (int lsh = lsh0; lsh < MIN(ksh+1, lsh1); ++lsh) { - int bas_kl = ksh * nbas + lsh; - if (bas_ij < bas_kl) { - continue; - } - float q_ijkl = q_ij + 
q_cond[ksh*nbas+lsh]; - if (q_ijkl < cutoff) { - continue; - } - float d_cutoff = cutoff - q_ijkl; - if ((do_k && (d_ik+dm_cond[jsh*nbas+lsh] > d_cutoff || - d_jk+dm_cond[ish*nbas+lsh] > d_cutoff)) || - (do_j && d_ij+dm_cond[ksh*nbas+lsh] > d_cutoff)) { - sq.k = ksh; - sq.l = lsh; - shl_quartet_idx[offset] = sq; - ++offset; - } - } - } - } - } - } - return ntasks; -} - diff --git a/gpu4pyscf/lib/gvhf-rys/create_tasks_ip2.cu b/gpu4pyscf/lib/gvhf-rys/create_tasks_ip2.cu index ef62227a..df22b535 100644 --- a/gpu4pyscf/lib/gvhf-rys/create_tasks_ip2.cu +++ b/gpu4pyscf/lib/gvhf-rys/create_tasks_ip2.cu @@ -71,40 +71,35 @@ static int _fill_ejk_ip2_type2_tasks(ShellQuartet *shl_quartet_idx, } } - // https://developer.nvidia.com/gpugems/gpugems3/part-vi-gpu-computing/chapter-39-parallel-prefix-sum-scan-cuda - extern __shared__ int thread_offsets[]; - thread_offsets[t_id] = count; + extern __shared__ int cum_count[]; + cum_count[t_id] = count; // Up-sweep phase for (int stride = 1; stride < threads; stride *= 2) { __syncthreads(); int index = (t_id + 1) * stride * 2 - 1; if (index < threads) { - thread_offsets[index] += thread_offsets[index-stride]; + cum_count[index] += cum_count[index-stride]; } } __syncthreads(); - if (t_id == threads-1) { thread_offsets[threads-1] = 0; } // Down-sweep phase - for (int stride = threads/2; stride > 0; stride /= 2) { + for (int stride = threads/4; stride > 0; stride /= 2) { __syncthreads(); int index = (t_id + 1) * stride * 2 - 1; - if (index < threads) { - int temp = thread_offsets[index - stride]; - thread_offsets[index - stride] = thread_offsets[index]; - thread_offsets[index] += temp; + if (index + stride < threads) { + cum_count[index + stride] += cum_count[index]; } } __syncthreads(); - __shared__ int ntasks; - if (t_id == threads-1) { - ntasks = thread_offsets[threads-1] + count; - } - __syncthreads(); + int ntasks = cum_count[threads-1]; if (ntasks == 0) { return ntasks; } - int offset = thread_offsets[t_id]; + int offset = 0; + if (t_id > 0) { + offset = cum_count[t_id-1]; + } for (int t_kl_id = t_kl0+t_id; t_kl_id < t_kl1; t_kl_id += threads) { int tile_kl = tile_kl_mapping[t_kl_id]; if (tile_q_ij + tile_q_cond[tile_kl] < cutoff) { @@ -218,40 +213,35 @@ static int _fill_ejk_ip2_type3_tasks(ShellQuartet *shl_quartet_idx, } } - // https://developer.nvidia.com/gpugems/gpugems3/part-vi-gpu-computing/chapter-39-parallel-prefix-sum-scan-cuda - extern __shared__ int thread_offsets[]; - thread_offsets[t_id] = count; + extern __shared__ int cum_count[]; + cum_count[t_id] = count; // Up-sweep phase for (int stride = 1; stride < threads; stride *= 2) { __syncthreads(); int index = (t_id + 1) * stride * 2 - 1; if (index < threads) { - thread_offsets[index] += thread_offsets[index-stride]; + cum_count[index] += cum_count[index-stride]; } } __syncthreads(); - if (t_id == threads-1) { thread_offsets[threads-1] = 0; } // Down-sweep phase - for (int stride = threads/2; stride > 0; stride /= 2) { + for (int stride = threads/4; stride > 0; stride /= 2) { __syncthreads(); int index = (t_id + 1) * stride * 2 - 1; - if (index < threads) { - int temp = thread_offsets[index - stride]; - thread_offsets[index - stride] = thread_offsets[index]; - thread_offsets[index] += temp; + if (index + stride < threads) { + cum_count[index + stride] += cum_count[index]; } } __syncthreads(); - __shared__ int ntasks; - if (t_id == threads-1) { - ntasks = thread_offsets[threads-1] + count; - } - __syncthreads(); + int ntasks = cum_count[threads-1]; if (ntasks == 0) { return ntasks; } - int offset 
= thread_offsets[t_id]; + int offset = 0; + if (t_id > 0) { + offset = cum_count[t_id-1]; + } for (int t_kl_id = t_kl0+t_id; t_kl_id < t_kl1; t_kl_id += threads) { int tile_kl = tile_kl_mapping[t_kl_id]; if (tile_q_ij + tile_q_cond[tile_kl] < cutoff) { diff --git a/gpu4pyscf/lib/gvhf-rys/rys_contract_jk.cu b/gpu4pyscf/lib/gvhf-rys/rys_contract_jk.cu index 1b2b79b3..6cbd22a5 100644 --- a/gpu4pyscf/lib/gvhf-rys/rys_contract_jk.cu +++ b/gpu4pyscf/lib/gvhf-rys/rys_contract_jk.cu @@ -23,6 +23,9 @@ #include "rys_roots.cu" #include "create_tasks.cu" +// TODO: benchmark performance for 34, 36, 41, 43, 45, 47, 51, 57 +#define GOUT_WIDTH 42 + __device__ static void rys_jk_general(RysIntEnvVars envs, JKMatrix jk, BoundsInfo bounds, ShellQuartet *shl_quartet_idx, int ntasks) @@ -69,7 +72,7 @@ static void rys_jk_general(RysIntEnvVars envs, JKMatrix jk, BoundsInfo bounds, double *g = rw + nsq_per_block * nroots*2; double *Rpa_cicj = g + nsq_per_block * g_size*3; double Rqc[3], Rpq[3]; - double gout[GWIDTH]; + double gout[GOUT_WIDTH]; for (int task0 = 0; task0 < ntasks; task0 += nsq_per_block) { __syncthreads(); @@ -126,9 +129,10 @@ static void rys_jk_general(RysIntEnvVars envs, JKMatrix jk, BoundsInfo bounds, double Kab = exp(-theta_ij * (xjxi*xjxi+yjyi*yjyi+zjzi*zjzi)); Rpa[sq_id+3*nsq_per_block] = fac_sym * ci[ip] * cj[jp] * Kab; } - for (int gout_start = 0; gout_start < nfij*nfkl; gout_start+=gout_stride*GWIDTH) { + for (int gout_start = 0; gout_start < nfij*nfkl; + gout_start+=gout_stride*GOUT_WIDTH) { #pragma unroll - for (int n = 0; n < GWIDTH; ++n) { gout[n] = 0; } + for (int n = 0; n < GOUT_WIDTH; ++n) { gout[n] = 0; } for (int klp = 0; klp < kprim*lprim; ++klp) { int kp = klp / lprim; @@ -197,11 +201,6 @@ static void rys_jk_general(RysIntEnvVars envs, JKMatrix jk, BoundsInfo bounds, } double rt = rw[sq_id + irys*2*nsq_per_block]; double rt_aa = rt / (aij + akl); - double rt_aij = rt_aa * akl; - double rt_akl = rt_aa * aij; - double b00 = .5 * rt_aa; - double b10 = .5/aij * (1 - rt_aij); - double b01 = .5/akl * (1 - rt_akl); // TRR //for i in range(lij): @@ -211,6 +210,8 @@ static void rys_jk_general(RysIntEnvVars envs, JKMatrix jk, BoundsInfo bounds, // trr(i,k+1) = c0p * trr(i,k) + k*b01 * trr(i,k-1) + i*b00 * trr(i-1,k) if (lij > 0) { __syncthreads(); + double rt_aij = rt_aa * akl; + double b10 = .5/aij * (1 - rt_aij); // gx(0,n+1) = c0*gx(0,n) + n*b10*gx(0,n-1) for (int n = gout_id; n < 3; n += gout_stride) { double *_gx = g + n * g_size * nsq_per_block; @@ -230,6 +231,9 @@ static void rys_jk_general(RysIntEnvVars envs, JKMatrix jk, BoundsInfo bounds, if (lkl > 0) { int lij3 = (lij+1)*3; + double rt_akl = rt_aa * aij; + double b00 = .5 * rt_aa; + double b01 = .5/akl * (1 - rt_akl); for (int n = gout_id; n < lij3+gout_id; n += gout_stride) { __syncthreads(); int i = n / 3; //for i in range(lij+1): @@ -315,7 +319,7 @@ static void rys_jk_general(RysIntEnvVars envs, JKMatrix jk, BoundsInfo bounds, double *gy = gx + nsq_per_block * g_size; double *gz = gy + nsq_per_block * g_size; #pragma unroll - for (int n = 0; n < GWIDTH; ++n) { + for (int n = 0; n < GOUT_WIDTH; ++n) { int ijkl = (gout_start + n*gout_stride+gout_id); int kl = ijkl / nfij; int ij = ijkl % nfij; @@ -338,7 +342,7 @@ static void rys_jk_general(RysIntEnvVars envs, JKMatrix jk, BoundsInfo bounds, int do_k = vk != NULL; for (int i_dm = 0; i_dm < jk.n_dm; ++i_dm) { #pragma unroll - for (int n = 0; n < GWIDTH; ++n) { + for (int n = 0; n < GOUT_WIDTH; ++n) { int ijkl = (gout_start + n*gout_stride+gout_id); int kl = ijkl / nfij; int ij = 
ijkl % nfij; @@ -422,7 +426,7 @@ static void rys_sr_jk_general(RysIntEnvVars envs, JKMatrix jk, BoundsInfo bounds double *g = rw + nsq_per_block * nroots*2; double *Rpa_cicj = g + nsq_per_block * g_size*3; double Rqc[3], Rpq[3]; - double gout[GWIDTH]; + double gout[GOUT_WIDTH]; for (int task0 = 0; task0 < ntasks; task0 += nsq_per_block) { __syncthreads(); @@ -479,9 +483,10 @@ static void rys_sr_jk_general(RysIntEnvVars envs, JKMatrix jk, BoundsInfo bounds double Kab = exp(-theta_ij * (xjxi*xjxi+yjyi*yjyi+zjzi*zjzi)); Rpa[sq_id+3*nsq_per_block] = fac_sym * ci[ip] * cj[jp] * Kab; } - for (int gout_start = 0; gout_start < nfij*nfkl; gout_start+=gout_stride*GWIDTH) { + for (int gout_start = 0; gout_start < nfij*nfkl; + gout_start+=gout_stride*GOUT_WIDTH) { #pragma unroll - for (int n = 0; n < GWIDTH; ++n) { gout[n] = 0; } + for (int n = 0; n < GOUT_WIDTH; ++n) { gout[n] = 0; } for (int klp = 0; klp < kprim*lprim; ++klp) { int kp = klp / lprim; @@ -669,7 +674,7 @@ static void rys_sr_jk_general(RysIntEnvVars envs, JKMatrix jk, BoundsInfo bounds double *gy = gx + nsq_per_block * g_size; double *gz = gy + nsq_per_block * g_size; #pragma unroll - for (int n = 0; n < GWIDTH; ++n) { + for (int n = 0; n < GOUT_WIDTH; ++n) { int ijkl = gout_start + n*gout_stride+gout_id; int kl = ijkl / nfij; int ij = ijkl % nfij; @@ -692,7 +697,7 @@ static void rys_sr_jk_general(RysIntEnvVars envs, JKMatrix jk, BoundsInfo bounds int do_k = vk != NULL; for (int i_dm = 0; i_dm < jk.n_dm; ++i_dm) { #pragma unroll - for (int n = 0; n < GWIDTH; ++n) { + for (int n = 0; n < GOUT_WIDTH; ++n) { int ijkl = (gout_start + n*gout_stride+gout_id); int kl = ijkl / nfij; int ij = ijkl % nfij; diff --git a/gpu4pyscf/lib/gvhf-rys/rys_jk_driver.cu b/gpu4pyscf/lib/gvhf-rys/rys_jk_driver.cu index 04c6d3ee..ba3c14a5 100644 --- a/gpu4pyscf/lib/gvhf-rys/rys_jk_driver.cu +++ b/gpu4pyscf/lib/gvhf-rys/rys_jk_driver.cu @@ -201,7 +201,7 @@ int RYS_build_jk(double *vj, double *vk, double *dm, int n_dm, int nao, int gout_stride = scheme[1]; int ij_prims = iprim * jprim; dim3 threads(quartets_per_block, gout_stride); - int buflen = (nroots*4 + g_size*3 + ij_prims*4) * quartets_per_block;// + ij_prims*4*TILE2; + int buflen = (nroots*2 + g_size*3 + ij_prims*4) * quartets_per_block;// + ij_prims*4*TILE2; rys_sr_jk_kernel<<>>(envs, jk, bounds, pool, batch_head); } cudaError_t err = cudaGetLastError(); @@ -329,7 +329,7 @@ int RYS_per_atom_jk_ip1(double *ejk, double j_factor, double k_factor, int ij_prims = iprim * jprim; dim3 threads(quartets_per_block, gout_stride); int buflen = (nroots*2 + g_size*3 + ij_prims*4) * quartets_per_block; - buflen = MAX(buflen, 9*gout_stride*quartets_per_block); + buflen = MAX(buflen, 12*gout_stride*quartets_per_block); rys_ejk_ip1_kernel<<>>(envs, jk, bounds, pool, batch_head); } cudaError_t err = cudaGetLastError(); diff --git a/gpu4pyscf/lib/logger.py b/gpu4pyscf/lib/logger.py index c715976e..54713c43 100644 --- a/gpu4pyscf/lib/logger.py +++ b/gpu4pyscf/lib/logger.py @@ -17,9 +17,6 @@ import cupy from pyscf import lib -from pyscf.lib import parameters as param -import pyscf.__config__ - INFO = lib.logger.INFO NOTE = lib.logger.NOTE WARN = lib.logger.WARN @@ -29,66 +26,63 @@ TIMER_LEVEL = lib.logger.TIMER_LEVEL flush = lib.logger.flush -if sys.version_info < (3, 0): - process_clock = time.clock - perf_counter = time.time -else: - process_clock = time.process_time - perf_counter = time.perf_counter +process_clock = time.process_time +perf_counter = time.perf_counter def init_timer(rec): - if rec.verbose >= TIMER_LEVEL: - 
e0 = cupy.cuda.Event() - e0.record() - return (process_clock(), perf_counter(), e0) - elif rec.verbose >= DEBUG: - return (process_clock(), perf_counter()) - else: - return process_clock(), + e0 = cupy.cuda.Event() + e0.record() + return (process_clock(), perf_counter(), e0) def timer(rec, msg, cpu0=None, wall0=None, gpu0=None): - if cpu0 is None: - cpu0 = rec._t0 - if wall0 and gpu0: - rec._t0, rec._w0, rec._e0 = process_clock(), perf_counter(), cupy.cuda.Event() + if gpu0: + t0, w0, e0 = process_clock(), perf_counter(), cupy.cuda.Event() + e0.record() if rec.verbose >= TIMER_LEVEL: - rec._e0.record() - rec._e0.synchronize() - + e0.synchronize() flush(rec, ' CPU time for %-50s %9.2f sec, wall time %9.2f sec, GPU time %9.2f ms' - % (msg, rec._t0-cpu0, rec._w0-wall0, cupy.cuda.get_elapsed_time(gpu0,rec._e0))) - return rec._t0, rec._w0, rec._e0 + % (msg, t0-cpu0, w0-wall0, cupy.cuda.get_elapsed_time(gpu0,e0))) + return t0, w0, e0 elif wall0: - rec._t0, rec._w0 = process_clock(), perf_counter() + t0, w0 = process_clock(), perf_counter() if rec.verbose >= TIMER_LEVEL: flush(rec, ' CPU time for %s %9.2f sec, wall time %9.2f sec' - % (msg, rec._t0-cpu0, rec._w0-wall0)) - return rec._t0, rec._w0 + % (msg, t0-cpu0, w0-wall0)) + return t0, w0 else: - rec._t0 = process_clock() + t0 = process_clock() if rec.verbose >= TIMER_LEVEL: - flush(rec, ' CPU time for %s %9.2f sec' % (msg, rec._t0-cpu0)) - return rec._t0, + flush(rec, ' CPU time for %s %9.2f sec' % (msg, t0-cpu0)) + return t0, def _timer_debug1(rec, msg, cpu0=None, wall0=None, gpu0=None, sync=True): if rec.verbose >= DEBUG1: return timer(rec, msg, cpu0, wall0, gpu0) - elif wall0 and gpu0: - rec._t0, rec._w0, rec._e0 = process_clock(), perf_counter(), cupy.cuda.Event() - rec._e0.record() - return rec._t0, rec._w0, rec._e0 + elif gpu0: + t0, w0, e0 = process_clock(), perf_counter(), cupy.cuda.Event() + e0.record() + return t0, w0, e0 elif wall0: - rec._t0, rec._w0 = process_clock(), perf_counter() - return rec._t0, rec._w0 + t0, w0 = process_clock(), perf_counter() + return t0, w0 else: - rec._t0 = process_clock() - return rec._t0, + t0 = process_clock() + return t0, def _timer_debug2(rec, msg, cpu0=None, wall0=None, gpu0=None, sync=True): if rec.verbose >= DEBUG2: return timer(rec, msg, cpu0, wall0, gpu0) - return cpu0, wall0, gpu0 + elif gpu0: + t0, w0, e0 = process_clock(), perf_counter(), cupy.cuda.Event() + e0.record() + return t0, w0, e0 + elif wall0: + t0, w0 = process_clock(), perf_counter() + return t0, w0 + else: + t0 = process_clock() + return t0, info = lib.logger.info note = lib.logger.note diff --git a/gpu4pyscf/lib/memcpy.py b/gpu4pyscf/lib/memcpy.py index c961a9a2..ce52046a 100644 --- a/gpu4pyscf/lib/memcpy.py +++ b/gpu4pyscf/lib/memcpy.py @@ -15,6 +15,27 @@ import cupy import numpy as np +from gpu4pyscf.__config__ import _p2p_access + +__all__ = ['p2p_transfer', 'copy_array'] + +def p2p_transfer(a, b): + ''' If the direct P2P data transfer is not available, transfer data via CPU memory + ''' + if a.device == b.device: + a[:] = b + elif _p2p_access: + a[:] = b + ''' + elif a.strides == b.strides and a.flags.c_contiguous and a.dtype == b.dtype: + # cupy supports a direct copy from different devices without p2p. 
See also + # https://github.com/cupy/cupy/blob/v13.3.0/cupy/_core/_routines_indexing.pyx#L48 + # https://github.com/cupy/cupy/blob/v13.3.0/cupy/_core/_routines_indexing.pyx#L1015 + a[:] = b + ''' + else: + copy_array(b, a) + return a def find_contiguous_chunks(shape, h_strides, d_strides): """ diff --git a/gpu4pyscf/lib/multi_gpu.py b/gpu4pyscf/lib/multi_gpu.py new file mode 100644 index 00000000..f9e1e8ee --- /dev/null +++ b/gpu4pyscf/lib/multi_gpu.py @@ -0,0 +1,153 @@ +# Copyright 2025 The PySCF Developers. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from concurrent.futures import ThreadPoolExecutor +import cupy as cp +import numpy as np +from pyscf.lib import prange +from gpu4pyscf.lib.memcpy import p2p_transfer +from gpu4pyscf.__config__ import num_devices + +def run(func, args=(), kwargs={}, non_blocking=False): + '''Execute a function on each GPU. + + Kwargs: + non_blocking: If `True`, functions are executed in parallel using multi-threads. + ''' + if num_devices == 1: + return [func(*args, *kwargs)] + + def proc(device_id): + with cp.cuda.Device(device_id): + return func(*args, **kwargs) + + if not non_blocking: + return [proc(i) for i in range(num_devices)] + + with ThreadPoolExecutor(max_workers=num_devices) as ex: + futures = [ex.submit(proc, i) for i in range(num_devices)] + return [fut.result() for fut in futures] + +def map(func, tasks, args=(), kwargs={}, schedule='dynamic') -> list: + '''Distributes tasks to multiple GPU devices for parallel computation. + + Kwargs: + schedule: controls how the tasks are distributed. Can be 'static' or 'dynamic'. + If 'static', tasks are distributed in the round-robin fashion; + If 'dynamic', tasks are scheduled dynamically, with better load balance. + ''' + if num_devices == 1: + return [func(t, *args, *kwargs) for t in tasks] + + tasks = list(enumerate(tasks)) + result = [None] * len(tasks) + + def consumer(): + if schedule == 'dynamic': + stream = cp.cuda.stream.get_current_stream() + while tasks: + try: + key, t = tasks.pop() + except IndexError: + return + result[key] = func(t, *args, **kwargs) + stream.synchronize() + else: + device_id = cp.cuda.device.get_device_id() + for key, t in tasks[device_id::num_devices]: + result[key] = func(t, *args, **kwargs) + + run(consumer, non_blocking=True) + return result + +def reduce(func, tasks, args=(), kwargs={}, schedule='dynamic'): + '''Processes tasks on multiple GPU devices and returns the sum of the results. 
+ ''' + result = map(func, tasks, args, kwargs) + dtype = cp.result_type(*result) + if num_devices == 1: + out = result[0].astype(dtype=dtype, copy=False) + for r in result[1:]: + out += r + return out + + groups = [None] * num_devices + for r in result: + device_id = r.device.id + if groups[device_id] is None: + groups[device_id] = r.astype(dtype, copy=False) + else: + groups[device_id] += r + + for i in num_devices: + if groups[i] is None: + groups[i] = cp.zeros(result[0].shape, dtype=dtype) + return array_reduce(groups, inplace=True) + +def array_broadcast(a): + '''Broadcast a cupy ndarray to all devices, return a list of cupy ndarrays. + ''' + if num_devices == 1: + return [a] + + out = [None] * num_devices + out[0] = a + + # Tree broadcast + step = num_devices >> 1 + while step > 0: + for device_id in range(0, num_devices, 2*step): + if device_id + step < num_devices: + with cp.cuda.Device(device_id+step): + out[device_id+step] = dst = cp.empty_like(a) + p2p_transfer(dst, a) + step >>= 1 + return out + +def array_reduce(array_list, inplace=False): + '''The sum of cupy ndarrays from all devices to device 0. + ''' + assert len(array_list) == num_devices + if num_devices == 1: + return array_list[0] + + a0 = array_list[0] + out_shape = a0.shape + size = a0.size + dtype = a0.dtype + assert all(x.dtype == dtype for x in array_list) + + array_list = list(array_list) + for device_id in range(num_devices): + with cp.cuda.Device(device_id): + if inplace or device_id % 2 == 1: + array_list[device_id] = array_list[device_id].ravel() + else: + array_list[device_id] = array_list[device_id].copy().ravel() + + blksize = 1024*1024*1024 // dtype.itemsize # 1GB + # Tree-reduce + step = 1 + while step < num_devices: + for device_id in range(0, num_devices, 2*step): + if device_id + step < num_devices: + with cp.cuda.Device(device_id): + dst = array_list[device_id] + src = array_list[device_id+step] + buf = cp.empty_like(dst[:blksize]) + for p0, p1 in prange(0, size, blksize): + dst[p0:p1] += p2p_transfer(buf[:p1-p0], src[p0:p1]) + step *= 2 + return array_list[0].reshape(out_shape) diff --git a/gpu4pyscf/lib/pbc/CMakeLists.txt b/gpu4pyscf/lib/pbc/CMakeLists.txt index a961cbc2..f8d7a842 100644 --- a/gpu4pyscf/lib/pbc/CMakeLists.txt +++ b/gpu4pyscf/lib/pbc/CMakeLists.txt @@ -2,6 +2,7 @@ set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --ptxas-options=-v")# -maxrregcount=12 add_library(pbc SHARED pbc_driver.cu ft_ao.cu unrolled_ft_ao.cu + fill_int3c2e.cu unrolled_int3c2e.cu ) set_target_properties(pbc PROPERTIES diff --git a/gpu4pyscf/lib/pbc/fill_int3c2e.cu b/gpu4pyscf/lib/pbc/fill_int3c2e.cu new file mode 100644 index 00000000..55aa3fcc --- /dev/null +++ b/gpu4pyscf/lib/pbc/fill_int3c2e.cu @@ -0,0 +1,702 @@ +/* + * Copyright 2024-2025 The PySCF Developers. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include + +#include "gvhf-rys/vhf.cuh" +#include "rys_roots.cu" +#include "int3c2e.cuh" + +#define THREADS (WARP_SIZE*WARPS) +// TODO: benchmark performance for 32, 38, 40, 45, 54 +#define GOUT_WIDTH 45 + +__global__ +void pbc_int3c2e_kernel(double *out, PBCInt3c2eEnvVars envs, PBCInt3c2eBounds bounds) +{ + int nksh_per_block = blockDim.x; + int gout_stride = blockDim.y; + int nsp_per_block = blockDim.z; + int ksh_id = threadIdx.x; + int gout_id = threadIdx.y; + int sp_id = threadIdx.z; + int sp_block_id = blockIdx.x; + int ksh_block_id = blockIdx.y; + + int nksp_per_block = nksh_per_block * nsp_per_block; + int ksp_id = nksh_per_block * sp_id + ksh_id; + int thread_id = (threadIdx.z * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x; + int warp_id = thread_id / WARP_SIZE; + int nimgs = envs.nimgs; + int sp0_this_block = sp_block_id * nsp_per_block * SPTAKS_PER_BLOCK; + int ksh0_this_block = ksh_block_id * nksh_per_block; + int nksh = MIN(bounds.nksh - ksh0_this_block, nksh_per_block); + int ksh0 = ksh0_this_block + bounds.ksh0; + + int li = bounds.li; + int lj = bounds.lj; + int lk = bounds.lk; + int lij = li + lj; + int nroots = bounds.nroots; + int nfi = bounds.nfi; + int nfij = bounds.nfij; + int nfk = bounds.nfk; + int iprim = bounds.iprim; + int jprim = bounds.jprim; + int kprim = bounds.kprim; + int ijprim = iprim * jprim; + int ijkprim = ijprim * kprim; + int stride_j = bounds.stride_j; + int stride_k = bounds.stride_k; + int g_size = bounds.g_size; + int *idx_ij = c_g_pair_idx + c_g_pair_offsets[li*LMAX1+lj]; + int *idy_ij = idx_ij + nfij; + int *idz_ij = idy_ij + nfij; + int lk_offset = lk * (lk + 1) * (lk + 2) / 2; + int *idx_k = c_g_cart_idx + lk_offset; + int *idy_k = idx_k + nfk; + int *idz_k = idy_k + nfk; + int *bas = envs.bas; + double *env = envs.env; + double *img_coords = envs.img_coords; + int *img_idx = bounds.img_idx; + int *sp_img_offsets = bounds.img_offsets; + double omega = env[PTR_RANGE_OMEGA]; + + int gx_len = g_size * nksp_per_block; + extern __shared__ double rw_buffer[]; + double *rw = rw_buffer + ksp_id; + double *g = rw + nksp_per_block * nroots*2; + double *gx = g; + double *gy = gx + gx_len; + double *gz = gy + gx_len; + double *rjri = gz + gx_len; + double *Rpq = rjri + nksp_per_block * 3; + __shared__ int img_counts_in_warp[WARPS]; + double gout[GOUT_WIDTH]; + + int ntasks = nksh * nsp_per_block * SPTAKS_PER_BLOCK; + for (int task_id = 0; task_id < ntasks; task_id += nksp_per_block) { + // convert task_id to ish, jsh, ksh + int ijk_idx = task_id + ksp_id; + int ksh = ijk_idx % nksh + ksh0; + int pair_ij_idx = ijk_idx / nksh + sp0_this_block; + int img1 = 1; + int pair_ij = pair_ij_idx; + if (pair_ij_idx >= bounds.npairs_ij) { + pair_ij = sp0_this_block; + } else { + img1 = sp_img_offsets[pair_ij_idx+1]; + } + int bas_ij = bounds.bas_ij_idx[pair_ij]; + int img0 = sp_img_offsets[pair_ij]; + int thread_id_in_warp = thread_id % WARP_SIZE; + if (thread_id_in_warp == 0) { + img_counts_in_warp[warp_id] = 0; + } + atomicMax(&img_counts_in_warp[warp_id], img1-img0); + __syncthreads(); + + int nbas = envs.cell0_nbas * envs.bvk_ncells; + int ish = bas_ij / nbas; + int jsh = bas_ij % nbas; + double *expi = env + bas[ish*BAS_SLOTS+PTR_EXP]; + double *expj = env + bas[jsh*BAS_SLOTS+PTR_EXP]; + double *expk = env + bas[ksh*BAS_SLOTS+PTR_EXP]; + double *ci = env + bas[ish*BAS_SLOTS+PTR_COEFF]; + double *cj = env + bas[jsh*BAS_SLOTS+PTR_COEFF]; + double *ck = env + bas[ksh*BAS_SLOTS+PTR_COEFF]; + double *ri = env + 
bas[ish*BAS_SLOTS+PTR_BAS_COORD]; + double *rj = env + bas[jsh*BAS_SLOTS+PTR_BAS_COORD]; + double *rk = env + bas[ksh*BAS_SLOTS+PTR_BAS_COORD]; + + for (int gout_start = 0; gout_start < nfij*nfk; + gout_start+=gout_stride*GOUT_WIDTH) { +#pragma unroll + for (int n = 0; n < GOUT_WIDTH; ++n) { gout[n] = 0; } + + for (int ijkp = 0; ijkp < ijkprim; ++ijkp) { + int ijp = ijkp / kprim; + int kp = ijkp % kprim; + int ip = ijp / jprim; + int jp = ijp % jprim; + double ai = expi[ip]; + double aj = expj[jp]; + double ak = expk[kp]; + double aij = ai + aj; + double cijk = ci[ip] * cj[jp] * ck[kp]; + __syncthreads(); + if (gout_id == 0) { + double fac = PI_FAC * cijk / (aij*ak*sqrt(aij+ak)); + gy[0] = fac; + } + int img_counts = img_counts_in_warp[warp_id]; + for (int img = 0; img < img_counts; ++img) { + int img_id = img0 + img; + __syncthreads(); + if (img_id >= img1) { + // ensure the same number of images processed in the same warp + img_id = img0; + if (gout_id == 0) { + gy[0] = 0.; + } + } + int img_ij = img_idx[img_id]; + int iL = img_ij / nimgs; + int jL = img_ij % nimgs; + double xi = ri[0] + img_coords[iL*3+0]; + double yi = ri[1] + img_coords[iL*3+1]; + double zi = ri[2] + img_coords[iL*3+2]; + double xj = rj[0] + img_coords[jL*3+0]; + double yj = rj[1] + img_coords[jL*3+1]; + double zj = rj[2] + img_coords[jL*3+2]; + double xjxi = xj - xi; + double yjyi = yj - yi; + double zjzi = zj - zi; + double rr_ij = xjxi*xjxi + yjyi*yjyi + zjzi*zjzi; + double aj_aij = aj / aij; + double theta_ij = ai * aj_aij; + double Kab = theta_ij * rr_ij; + + double xij = xjxi * aj_aij + xi; + double yij = yjyi * aj_aij + yi; + double zij = zjzi * aj_aij + zi; + double xpq = xij - rk[0]; + double ypq = yij - rk[1]; + double zpq = zij - rk[2]; + double rr = xpq*xpq + ypq*ypq + zpq*zpq; + double theta = aij * ak / (aij + ak); + double omega2 = omega * omega; + double theta_fac = omega2 / (omega2 + theta); + double theta_rr = theta * rr; +// Somehow, this screening test does not filter out many integrals. +// More benchmarks are needed +#if 0 + __shared__ int8_t img_mask[WARPS]; + if (thread_id_in_warp == 0) { + img_mask[warp_id] = 0; + } + float Kab_f32 = Kab; + // IMPORTANT: run the screening test on each warp. + // When nksh_per_block*gout_stride>32, gout is evaluated across warps. + // If tests are skipped for some warps, g[xyz] vectors and + // gout on these warps will never be evaluated. These warps + // may proceeed to a wrong __syncthreads() barrier and + // produce wrong g[xyz]. 
+ float log_cutoff = envs.log_cutoff; + if ((thread_id_in_warp / nksh_per_block == 0) && + img0+img < img1 && 5.f+2.f*lij-Kab_f32 > log_cutoff) { + // check any not vanished integrals + float ai_f32 = ai; + float aj_f32 = aj; + float aij_f32 = aij; + float ak_f32 = ak; + float fi = ai_f32 / aij_f32; + float fj = aj_f32 / aij_f32; + // fac_guess = log(sqrt(2.x/(omega*sqrt(pi))) * ((2*li+1)*(2*lj+1)*(2*lk+1))**.5/(4*pi)**1.5) + // ~ between [0, 2] + float fac_guess = 1.f; + // fac in Eq 63 of arXiv:2302.11307 ~ log(ci*cj*ck * (pi^2/(aij*ak))**1.5) + float log_fac = logf(fabs(cijk)) + 3.434f - 1.5f*logf(aij_f32*ak_f32) + fac_guess; + float theta_fac_rr = (float)theta_fac * (float)theta_rr; + float rt_aa = sqrtf((float)rr) / (aij_f32+ak_f32) + 1e-9f; + float rt_aij = rt_aa * ak_f32; + float rt_akl = rt_aa * aij_f32; + float r = sqrtf((float)rr_ij); + float ti = fj * r + rt_aij; + float tj = fi * r + rt_aij; + float ti_fac = .5f*li * logf(ti*ti + .5f*li/aij_f32); + float tj_fac = .5f*lj * logf(tj*tj + .5f*lj/aij_f32); + float tk_fac = .5f*lk * logf(rt_akl*rt_akl + .5f*lk/ak_f32); + float estimator = log_fac + ti_fac + tj_fac + tk_fac - Kab_f32 - theta_fac_rr; + if (estimator > log_cutoff) { + img_mask[warp_id] = 1; + } + } + __syncthreads(); + if (img_mask[warp_id] == 0) { + continue; + } +#endif + if (gout_id == 0) { + rjri[0*nksp_per_block] = xjxi; + rjri[1*nksp_per_block] = yjyi; + rjri[2*nksp_per_block] = zjzi; + Rpq[0*nksp_per_block] = xpq; + Rpq[1*nksp_per_block] = ypq; + Rpq[2*nksp_per_block] = zpq; + gx[0] = exp(-Kab); + } + int _nroots = nroots/2; + rys_roots(_nroots, theta_rr, rw+nroots*nksp_per_block, + nksp_per_block, gout_id, gout_stride); + rys_roots(_nroots, theta_fac*theta_rr, rw, + nksp_per_block, gout_id, gout_stride); + __syncthreads(); + double sqrt_theta_fac = -sqrt(theta_fac); + for (int irys = gout_id; irys < _nroots; irys+=gout_stride) { + rw[ irys*2 *nksp_per_block] *= theta_fac; + rw[(irys*2+1)*nksp_per_block] *= sqrt_theta_fac; + } + double s0x, s1x, s2x; + for (int irys = 0; irys < nroots; ++irys) { + __syncthreads(); + if (gout_id == 0) { + gz[0] = rw[(irys*2+1)*nksp_per_block]; + } + double rt = rw[ irys*2 *nksp_per_block]; + double rt_aa = rt / (aij + ak); + + if (lij > 0) { + __syncthreads(); + double rt_aij = rt_aa * ak; + double b10 = .5/aij * (1 - rt_aij); + // gx(0,n+1) = c0*gx(0,n) + n*b10*gx(0,n-1) + for (int n = gout_id; n < 3; n += gout_stride) { + double *_gx = gx + n * gx_len; + double xpa = rjri[n*nksp_per_block] * aj_aij; + //double c0x = Rpa[ir] - rt_aij * Rpq[n]; + double c0x = xpa - rt_aij * Rpq[n*nksp_per_block]; + s0x = _gx[0]; + s1x = c0x * s0x; + _gx[nksp_per_block] = s1x; + for (int i = 1; i < lij; ++i) { + s2x = c0x * s1x + i * b10 * s0x; + _gx[(i+1)*nksp_per_block] = s2x; + s0x = s1x; + s1x = s2x; + } + } + } + + if (lk > 0) { + int lij3 = (lij+1)*3; + double rt_ak = rt_aa * aij; + double b00 = .5 * rt_aa; + double b01 = .5/ak * (1 - rt_ak ); + for (int n = gout_id; n < lij3+gout_id; n += gout_stride) { + __syncthreads(); + int i = n / 3; //for i in range(lij+1): + int _ix = n % 3; // TODO: remove _ix for nroots > 2 + double *_gx = gx + (i + _ix * g_size) * nksp_per_block; + double cpx = rt_ak * Rpq[_ix*nksp_per_block]; + //for i in range(lij+1): + // trr(i,1) = c0p * trr(i,0) + i*b00 * trr(i-1,0) + if (n < lij3) { + s0x = _gx[0]; + s1x = cpx * s0x; + if (i > 0) { + s1x += i * b00 * _gx[-nksp_per_block]; + } + _gx[stride_k*nksp_per_block] = s1x; + } + //for k in range(1, lk): + // for i in range(lij+1): + // trr(i,k+1) = cp * trr(i,k) + 
k*b01 * trr(i,k-1) + i*b00 * trr(i-1,k) + for (int k = 1; k < lk; ++k) { + __syncthreads(); + if (n < lij3) { + s2x = cpx*s1x + k*b01*s0x; + if (i > 0) { + s2x += i * b00 * _gx[(k*stride_k-1)*nksp_per_block]; + } + _gx[(k*stride_k+stride_k)*nksp_per_block] = s2x; + s0x = s1x; + s1x = s2x; + } + } + } + } + + // hrr + // g(i,j+1) = rirj * g(i,j) + g(i+1,j) + // g(...,k,l+1) = rkrl * g(...,k,l) + g(...,k+1,l) + if (lj > 0) { + __syncthreads(); + if (task_id < ntasks) { + int lk3 = (lk+1)*3; + for (int m = gout_id; m < lk3; m += gout_stride) { + int k = m / 3; + int _ix = m % 3; + double xjxi = rjri[_ix*nksp_per_block]; + double *_gx = g + (_ix*g_size + k*stride_k) * nksp_per_block; + for (int j = 0; j < lj; ++j) { + int ij = (lij-j) + j*stride_j; + s1x = _gx[ij*nksp_per_block]; + for (--ij; ij >= j*stride_j; --ij) { + s0x = _gx[ij*nksp_per_block]; + _gx[(ij+stride_j)*nksp_per_block] = s1x - xjxi * s0x; + s1x = s0x; + } + } + } + } + } + + __syncthreads(); +#pragma unroll + for (int n = 0; n < GOUT_WIDTH; ++n) { + int ijk = gout_start + n*gout_stride+gout_id; + int k = ijk / nfij; + int ij = ijk % nfij; + if (k >= nfk) break; + int addrx = (idx_ij[ij] + idx_k[k] * stride_k) * nksp_per_block; + int addry = (idy_ij[ij] + idy_k[k] * stride_k) * nksp_per_block; + int addrz = (idz_ij[ij] + idz_k[k] * stride_k) * nksp_per_block; + gout[n] += gx[addrx] * gy[addry] * gz[addrz]; + } + } + } + } + + if (pair_ij_idx < bounds.npairs_ij) { + int *ao_loc = envs.ao_loc; + int nbasp = envs.cell0_nbas; + int ncells = envs.bvk_ncells; + int cell_i = ish / nbasp; + int cell0_ish = ish % nbasp; + int cell_j = jsh / nbasp; + int cell0_jsh = jsh % nbasp; + int nrow = bounds.nrow; + int ncol = bounds.ncol; + size_t naux = bounds.naux; + int i0 = ao_loc[cell0_ish] - ao_loc[bounds.ish0]; + int j0 = ao_loc[cell0_jsh] - ao_loc[bounds.jsh0]; + int k0 = ao_loc[ksh] - ao_loc[bounds.ksh0]; + double *eri_tensor = out + (((cell_i * nrow + i0) * ncells + + cell_j) * ncol + j0) * naux + k0; + int nKj = ncells * ncol; + for (int n = 0; n < GOUT_WIDTH; ++n) { + int ijk = gout_start + n*gout_stride+gout_id; + size_t k = ijk / nfij; + size_t ij = ijk % nfij; + if (k >= nfk) break; + size_t i = ij % nfi; + size_t j = ij / nfi; + size_t addr = (i*nKj+j)*naux + k; + eri_tensor[addr] = gout[n]; + } + } + } + } +} + +__global__ +void sr_int3c2e_img_counts_kernel(int *img_counts, PBCInt3c2eEnvVars envs, + float *exps, float *log_coeff, float *aux_exps, + int ish0, int jsh0, int nish, int njsh) +{ + int Ki = blockIdx.x; + int Kj = blockIdx.y; + int cell_i = Ki / nish; + int cell_j = Kj / njsh; + int cell0_ish = Ki % nish + ish0; + int cell0_jsh = Kj % njsh + jsh0; + int nbasp = envs.cell0_nbas; + int ish = cell_i * nbasp + cell0_ish; + int jsh = cell_j * nbasp + cell0_jsh; + int ncells = envs.bvk_ncells; + int nKj = ncells * njsh; + int thread_id = threadIdx.x; + int threads = blockDim.x; + int nimgs = envs.nimgs; + int nimgs2 = nimgs * nimgs; + int cell0_natm = envs.cell0_natm; + int *atm = envs.atm; + int *bas = envs.bas; + double *env = envs.env; + double *img_coords = envs.img_coords; + extern __shared__ float x_cache[]; + float *y_cache = x_cache + cell0_natm; + float *z_cache = y_cache + cell0_natm; + for (int k = thread_id; k < cell0_natm; k += threads) { + double *rk = env + atm[k*ATM_SLOTS+PTR_COORD]; + x_cache[k] = rk[0]; + y_cache[k] = rk[1]; + z_cache[k] = rk[2]; + } + __syncthreads(); + + int li = bas[ANG_OF + ish0*BAS_SLOTS]; + int lj = bas[ANG_OF + jsh0*BAS_SLOTS]; + float ai = exps[cell0_ish]; + float aj = 
exps[cell0_jsh]; + float log_ci = log_coeff[cell0_ish]; + float log_cj = log_coeff[cell0_jsh]; + float aij = ai + aj; + float u = .5f / aij; + float fi = ai / aij; + float fj = aj / aij; + float theta_ij = ai * aj / aij; + float omega = env[PTR_RANGE_OMEGA]; + if (omega == 0) { + omega = 0.1f; + } + float omega2 = omega * omega; + double *ri = env + bas[ish*BAS_SLOTS+PTR_BAS_COORD]; + double *rj = env + bas[jsh*BAS_SLOTS+PTR_BAS_COORD]; + float xi = ri[0]; + float yi = ri[1]; + float zi = ri[2]; + float xj = rj[0]; + float yj = rj[1]; + float zj = rj[2]; + float log_cutoff = envs.log_cutoff; + + // fac_guess = log(sqrt(2.x/(omega*sqrt(pi))) * ((2*li+1)*(2*lj+1)*(2*lk+1))**.5/(4*pi)**1.5) + // ~ between [0, 2] + float fac_guess = .5f - logf(omega2)/4; + float log_fac = log_ci + log_cj + 1.717f - 1.5f*logf(aij) + fac_guess; + + int count = 0; + for (int ijL = thread_id; ijL < nimgs2; ijL += threads) { + int iL = ijL / nimgs; + int jL = ijL % nimgs; + float xiL = xi + img_coords[iL*3+0]; + float yiL = yi + img_coords[iL*3+1]; + float ziL = zi + img_coords[iL*3+2]; + float xjL = xj + img_coords[jL*3+0]; + float yjL = yj + img_coords[jL*3+1]; + float zjL = zj + img_coords[jL*3+2]; + float xjxi = xjL - xiL; + float yjyi = yjL - yiL; + float zjzi = zjL - ziL; + float xij = xjxi * fj + xiL; + float yij = yjyi * fj + yiL; + float zij = zjzi * fj + ziL; + float theta = (omega2 * aij) / (omega2 + aij); + float rr_min = 1e3f; + float theta_rr_min = 1e6f; + for (int k = 0; k < cell0_natm; ++k) { + float dx = xij - x_cache[k]; + float dy = yij - y_cache[k]; + float dz = zij - z_cache[k]; + float rr = dx * dx + dy * dy + dz * dz; + float ak = aux_exps[k]; + float theta_k = theta * ak / (theta + ak); + float theta_rr = theta_k * rr; + if (theta_rr < theta_rr_min) { + theta_rr_min = theta_rr; + rr_min = rr; + } + } + + // exp(- 1/(1/aij+1/ak+1/omega^2) * r_guess^2) < 1e-9 + // => ~ exp(- omega^2 * r_guess^2) < 1e-9 + // => r_guess > 5/omega + // 1/(1/aij+1/ak+1/omega^2)*r_guess/aij in Eq 64 of arXiv:2302.11307 + // ~ omega^2*r_guess/aij ~ omega/aij * 5.f + //float rt_aij = fabs(omega)/aij * 5.; + float rt_aij = omega2 * sqrtf(rr_min) / aij + 1e-9f; + float rr_ij = xjxi * xjxi + yjyi * yjyi + zjzi * zjzi; + float dr = sqrtf(rr_ij); + float dri = fj * dr + rt_aij; + float drj = fi * dr + rt_aij; + float dri_fac = .5f*li * logf(dri*dri + li*u); + float drj_fac = .5f*lj * logf(drj*drj + lj*u); + float estimator = log_fac + dri_fac + drj_fac - theta_ij*rr_ij - theta_rr_min; + if (estimator > log_cutoff) { + count += 1; + } + } + + extern __shared__ int counts[]; + counts[thread_id] = count; + __syncthreads(); + for (int stride = threads / 2; stride > 0; stride /= 2) { + if (thread_id < stride) { + counts[thread_id] += counts[thread_id + stride]; + } + __syncthreads(); + } + if (thread_id == 0) { + img_counts[Ki*nKj+Kj] = counts[0]; + } +} + +__global__ +void sr_int3c2e_img_idx_kernel(int *img_idx, int *img_offsets, int *bas_mapping, + PBCInt3c2eEnvVars envs, + float *exps, float *log_coeff, float *aux_exps, + int ish0, int jsh0, int nish, int njsh) +{ + int thread_id = threadIdx.x; + int threads = blockDim.x; + int ncells = envs.bvk_ncells; + int nKj = ncells * njsh; + int row_id = blockIdx.x; + int bas_ij = bas_mapping[row_id]; + int Ki = bas_ij / nKj; + int Kj = bas_ij % nKj; + int cell_i = Ki / nish; + int cell_j = Kj / njsh; + int cell0_ish = Ki % nish + ish0; + int cell0_jsh = Kj % njsh + jsh0; + int nbasp = envs.cell0_nbas; + int ish = cell_i * nbasp + cell0_ish; + int jsh = cell_j * nbasp + cell0_jsh; + 
int nimgs = envs.nimgs; + int nimgs2 = nimgs * nimgs; + int cell0_natm = envs.cell0_natm; + int *atm = envs.atm; + int *bas = envs.bas; + double *env = envs.env; + double *img_coords = envs.img_coords; + extern __shared__ int8_t mask[]; + uint16_t* cum_count = (uint16_t *)(mask + IMG_BLOCK); + float *x_cache = (float *)(cum_count + threads); + float *y_cache = x_cache + cell0_natm; + float *z_cache = y_cache + cell0_natm; + for (int k = thread_id; k < cell0_natm; k += threads) { + double *rk = env + atm[k*ATM_SLOTS+PTR_COORD]; + x_cache[k] = rk[0]; + y_cache[k] = rk[1]; + z_cache[k] = rk[2]; + } + for (int i = thread_id; i < IMG_BLOCK; i += threads) { + mask[i] = 0; + } + __syncthreads(); + + int li = bas[ANG_OF + ish0*BAS_SLOTS]; + int lj = bas[ANG_OF + jsh0*BAS_SLOTS]; + float ai = exps[cell0_ish]; + float aj = exps[cell0_jsh]; + float log_ci = log_coeff[cell0_ish]; + float log_cj = log_coeff[cell0_jsh]; + float aij = ai + aj; + float u = .5f / aij; + float fi = ai / aij; + float fj = aj / aij; + float theta_ij = ai * aj / aij; + float omega = env[PTR_RANGE_OMEGA]; + if (omega == 0) { + omega = 0.1f; + } + float omega2 = omega * omega; + double *ri = env + bas[ish*BAS_SLOTS+PTR_BAS_COORD]; + double *rj = env + bas[jsh*BAS_SLOTS+PTR_BAS_COORD]; + float xi = ri[0]; + float yi = ri[1]; + float zi = ri[2]; + float xj = rj[0]; + float yj = rj[1]; + float zj = rj[2]; + float log_cutoff = envs.log_cutoff; + + // fac_guess = log(sqrt(2.x/(omega*sqrt(pi))) * ((2*li+1)*(2*lj+1)*(2*lk+1))**.5/(4*pi)**1.5) + // ~ between [0, 2] + float fac_guess = .5f - logf(omega2)/4; + float log_fac = log_ci + log_cj + 1.717f - 1.5f*logf(aij) + fac_guess; + int offset_start = img_offsets[row_id]; + + for (int img_start = 0; img_start < nimgs2; img_start += IMG_BLOCK) { + int block_nimgs2 = MIN(IMG_BLOCK, nimgs2-img_start); + int bacth_size = (block_nimgs2 + threads - 1) / threads; + int ij0 = img_start + thread_id * bacth_size; + int ij1 = MIN(ij0 + bacth_size, nimgs2); + + int count = 0; + for (int ijL = ij0; ijL < ij1; ++ijL) { + int iL = ijL / nimgs; + int jL = ijL % nimgs; + float xiL = xi + img_coords[iL*3+0]; + float yiL = yi + img_coords[iL*3+1]; + float ziL = zi + img_coords[iL*3+2]; + float xjL = xj + img_coords[jL*3+0]; + float yjL = yj + img_coords[jL*3+1]; + float zjL = zj + img_coords[jL*3+2]; + float xjxi = xjL - xiL; + float yjyi = yjL - yiL; + float zjzi = zjL - ziL; + float xij = xjxi * fj + xiL; + float yij = yjyi * fj + yiL; + float zij = zjzi * fj + ziL; + float theta = (omega2 * aij) / (omega2 + aij); + float rr_min = 1e3f; + float theta_rr_min = 1e6f; + for (int k = 0; k < cell0_natm; ++k) { + float dx = xij - x_cache[k]; + float dy = yij - y_cache[k]; + float dz = zij - z_cache[k]; + float rr = dx * dx + dy * dy + dz * dz; + float ak = aux_exps[k]; + float theta_k = theta * ak / (theta + ak); + float theta_rr = theta_k * rr; + if (theta_rr < theta_rr_min) { + theta_rr_min = theta_rr; + rr_min = rr; + } + } + + // exp(- 1/(1/aij+1/ak+1/omega^2) * r_guess^2) < 1e-9 + // => ~ exp(- omega^2 * r_guess^2) < 1e-9 + // => r_guess > 5/omega + // 1/(1/aij+1/ak+1/omega^2)*r_guess/aij in Eq 64 of arXiv:2302.11307 + // ~ omega^2*r_guess/aij ~ omega/aij * 5.f + //float rt_aij = fabs(omega)/aij * 5.; + float rt_aij = omega2 * sqrtf(rr_min) / aij + 1e-9f; + float rr_ij = xjxi * xjxi + yjyi * yjyi + zjzi * zjzi; + float dr = sqrtf(rr_ij); + float dri = fj * dr + rt_aij; + float drj = fi * dr + rt_aij; + float dri_fac = .5f*li * logf(dri*dri + li*u); + float drj_fac = .5f*lj * logf(drj*drj + lj*u); + float 
estimator = log_fac + dri_fac + drj_fac - theta_ij*rr_ij - theta_rr_min; + if (estimator > log_cutoff) { + mask[ijL - img_start] = 1; + count += 1; + } + } + + cum_count[thread_id] = count; + // Up-sweep phase + for (int stride = 1; stride < threads; stride *= 2) { + __syncthreads(); + int index = (thread_id + 1) * stride * 2 - 1; + if (index < threads) { + cum_count[index] += cum_count[index-stride]; + } + } + __syncthreads(); + // Down-sweep phase + for (int stride = threads/4; stride > 0; stride /= 2) { + __syncthreads(); + int index = (thread_id + 1) * stride * 2 - 1; + if (index + stride < threads) { + cum_count[index + stride] += cum_count[index]; + } + } + __syncthreads(); + + int offset = offset_start; + if (thread_id > 0) { + offset += cum_count[thread_id-1]; + } + for (int ijL = ij0; ijL < ij1; ++ijL) { + if (mask[ijL-img_start]) { + img_idx[offset] = ijL; + mask[ijL-img_start] = 0; + ++offset; + } + } + offset_start += cum_count[threads-1]; + __syncthreads(); + } +} diff --git a/gpu4pyscf/lib/pbc/ft_ao.cu b/gpu4pyscf/lib/pbc/ft_ao.cu index d9b6d5e2..40438340 100644 --- a/gpu4pyscf/lib/pbc/ft_ao.cu +++ b/gpu4pyscf/lib/pbc/ft_ao.cu @@ -20,7 +20,7 @@ #include #include "gvhf-rys/vhf.cuh" -#include "ft_ao.h" +#include "ft_ao.cuh" #define GOUT_WIDTH 19 // pi^1.5 @@ -204,7 +204,7 @@ void ft_aopair_kernel(double *out, AFTIntEnvVars envs, AFTBoundsInfo bounds) #pragma unroll for (int n = 0; n < GOUT_WIDTH; ++n) { int ij = n*gout_stride + gout_id; - if (ij >= nfij) continue; + if (ij >= nfij) break; int addrx = idx_ij[ij] * nGv_per_block; int addry = idy_ij[ij] * nGv_per_block; int addrz = idz_ij[ij] * nGv_per_block; @@ -237,7 +237,7 @@ void ft_aopair_kernel(double *out, AFTIntEnvVars envs, AFTBoundsInfo bounds) + Gv_block_id*nGv_per_block + Gv_id) * OF_COMPLEX; for (int n = 0; n < GOUT_WIDTH; ++n) { int ij = n*gout_stride + gout_id; - if (ij >= nfij) continue; + if (ij >= nfij) break; size_t i = ij % nfi; size_t j = ij / nfi; size_t addr = (i*nao+j)*nGv; diff --git a/gpu4pyscf/lib/pbc/ft_ao.h b/gpu4pyscf/lib/pbc/ft_ao.cuh similarity index 100% rename from gpu4pyscf/lib/pbc/ft_ao.h rename to gpu4pyscf/lib/pbc/ft_ao.cuh diff --git a/gpu4pyscf/lib/pbc/int3c2e.cuh b/gpu4pyscf/lib/pbc/int3c2e.cuh new file mode 100644 index 00000000..746fa138 --- /dev/null +++ b/gpu4pyscf/lib/pbc/int3c2e.cuh @@ -0,0 +1,75 @@ +/* + * Copyright 2024 The PySCF Developers. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+#include <stdint.h>
+
+#define WARP_SIZE 32
+// corresponding to 256 threads
+#define WARPS 8
+#define IMG_MASK_SLOTS 1024
+#define L_AUX_MAX 6
+#define SPTAKS_PER_BLOCK 32
+#define IMG_BLOCK 16384
+
+#ifndef HAVE_DEFINED_PBCINT3CENVVARS_H
+#define HAVE_DEFINED_PBCINT3CENVVARS_H
+typedef struct {
+    uint16_t cell0_natm; // in the reference cell
+    uint16_t cell0_nbas; // in the reference cell
+    uint16_t bvk_ncells; // in bvk-cell
+    uint16_t nimgs; // number of images in lattice sum
+    int *atm;
+    int *bas;
+    double *env;
+    int *ao_loc; // in bvk-cell
+    double *img_coords; // vectors in lattice sum
+    float log_cutoff;
+} PBCInt3c2eEnvVars;
+
+typedef struct {
+    uint8_t li;
+    uint8_t lj;
+    uint8_t lk;
+    uint8_t nroots;
+    uint8_t nfi;
+    uint8_t nfij;
+    uint8_t nfk;
+    uint8_t iprim;
+    uint8_t jprim;
+    uint8_t kprim;
+    uint8_t stride_i;
+    uint8_t stride_j;
+    uint8_t stride_k;
+    uint8_t g_size;
+    uint16_t nrow;
+    uint16_t ncol;
+    uint16_t naux;
+    uint16_t nksh;
+    uint16_t ish0;
+    uint16_t jsh0;
+    uint16_t ksh0;
+    int npairs_ij;
+    int *bas_ij_idx;
+    int *img_idx; // indices of img_coords in each shell-pair
+    int *img_offsets; // offset img_idx for each shell-pair
+} PBCInt3c2eBounds;
+
+#ifdef __CUDACC__
+extern __constant__ int c_g_pair_idx[];
+extern __constant__ int c_g_pair_offsets[];
+extern __constant__ int c_g_cart_idx[];
+#endif
+#endif
diff --git a/gpu4pyscf/lib/pbc/pbc_driver.cu b/gpu4pyscf/lib/pbc/pbc_driver.cu
index b800efd6..45ab84cb 100644
--- a/gpu4pyscf/lib/pbc/pbc_driver.cu
+++ b/gpu4pyscf/lib/pbc/pbc_driver.cu
@@ -1,3 +1,19 @@
+/*
+ * Copyright 2024-2025 The PySCF Developers. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
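+// Host-side entry points for the PBC modules: launch wrappers for the AFT
+// (ft_ao) and int3c2e kernels, plus constant-memory initialization.
+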
 #include <stdio.h>
 #include <stdint.h>
 #include <stdlib.h>
@@ -5,23 +21,37 @@
 #include <cuda_runtime.h>
 #include "gvhf-rys/vhf.cuh"
-#include "ft_ao.h"
+#include "int3c2e.cuh"
+#include "ft_ao.cuh"
 
-__constant__ int c_g_pair_idx[3675];
+__constant__ int c_g_pair_idx[3675]; // corresponding to LMAX=4
 __constant__ int c_g_pair_offsets[LMAX1*LMAX1];
+__constant__ int c_g_cart_idx[252]; // corresponding to LMAX=6
 
 extern __global__
 void ft_aopair_kernel(double *out, AFTIntEnvVars envs, AFTBoundsInfo bounds);
 extern __global__
 void ft_aopair_fill_triu(double *out, int *conj_mapping, int bvk_ncells, int nGv);
+extern __global__
+void pbc_int3c2e_kernel(double *out, PBCInt3c2eEnvVars envs, PBCInt3c2eBounds bounds);
+extern __global__
+void sr_int3c2e_img_counts_kernel(int *img_counts, PBCInt3c2eEnvVars envs,
+                                  float *exps, float *log_coeff, float *aux_exps,
+                                  int ish0, int jsh0, int nish, int njsh);
+extern __global__
+void sr_int3c2e_img_idx_kernel(int *img_idx, int *img_offsets, int *bas_mapping,
+                               PBCInt3c2eEnvVars envs,
+                               float *exps, float *log_coeff, float *aux_exps,
+                               int ish0, int jsh0, int nish, int njsh);
 
 int ft_ao_unrolled(double *out, AFTIntEnvVars *envs, AFTBoundsInfo *bounds, int *scheme);
+int int3c2e_unrolled(double *out, PBCInt3c2eEnvVars *envs, PBCInt3c2eBounds *bounds);
 
 extern "C" {
-int PBC_build_ft_ao(double *out, AFTIntEnvVars *envs,
-                    int *scheme, int *shls_slice, int npairs_ij, int ngrids,
-                    int *ish_in_pair, int *jsh_in_pair, double *grids,
-                    int *atm, int natm, int *bas, int nbas, double *env)
+int build_ft_ao(double *out, AFTIntEnvVars *envs,
+                int *scheme, int *shls_slice, int npairs_ij, int ngrids,
+                int *ish_in_pair, int *jsh_in_pair, double *grids,
+                int *atm, int natm, int *bas, int nbas, double *env)
 {
     uint16_t ish0 = shls_slice[0];
     uint16_t jsh0 = shls_slice[2];
@@ -53,13 +83,13 @@
     }
     cudaError_t err = cudaGetLastError();
     if (err != cudaSuccess) {
-        fprintf(stderr, "CUDA Error in PBC_build_ft_ao: %s\n", cudaGetErrorString(err));
+        fprintf(stderr, "CUDA Error in build_ft_ao: %s\n", cudaGetErrorString(err));
         return 1;
     }
     return 0;
 }
 
-int PBC_ft_aopair_fill_triu(double *out, int *conj_mapping, int nao, int bvk_ncells, int nGv)
+int ft_aopair_fill_triu(double *out, int *conj_mapping, int nao, int bvk_ncells, int nGv)
 {
     int nGv2 = nGv * 2; // *2 for complex number
     int threads = 1024;
@@ -67,18 +97,147 @@
     ft_aopair_fill_triu<<>>(out, conj_mapping, bvk_ncells, nGv2);
     cudaError_t err = cudaGetLastError();
     if (err != cudaSuccess) {
-        fprintf(stderr, "CUDA Error in PBC_ft_aopair_fill_triu: %s\n", cudaGetErrorString(err));
+        fprintf(stderr, "CUDA Error in ft_aopair_fill_triu: %s\n", cudaGetErrorString(err));
+        return 1;
+    }
+    return 0;
+}
+
+int fill_int3c2e(double *out, PBCInt3c2eEnvVars *envs,
+                 int *scheme, int *shls_slice, int bvk_ncells,
+                 int nrow, int ncol, int naux, int npairs_ij,
+                 int *bas_ij_idx, int *img_idx, int *img_offsets,
+                 int *atm, int natm, int *bas, int nbas, double *env)
+{
+    uint16_t ish0 = shls_slice[0];
+    uint16_t jsh0 = shls_slice[2];
+    uint16_t ksh0 = shls_slice[4] + nbas;
+    uint16_t ksh1 = shls_slice[5] + nbas;
+    uint16_t nksh = ksh1 - ksh0;
+    uint8_t li = bas[ANG_OF + ish0*BAS_SLOTS];
+    uint8_t lj = bas[ANG_OF + jsh0*BAS_SLOTS];
+    uint8_t lk = bas[ANG_OF + ksh0*BAS_SLOTS];
+    uint8_t iprim = bas[NPRIM_OF + ish0*BAS_SLOTS];
+    uint8_t jprim = bas[NPRIM_OF + jsh0*BAS_SLOTS];
+    uint8_t kprim = bas[NPRIM_OF + ksh0*BAS_SLOTS];
+    uint8_t nfi = (li+1)*(li+2)/2;
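+    // Cartesian component counts per shell: nf = (l+1)(l+2)/2, i.e.
+    // l = 0,1,2,3,4 gives 1,3,6,10,15 functions.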
+    uint8_t nfj = (lj+1)*(lj+2)/2;
+    uint8_t nfk = (lk+1)*(lk+2)/2;
+    uint8_t nfij = nfi * nfj;
+    uint8_t order = li + lj + lk;
+    uint8_t nroots = order / 2 + 1;
+    double omega = env[PTR_RANGE_OMEGA];
+    if (omega < 0) { // SR ERIs
+        nroots *= 2;
+    }
+    uint8_t stride_i = 1;
+    uint8_t stride_j = li + 1;
+    uint8_t stride_k = stride_j * (lj + 1);
+    // up to (gg|i)
+    uint8_t g_size = stride_k * (lk + 1);
+    PBCInt3c2eBounds bounds = {li, lj, lk, nroots, nfi, nfij, nfk,
+        iprim, jprim, kprim, stride_i, stride_j, stride_k, g_size,
+        (uint16_t)nrow, (uint16_t)ncol, (uint16_t)naux, nksh, ish0, jsh0, ksh0,
+        npairs_ij, bas_ij_idx, img_idx, img_offsets};
+
+    if (!int3c2e_unrolled(out, envs, &bounds)) {
+        int nksh_per_block = scheme[0];
+        int gout_stride = scheme[1];
+        int nsp_per_block = scheme[2];
+        dim3 threads(nksh_per_block, gout_stride, nsp_per_block);
+        int tasks_per_block = SPTAKS_PER_BLOCK * nsp_per_block;
+        int sp_blocks = (npairs_ij + tasks_per_block - 1) / tasks_per_block;
+        int ksh_blocks = (nksh + nksh_per_block - 1) / nksh_per_block;
+        dim3 blocks(sp_blocks, ksh_blocks);
+        int buflen = (nroots*2+g_size*3+6) * (nksh_per_block * nsp_per_block) * sizeof(double);
+        pbc_int3c2e_kernel<<<blocks, threads, buflen>>>(out, *envs, bounds);
+    }
+    cudaError_t err = cudaGetLastError();
+    if (err != cudaSuccess) {
+        fprintf(stderr, "CUDA Error in fill_int3c2e: %s\n", cudaGetErrorString(err));
+        return 1;
+    }
+    return 0;
+}
+
+int int3c2e_img_counts(int *img_counts, PBCInt3c2eEnvVars *envs,
+                       int *shls_slice, float *exps, float *log_cs, float *aux_exps,
+                       int bvk_ncells, int cell0_natm)
+{
+    int ish0 = shls_slice[0];
+    int ish1 = shls_slice[1];
+    int jsh0 = shls_slice[2];
+    int jsh1 = shls_slice[3];
+    int nish = ish1 - ish0;
+    int njsh = jsh1 - jsh0;
+    dim3 blocks(bvk_ncells*nish, bvk_ncells*njsh);
+    int buflen = cell0_natm * 3 * sizeof(float);
+    int threads = 512;
+    buflen = MAX(buflen, threads*sizeof(int));
+    sr_int3c2e_img_counts_kernel<<<blocks, threads, buflen>>>(
+        img_counts, *envs, exps, log_cs, aux_exps, ish0, jsh0, nish, njsh);
+    cudaError_t err = cudaGetLastError();
+    if (err != cudaSuccess) {
+        fprintf(stderr, "CUDA Error in int3c2e_img_counts: %s\n", cudaGetErrorString(err));
         return 1;
     }
     return 0;
 }
 
-int PBC_FT_init_constant(int *g_pair_idx, int *offsets,
-                         double *env, int env_size, int shm_size)
+int int3c2e_img_idx(int *img_idx, int *img_offsets, int *bas_mapping, int nrow,
+                    PBCInt3c2eEnvVars *envs,
+                    int *shls_slice, float *exps, float *log_cs, float *aux_exps,
+                    int bvk_ncells, int cell0_natm)
+{
+    int ish0 = shls_slice[0];
+    int ish1 = shls_slice[1];
+    int jsh0 = shls_slice[2];
+    int jsh1 = shls_slice[3];
+    int nish = ish1 - ish0;
+    int njsh = jsh1 - jsh0;
+    dim3 blocks(bvk_ncells*nish, bvk_ncells*njsh);
+    int buflen = cell0_natm * 3 * sizeof(float);
+    int threads = 512;
+    buflen = buflen + threads*sizeof(uint16_t) + IMG_BLOCK;
+    sr_int3c2e_img_idx_kernel<<<blocks, threads, buflen>>>(
+        img_idx, img_offsets, bas_mapping, *envs,
+        exps, log_cs, aux_exps, ish0, jsh0, nish, njsh);
+    cudaError_t err = cudaGetLastError();
+    if (err != cudaSuccess) {
+        fprintf(stderr, "CUDA Error in int3c2e_img_idx: %s\n", cudaGetErrorString(err));
+        return 1;
+    }
+    return 0;
+}
+
+int init_constant(int *g_pair_idx, int *offsets,
+                  double *env, int env_size, int shm_size)
 {
     cudaMemcpyToSymbol(c_g_pair_idx, g_pair_idx, 3675*sizeof(int));
     cudaMemcpyToSymbol(c_g_pair_offsets, offsets, sizeof(int) * LMAX1*LMAX1);
+
+    int *g_cart_idx = (int *)malloc(252*sizeof(int));
+    int *idx, *idy, *idz;
+    idx = g_cart_idx;
+    for (int l = 0; l <= L_AUX_MAX; ++l) {
+        int nf = (l + 1) * (l + 2) / 2;
+        idy 
= idx + nf; + idz = idy + nf; + for (int i = 0, ix = l; ix >= 0; --ix) { + for (int iy = l - ix; iy >= 0; --iy, ++i) { + int iz = l - ix - iy; + idx[i] = ix; + idy[i] = iy; + idz[i] = iz; + } } + idx += nf * 3; + } + cudaMemcpyToSymbol(c_g_cart_idx, g_cart_idx, 252*sizeof(int)); + free(g_cart_idx); + cudaFuncSetAttribute(ft_aopair_kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, shm_size); + cudaFuncSetAttribute(pbc_int3c2e_kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, shm_size); cudaError_t err = cudaGetLastError(); if (err != cudaSuccess) { fprintf(stderr, "Failed to set CUDA shm size %d: %s\n", shm_size, diff --git a/gpu4pyscf/lib/pbc/rys_roots.cu b/gpu4pyscf/lib/pbc/rys_roots.cu new file mode 100644 index 00000000..a8700bd2 --- /dev/null +++ b/gpu4pyscf/lib/pbc/rys_roots.cu @@ -0,0 +1,84 @@ +/* + * Copyright 2024-2025 The PySCF Developers. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "gvhf-rys/rys_roots.cuh" + +#define SQRTPIE4 .8862269254527580136 +#define PIE4 .7853981633974483096 + +__device__ +static void rys_roots(int nroots, double x, double *rw, + int block_size, int worker_id, int workers) +{ + if (x < 3.e-7){ + int off = nroots * (nroots - 1) / 2; + for (int i = worker_id; i < nroots; i += workers) { + rw[(i*2 )*block_size] = ROOT_SMALLX_R0[off+i] + ROOT_SMALLX_R1[off+i] * x; + rw[(i*2+1)*block_size] = ROOT_SMALLX_W0[off+i] + ROOT_SMALLX_W1[off+i] * x; + } + return; + } + + if (nroots == 1) { + if (worker_id == 0) { + double tt = sqrt(x); + double fmt0 = SQRTPIE4 / tt * erf(tt); + rw[block_size] = fmt0; + double e = exp(-x); + double b = .5 / x; + double fmt1 = b * (fmt0 - e); + rw[0] = fmt1 / fmt0; + } + return; + } + + if (x > 35+nroots*5) { + int off = nroots * (nroots - 1) / 2; + double t = sqrt(PIE4/x); + for (int i = worker_id; i < nroots; i += workers) { + rw[(i*2 )*block_size] = ROOT_LARGEX_R_DATA[off+i] / x; + rw[(i*2+1)*block_size] = ROOT_LARGEX_W_DATA[off+i] * t; + } + return; + } + + double *datax = ROOT_RW_DATA + DEGREE1*INTERVALS * nroots*(nroots-1); + int it = (int)(x * .4); + double u = (x - it * 2.5) * 0.8 - 1.; + double u2 = u * 2.; + for (int rt_id = worker_id; rt_id < nroots*2; rt_id += workers) { + double *c = datax + rt_id * DEGREE1 * INTERVALS; + //for i in range(2, degree + 1): + // c0, c1 = c[degree-i] - c1, c0 + c1*u2 + double c0 = c[it + DEGREE *INTERVALS]; + double c1 = c[it +(DEGREE-1)*INTERVALS]; + double c2, c3; +#pragma unroll + for (int n = DEGREE-2; n > 0; n-=2) { + c2 = c[it + n *INTERVALS] - c1; + c3 = c0 + c1*u2; + c1 = c2 + c3*u2; + c0 = c[it +(n-1)*INTERVALS] - c3; + } + if (DEGREE % 2 == 0) { + c2 = c[it] - c1; + c3 = c0 + c1*u2; + rw[rt_id*block_size] = c2 + c3*u; + } else { + rw[rt_id*block_size] = c0 + c1*u; + } + } +} diff --git a/gpu4pyscf/lib/pbc/unrolled_ft_ao.cu b/gpu4pyscf/lib/pbc/unrolled_ft_ao.cu index d2845274..d95d22b2 100644 --- a/gpu4pyscf/lib/pbc/unrolled_ft_ao.cu +++ b/gpu4pyscf/lib/pbc/unrolled_ft_ao.cu @@ -1,5 +1,5 @@ /* - * Copyright 2024 The PySCF 
Developers. All Rights Reserved. + * Copyright 2024-2025 The PySCF Developers. All Rights Reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -19,7 +19,7 @@ #include #include #include "gvhf-rys/vhf.cuh" -#include "ft_ao.h" +#include "ft_ao.cuh" #define OVERLAP_FAC 5.56832799683170787 #define OF_COMPLEX 2 diff --git a/gpu4pyscf/lib/pbc/unrolled_int3c2e.cu b/gpu4pyscf/lib/pbc/unrolled_int3c2e.cu new file mode 100644 index 00000000..0c7e0174 --- /dev/null +++ b/gpu4pyscf/lib/pbc/unrolled_int3c2e.cu @@ -0,0 +1,22 @@ +/* + * Copyright 2024-2025 The PySCF Developers. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "int3c2e.cuh" + +int int3c2e_unrolled(double *out, PBCInt3c2eEnvVars *envs, PBCInt3c2eBounds *bounds) +{ + return 0; +} diff --git a/gpu4pyscf/lib/solvent/pcm.cu b/gpu4pyscf/lib/solvent/pcm.cu index 7615f314..4d34ce97 100644 --- a/gpu4pyscf/lib/solvent/pcm.cu +++ b/gpu4pyscf/lib/solvent/pcm.cu @@ -78,9 +78,9 @@ static void _pcm_d_s(double *matrix_d, double *matrix_s, __global__ static void _pcm_dD_dS(double *matrix_dd, double *matrix_ds, - const double *coords, const double *norm_vec, const double *r_vdw, - const double *charge_exp, const double *switch_fun, - int n) + const double *coords, const double *norm_vec, + const double *charge_exp, + int n) { int i = blockIdx.x * blockDim.x + threadIdx.x; int j = blockIdx.y * blockDim.y + threadIdx.y; @@ -130,6 +130,127 @@ static void _pcm_dD_dS(double *matrix_dd, double *matrix_ds, } } +__global__ +static void _pcm_d2D_d2S(double *matrix_d2D, double *matrix_d2S, + const double *coords, const double *norm_vec, + const double *charge_exp, + int n) +{ + const int i = blockIdx.x * blockDim.x + threadIdx.x; + const int j = blockIdx.y * blockDim.y + threadIdx.y; + if (i >= n || j >= n) { + return; + } + + // calculate xi + const double ei = charge_exp[i]; + const double ej = charge_exp[j]; + const double eij = ei * ej / sqrt(ei*ei + ej*ej); + + // calculate r + const double dx = coords[3*i] - coords[3*j]; + const double dy = coords[3*i+1] - coords[3*j+1]; + const double dz = coords[3*i+2] - coords[3*j+2]; + const double rij = norm3d(dx, dy, dz); + const double rij_1 = (i != j) ? 
(1.0 / rij) : 0.0; // This guarantees that if i == j, all matrix elements = 0 + const double rij_2 = rij_1 * rij_1; + const double rij_3 = rij_2 * rij_1; + const double rij_4 = rij_2 * rij_2; + const double rij_5 = rij_2 * rij_3; + const double eij2 = eij * eij; + + const double eij_rij = eij * rij; + const double erf_eij_rij = erf(eij_rij); + const double exp_minus_eij2_rij2 = exp(-eij_rij * eij_rij); + const double two_eij_over_sqrt_pi = 2.0 * eij / SQRT_PI; + const double two_eij_over_sqrt_pi_exp_minus_eij2_rij2 = exp_minus_eij2_rij2 * two_eij_over_sqrt_pi; + + const double S_direct_product_prefactor = -two_eij_over_sqrt_pi_exp_minus_eij2_rij2 * (3 * rij_4 + 2 * eij2 * rij_2) + + 3 * rij_5 * erf_eij_rij; + const double S_xyz_diagonal_prefactor = two_eij_over_sqrt_pi_exp_minus_eij2_rij2 * rij_2 - rij_3 * erf_eij_rij; + + const int n2 = n * n; + matrix_d2S[i*n + j ] = dx * dx * S_direct_product_prefactor + S_xyz_diagonal_prefactor; + matrix_d2S[i*n + j + n2 ] = dx * dy * S_direct_product_prefactor; + matrix_d2S[i*n + j + n2 * 2] = dx * dz * S_direct_product_prefactor; + matrix_d2S[i*n + j + n2 * 3] = dy * dx * S_direct_product_prefactor; + matrix_d2S[i*n + j + n2 * 4] = dy * dy * S_direct_product_prefactor + S_xyz_diagonal_prefactor; + matrix_d2S[i*n + j + n2 * 5] = dy * dz * S_direct_product_prefactor; + matrix_d2S[i*n + j + n2 * 6] = dz * dx * S_direct_product_prefactor; + matrix_d2S[i*n + j + n2 * 7] = dz * dy * S_direct_product_prefactor; + matrix_d2S[i*n + j + n2 * 8] = dz * dz * S_direct_product_prefactor + S_xyz_diagonal_prefactor; + + if (matrix_d2D != NULL) { + const double nxj = norm_vec[3*j]; + const double nyj = norm_vec[3*j+1]; + const double nzj = norm_vec[3*j+2]; + const double nj_rij = dx * nxj + dy * nyj + dz * nzj; + + const double eij4 = eij2 * eij2; + const double rij_6 = rij_4 * rij_2; + const double rij_7 = rij_4 * rij_3; + + const double D_direct_product_prefactor = (-two_eij_over_sqrt_pi_exp_minus_eij2_rij2 * (15 * rij_6 + 10 * eij2 * rij_4 + 4 * eij4 * rij_2) + + 15 * rij_7 * erf_eij_rij) * nj_rij; + matrix_d2D[i*n + j ] = D_direct_product_prefactor * dx * dx - S_direct_product_prefactor * (dx * nxj + dx * nxj + nj_rij); + matrix_d2D[i*n + j + n2 ] = D_direct_product_prefactor * dx * dy - S_direct_product_prefactor * (dy * nxj + dx * nyj); + matrix_d2D[i*n + j + n2 * 2] = D_direct_product_prefactor * dx * dz - S_direct_product_prefactor * (dz * nxj + dx * nzj); + matrix_d2D[i*n + j + n2 * 3] = D_direct_product_prefactor * dy * dx - S_direct_product_prefactor * (dx * nyj + dy * nxj); + matrix_d2D[i*n + j + n2 * 4] = D_direct_product_prefactor * dy * dy - S_direct_product_prefactor * (dy * nyj + dy * nyj + nj_rij); + matrix_d2D[i*n + j + n2 * 5] = D_direct_product_prefactor * dy * dz - S_direct_product_prefactor * (dz * nyj + dy * nzj); + matrix_d2D[i*n + j + n2 * 6] = D_direct_product_prefactor * dz * dx - S_direct_product_prefactor * (dx * nzj + dz * nxj); + matrix_d2D[i*n + j + n2 * 7] = D_direct_product_prefactor * dz * dy - S_direct_product_prefactor * (dy * nzj + dz * nyj); + matrix_d2D[i*n + j + n2 * 8] = D_direct_product_prefactor * dz * dz - S_direct_product_prefactor * (dz * nzj + dz * nzj + nj_rij); + } +} + +__global__ +static void _pcm_d2F_to_d2Sii(const double* F, const double* dF, const double* d2F, const double* charge_exp, + double* d2Sii, const int n_atom, const int n_grid) +{ + const int i_grid = blockIdx.x * blockDim.x + threadIdx.x; + const int ij_atom = blockIdx.y * blockDim.y + threadIdx.y; + if (i_grid >= n_grid || ij_atom >= n_atom * n_atom) 
{ + return; + } + + const int i_atom = ij_atom / n_atom; + const int j_atom = ij_atom % n_atom; + + const double zeta = charge_exp[i_grid]; + const double F_value = F[i_grid]; + const double F_1 = 1.0 / F_value; + const double F_2 = F_1 * F_1; + const double combined_factor = SQRT2_PI * zeta * F_2; + + const double dFix = dF[(i_atom * 3 ) * n_grid + i_grid]; + const double dFiy = dF[(i_atom * 3 + 1) * n_grid + i_grid]; + const double dFiz = dF[(i_atom * 3 + 2) * n_grid + i_grid]; + const double dFjx = dF[(j_atom * 3 ) * n_grid + i_grid]; + const double dFjy = dF[(j_atom * 3 + 1) * n_grid + i_grid]; + const double dFjz = dF[(j_atom * 3 + 2) * n_grid + i_grid]; + + const double d2Fixjx = d2F[((i_atom * n_atom + j_atom) * 9 + 0 * 3 ) * n_grid + i_grid]; + const double d2Fixjy = d2F[((i_atom * n_atom + j_atom) * 9 + 0 * 3 + 1) * n_grid + i_grid]; + const double d2Fixjz = d2F[((i_atom * n_atom + j_atom) * 9 + 0 * 3 + 2) * n_grid + i_grid]; + const double d2Fiyjx = d2F[((i_atom * n_atom + j_atom) * 9 + 1 * 3 ) * n_grid + i_grid]; + const double d2Fiyjy = d2F[((i_atom * n_atom + j_atom) * 9 + 1 * 3 + 1) * n_grid + i_grid]; + const double d2Fiyjz = d2F[((i_atom * n_atom + j_atom) * 9 + 1 * 3 + 2) * n_grid + i_grid]; + const double d2Fizjx = d2F[((i_atom * n_atom + j_atom) * 9 + 2 * 3 ) * n_grid + i_grid]; + const double d2Fizjy = d2F[((i_atom * n_atom + j_atom) * 9 + 2 * 3 + 1) * n_grid + i_grid]; + const double d2Fizjz = d2F[((i_atom * n_atom + j_atom) * 9 + 2 * 3 + 2) * n_grid + i_grid]; + + d2Sii[((i_atom * n_atom + j_atom) * 9 + 0 * 3 ) * n_grid + i_grid] = combined_factor * (2 * F_1 * dFix * dFjx - d2Fixjx); + d2Sii[((i_atom * n_atom + j_atom) * 9 + 0 * 3 + 1) * n_grid + i_grid] = combined_factor * (2 * F_1 * dFix * dFjy - d2Fixjy); + d2Sii[((i_atom * n_atom + j_atom) * 9 + 0 * 3 + 2) * n_grid + i_grid] = combined_factor * (2 * F_1 * dFix * dFjz - d2Fixjz); + d2Sii[((i_atom * n_atom + j_atom) * 9 + 1 * 3 ) * n_grid + i_grid] = combined_factor * (2 * F_1 * dFiy * dFjx - d2Fiyjx); + d2Sii[((i_atom * n_atom + j_atom) * 9 + 1 * 3 + 1) * n_grid + i_grid] = combined_factor * (2 * F_1 * dFiy * dFjy - d2Fiyjy); + d2Sii[((i_atom * n_atom + j_atom) * 9 + 1 * 3 + 2) * n_grid + i_grid] = combined_factor * (2 * F_1 * dFiy * dFjz - d2Fiyjz); + d2Sii[((i_atom * n_atom + j_atom) * 9 + 2 * 3 ) * n_grid + i_grid] = combined_factor * (2 * F_1 * dFiz * dFjx - d2Fizjx); + d2Sii[((i_atom * n_atom + j_atom) * 9 + 2 * 3 + 1) * n_grid + i_grid] = combined_factor * (2 * F_1 * dFiz * dFjy - d2Fizjy); + d2Sii[((i_atom * n_atom + j_atom) * 9 + 2 * 3 + 2) * n_grid + i_grid] = combined_factor * (2 * F_1 * dFiz * dFjz - d2Fizjz); +} + extern "C" { int pcm_d_s(cudaStream_t stream, double *matrix_d, double *matrix_s, const double *coords, const double *norm_vec, const double *r_vdw, @@ -149,15 +270,47 @@ int pcm_d_s(cudaStream_t stream, double *matrix_d, double *matrix_s, } int pcm_dd_ds(cudaStream_t stream, double *matrix_dD, double *matrix_dS, - const double *coords, const double *norm_vec, const double *r_vdw, - const double *charge_exp, const double *switch_fun, - int n) + const double *coords, const double *norm_vec, + const double *charge_exp, + int n) { int ntilex = (n + THREADS - 1) / THREADS; int ntiley = (n + THREADS - 1) / THREADS; dim3 threads(THREADS, THREADS); dim3 blocks(ntilex, ntiley); - _pcm_dD_dS<<>>(matrix_dD, matrix_dS, coords, norm_vec, r_vdw, charge_exp, switch_fun, n); + _pcm_dD_dS<<>>(matrix_dD, matrix_dS, coords, norm_vec, charge_exp, n); + cudaError_t err = cudaGetLastError(); + if (err != 
cudaSuccess) { + return 1; + } + return 0; +} + +int pcm_d2d_d2s(cudaStream_t stream, double *matrix_d2D, double *matrix_d2S, + const double *coords, const double *norm_vec, + const double *charge_exp, + int n) +{ + const int ntilex = (n + THREADS - 1) / THREADS; + const int ntiley = (n + THREADS - 1) / THREADS; + const dim3 threads(THREADS, THREADS); + const dim3 blocks(ntilex, ntiley); + _pcm_d2D_d2S<<>>(matrix_d2D, matrix_d2S, coords, norm_vec, charge_exp, n); + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) { + return 1; + } + return 0; +} + +int pcm_d2f_to_d2sii(cudaStream_t stream, const double* F, const double* dF, const double* d2F, const double* charge_exp, + double* d2Sii, const int n_atom, const int n_grid) +{ + const int ntilex = (n_grid + THREADS - 1) / THREADS; + const int ntiley = (n_atom * n_atom + THREADS - 1) / THREADS; + const dim3 threads(THREADS, THREADS); + const dim3 blocks(ntilex, ntiley); + _pcm_d2F_to_d2Sii<<>>(F, dF, d2F, charge_exp, d2Sii, n_atom, n_grid); cudaError_t err = cudaGetLastError(); if (err != cudaSuccess) { return 1; diff --git a/gpu4pyscf/lib/utils.py b/gpu4pyscf/lib/utils.py index 0b7c613f..5f38a29c 100644 --- a/gpu4pyscf/lib/utils.py +++ b/gpu4pyscf/lib/utils.py @@ -105,6 +105,7 @@ def device(obj): def format_sys_info(): '''Format a list of system information for printing.''' from cupyx._runtime import get_runtime_info + from gpu4pyscf.__config__ import num_devices, mem_fraction, props as device_props pyscf_info = lib.repo_info(pyscf.__file__) gpu4pyscf_info = lib.repo_info(os.path.join(__file__, '..', '..')) @@ -112,7 +113,6 @@ def format_sys_info(): cuda_version = f"{cuda_version // 1000}.{(cuda_version % 1000) // 10}" runtime_info = get_runtime_info() - device_props = cupy.cuda.runtime.getDeviceProperties(0) result = [ f'System: {platform.uname()} Threads {lib.num_threads()}', f'Python {sys.version}', @@ -134,6 +134,8 @@ def format_sys_info(): 'Device info', f' Device name {device_props["name"]}', f' Device global memory {device_props["totalGlobalMem"] / 1024**3:.2f} GB', + f' CuPy memory fraction {mem_fraction}', + f' Num. 
Devices {num_devices}', f'GPU4PySCF {gpu4pyscf.__version__}', f'GPU4PySCF path {gpu4pyscf_info["path"]}' ] diff --git a/gpu4pyscf/mp/dfmp2.py b/gpu4pyscf/mp/dfmp2.py index 92652402..da398dcb 100644 --- a/gpu4pyscf/mp/dfmp2.py +++ b/gpu4pyscf/mp/dfmp2.py @@ -20,7 +20,7 @@ from gpu4pyscf.mp import mp2 from gpu4pyscf.lib import logger from gpu4pyscf.lib.cupy_helper import contract, tag_array, reduce_to_device -from gpu4pyscf.__config__ import _streams, _num_devices +from gpu4pyscf.__config__ import _streams, num_devices from pyscf import __config__ WITH_T2 = getattr(__config__, 'mp_dfmp2_with_t2', True) @@ -45,8 +45,8 @@ def _dfmp2_tasks(mp, mo_coeff, mo_energy, device_id=0): return Lov def get_occ_blk(Lov_dist, i, nocc, nvir): - occ_blk_dist = [None] * _num_devices - for device_id in range(_num_devices): + occ_blk_dist = [None] * num_devices + for device_id in range(num_devices): with cupy.cuda.Device(device_id), _streams[device_id]: Lov = Lov_dist[device_id] mat = cupy.dot(Lov[:,i*nvir:(i+1)*nvir].T, @@ -73,8 +73,8 @@ def kernel(mp, mo_energy=None, mo_coeff=None, eris=None, with_t2=WITH_T2, # Submit tasks to different devices futures = [] - with ThreadPoolExecutor(max_workers=_num_devices) as executor: - for device_id in range(_num_devices): + with ThreadPoolExecutor(max_workers=num_devices) as executor: + for device_id in range(num_devices): future = executor.submit(_dfmp2_tasks, mp, mo_coeff, mo_energy, device_id=device_id) futures.append(future) diff --git a/gpu4pyscf/pbc/df/aft.py b/gpu4pyscf/pbc/df/aft.py index 5f9edc37..4bc4aa50 100644 --- a/gpu4pyscf/pbc/df/aft.py +++ b/gpu4pyscf/pbc/df/aft.py @@ -1,4 +1,4 @@ -# Copyright 2021-2024 The PySCF Developers. All Rights Reserved. +# Copyright 2024-2025 The PySCF Developers. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -27,7 +27,6 @@ from pyscf.pbc.gto.pseudo import pp_int from pyscf.pbc.lib.kpts_helper import is_zero from pyscf.pbc.df import ft_ao -from pyscf.pbc.df.aft import _check_kpts from pyscf.pbc.tools import k2gamma from gpu4pyscf.pbc.tools.pbc import get_coulG from gpu4pyscf.pbc.df import aft_jk @@ -201,3 +200,19 @@ def get_jk(self, dm, hermi=1, kpts=None, kpts_band=None, to_gpu = utils.to_gpu device = utils.device to_cpu = utils.to_cpu + +def _check_kpts(mydf, kpts): + '''Check if the argument kpts is a single k-point''' + if kpts is None: + kpts = mydf.kpts + if kpts is None: + kpts = np.zeros((1, 3)) + is_single_kpt = True + else: + kpts = np.asarray(kpts) + is_single_kpt = kpts.ndim == 1 or is_zero(kpts) + else: + kpts = np.asarray(kpts) + is_single_kpt = kpts.ndim == 1 + kpts = kpts.reshape(-1,3) + return kpts, is_single_kpt diff --git a/gpu4pyscf/pbc/df/aft_jk.py b/gpu4pyscf/pbc/df/aft_jk.py index 225f97cb..040fc955 100644 --- a/gpu4pyscf/pbc/df/aft_jk.py +++ b/gpu4pyscf/pbc/df/aft_jk.py @@ -1,4 +1,4 @@ -# Copyright 2021-2024 The PySCF Developers. All Rights Reserved. +# Copyright 2024-2025 The PySCF Developers. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/gpu4pyscf/pbc/df/df.py b/gpu4pyscf/pbc/df/df.py index 756c94df..45d41f22 100644 --- a/gpu4pyscf/pbc/df/df.py +++ b/gpu4pyscf/pbc/df/df.py @@ -1,4 +1,4 @@ -# Copyright 2021-2024 The PySCF Developers. All Rights Reserved. +# Copyright 2024-2025 The PySCF Developers. All Rights Reserved. 
# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -21,6 +21,7 @@ __all__ = ['GDF'] +import warnings import ctypes import tempfile import numpy as np @@ -28,20 +29,19 @@ from pyscf import lib from pyscf.pbc.df import aft as aft_cpu from pyscf.pbc.df import df as df_cpu -from pyscf.pbc.df.aft import _check_kpts from pyscf.pbc.df.gdf_builder import libpbc -from pyscf.pbc.lib.kpts_helper import is_zero, unique -from pyscf.pbc.df.rsdf_builder import _RSGDFBuilder, _RSNucBuilder +from pyscf.pbc.lib.kpts_helper import is_zero from gpu4pyscf.lib import logger -from gpu4pyscf.pbc.df import df_jk -from gpu4pyscf.lib.cupy_helper import return_cupy_array, pack_tril, unpack_tril +from gpu4pyscf.pbc.df import df_jk, rsdf_builder +from gpu4pyscf.pbc.df.aft import _check_kpts +from gpu4pyscf.pbc.tools.k2gamma import kpts_to_kmesh +from gpu4pyscf.lib.cupy_helper import return_cupy_array, pack_tril, get_avail_mem from gpu4pyscf.lib import utils class GDF(lib.StreamObject): '''Gaussian density fitting ''' blockdim = df_cpu.GDF.blockdim - _dataname = 'j3c' _prefer_ccdf = False force_dm_kbuild = False @@ -56,51 +56,25 @@ class GDF(lib.StreamObject): reset = df_cpu.GDF.reset dump_flags = df_cpu.GDF.dump_flags - def build(self, j_only=None, with_j3c=True, kpts_band=None): + def build(self, j_only=None, kpts_band=None): + warnings.warn( + 'PBC.df is currently experimental and subject to significant changes.') if j_only is not None: self._j_only = j_only - if self.kpts_band is not None: - self.kpts_band = np.reshape(self.kpts_band, (-1,3)) - assert kpts_band is None + assert kpts_band is None and self.kpts_band is None self.check_sanity() self.dump_flags() + cell = self.cell + auxcell = df_cpu.make_auxcell(cell, self.auxbasis, self.exp_to_discard) + self.auxcell = auxcell - self.auxcell = df_cpu.make_auxcell(self.cell, self.auxbasis, - self.exp_to_discard) - - if with_j3c and self._cderi_to_save is not None: - if isinstance(self._cderi_to_save, str): - cderi = self._cderi_to_save - else: - cderi = self._cderi_to_save.name - self._cderi = cderi - t1 = (logger.process_clock(), logger.perf_counter()) - self._make_j3c(self.cell, self.auxcell, None, cderi) - t1 = logger.timer_debug1(self, 'j3c', *t1) + t1 = (logger.process_clock(), logger.perf_counter()) + self._cderi, self._cderip = rsdf_builder.build_cderi( + cell, auxcell, self.kpts, j_only=j_only) + t1 = logger.timer_debug1(self, 'j3c', *t1) return self - def _make_j3c(self, cell=None, auxcell=None, kptij_lst=None, cderi_file=None): - if cell is None: cell = self.cell - if auxcell is None: auxcell = self.auxcell - if cderi_file is None: cderi_file = self._cderi_to_save - - # Remove duplicated k-points. Duplicated kpts may lead to a buffer - # located in incore.wrap_int3c larger than necessary. Integral code - # only fills necessary part of the buffer, leaving some space in the - # buffer unfilled. 
-        if self.kpts_band is None:
-            kpts_union = self.kpts
-        else:
-            kpts_union = unique(np.vstack([self.kpts, self.kpts_band]))[0]
-
-        dfbuilder = _RSGDFBuilder(cell, auxcell, kpts_union)
-        dfbuilder.mesh = self.mesh
-        dfbuilder.linear_dep_threshold = self.linear_dep_threshold
-        j_only = self._j_only or len(kpts_union) == 1
-        dfbuilder.make_j3c(cderi_file, j_only=j_only, dataname=self._dataname,
-                           kptij_lst=kptij_lst)
-
     has_kpts = df_cpu.GDF.has_kpts
     weighted_coulG = return_cupy_array(aft_cpu.weighted_coulG)
     pw_loop = NotImplemented
@@ -108,48 +82,72 @@ def _make_j3c(self, cell=None, auxcell=None, kptij_lst=None, cderi_file=None):
     get_naoaux = df_cpu.GDF.get_naoaux
     range_coulomb = aft_cpu.AFTDFMixin.range_coulomb
 
-    def sr_loop(self, kpti_kptj=np.zeros((2,3)), max_memory=2000,
-                compact=True, blksize=None, aux_slice=None):
-        '''Short range part'''
-        assert aux_slice is None
+    def sr_loop(self, ki, kj, compact=True, blksize=None):
+        '''Iterator for the 3-index cderi tensor over the auxiliary dimension'''
         if self._cderi is None:
             self.build()
         cell = self.cell
-        kpti, kptj = kpti_kptj
-        unpack = is_zero(kpti-kptj) and not compact
         nao = cell.nao
         if blksize is None:
-            blksize = max_memory*1e6/16/(nao**2*2)
-            blksize /= 2 # For prefetch
-            blksize = max(16, min(int(blksize), self.blockdim))
-            logger.debug2(self, 'max_memory %d MB, blksize %d', max_memory, blksize)
-
-        def load(aux_slice):
-            b0, b1 = aux_slice
-            naux = b1 - b0
-            Lpq = cp.asarray(j3c[b0:b1])
-            if compact and Lpq.shape[1] == nao**2:
-                Lpq = pack_tril(Lpq.reshape(naux, nao, nao))
-            elif unpack and Lpq.shape[1] != nao**2:
-                Lpq = unpack_tril(Lpq)
-            return Lpq
-
-        with df_cpu._load3c(self._cderi, self._dataname, kpti_kptj) as j3c:
-            slices = lib.prange(0, j3c.shape[0], blksize)
-            for Lpq in lib.map_with_prefetch(load, slices):
-                yield Lpq, 1
-
-        if cell.dimension == 2 and cell.low_dim_ft_type != 'inf_vacuum':
-            # Truncated Coulomb operator is not positive definite. Load the
-            # CDERI tensor of negative part.
-            with df_cpu._load3c(self._cderi, self._dataname+'-', kpti_kptj,
-                                ignore_key_error=True) as j3c:
-                slices = lib.prange(0, j3c.shape[0], blksize)
-                for Lpq in lib.map_with_prefetch(load, slices):
-                    yield Lpq, -1
-
-    get_pp = return_cupy_array(df_cpu.GDF.get_pp)
-    get_nuc = return_cupy_array(df_cpu.GDF.get_nuc)
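+            # Block size counts auxiliary vectors per chunk: one complex128
+            # element takes 16 bytes, and roughly three (blksize, nao, nao)
+            # buffers coexist, hence avail_mem/16/(nao**2*3) below.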
+            avail_mem = get_avail_mem() * .8
+            blksize = avail_mem/16/(nao**2*3)
+            if blksize < 16:
+                raise RuntimeError('Insufficient GPU memory')
+            blksize = min(int(blksize), self.blockdim)
+            logger.debug2(self, 'max_memory %d MB, blksize %d', avail_mem*1e-6, blksize)
+
+        if (ki, kj) in self._cderi:
+            req_conj = False
+        elif (kj, ki) in self._cderi:
+            req_conj = True
+        else:
+            raise RuntimeError(f'CDERI for kpoints {ki},{kj} not generated')
+
+        # Only one ordering of (ki, kj) is stored; the other block is
+        # recovered through L_pq(ki,kj) = L_qp(kj,ki).conj()
+        Lpq_kij = self._cderi[kj,ki] if req_conj else self._cderi[ki,kj]
+        naux = len(Lpq_kij)
+        for b0, b1 in lib.prange(0, naux, blksize):
+            if req_conj:
+                Lpq = Lpq_kij[b0:b1].transpose(0,2,1).conj()
+            else:
+                Lpq = Lpq_kij[b0:b1]
+            assert Lpq[0].size == nao**2
+            if compact:
+                Lpq = pack_tril(Lpq.reshape(-1, nao, nao))
+            yield Lpq, 1
+
+        if cell.dimension == 2:
+            assert cell.low_dim_ft_type != 'inf_vacuum'
+            Lpq_kij = self._cderip[kj,ki] if req_conj else self._cderip[ki,kj]
+            naux = len(Lpq_kij)
+            for b0, b1 in lib.prange(0, naux, blksize):
+                if req_conj:
+                    Lpq = Lpq_kij[b0:b1].transpose(0,2,1).conj()
+                else:
+                    Lpq = Lpq_kij[b0:b1]
+                assert Lpq[0].size == nao**2
+                if compact:
+                    Lpq = pack_tril(Lpq.reshape(-1, nao, nao))
+                yield Lpq, -1
+
+    def get_pp(self, kpts=None):
+        kpts, is_single_kpt = _check_kpts(self, kpts)
+        if is_single_kpt and is_zero(kpts):
+            vpp = rsdf_builder.get_pp(self.cell)
+        else:
+            vpp = rsdf_builder.get_pp(self.cell, kpts)
+        if is_single_kpt:
+            vpp = vpp[0]
+        return vpp
+
+    def get_nuc(self, kpts=None):
+        kpts, is_single_kpt = _check_kpts(self, kpts)
+        if is_single_kpt and is_zero(kpts):
+            nuc = rsdf_builder.get_nuc(self.cell)
+        else:
+            nuc = rsdf_builder.get_nuc(self.cell, kpts)
+        if is_single_kpt:
+            nuc = nuc[0]
+        return nuc
 
 # Note: Special exxdiv by default should not be used for an arbitrary
 # input density matrix. When the df object was used with the molecular
diff --git a/gpu4pyscf/pbc/df/df_jk.py b/gpu4pyscf/pbc/df/df_jk.py
index bdaf2427..36ce3acf 100644
--- a/gpu4pyscf/pbc/df/df_jk.py
+++ b/gpu4pyscf/pbc/df/df_jk.py
@@ -1,4 +1,4 @@
-# Copyright 2021-2024 The PySCF Developers. All Rights Reserved.
+# Copyright 2024-2025 The PySCF Developers. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -26,7 +26,7 @@
 from gpu4pyscf.lib.cupy_helper import contract, unpack_tril
 from gpu4pyscf.pbc.df.fft_jk import _ewald_exxdiv_for_G0, _format_dms, _format_jks
 
-def density_fit(mf, auxbasis=None, mesh=None, with_df=None):
+def density_fit(mf, auxbasis=None, with_df=None):
     '''Generate density-fitting SCF object
 
     Args:
         auxbasis : str or basis dict
            Same format to the input attribute mol.basis. If auxbasis is
            None, auxiliary basis based on AO basis (if possible) or
            even-tempered Gaussian basis will be used. 
- mesh : tuple - number of grids in each direction with_df : DF object ''' from gpu4pyscf.pbc.df.df import GDF @@ -45,27 +43,21 @@ def density_fit(mf, auxbasis=None, mesh=None, with_df=None): else: kpts = np.reshape(mf.kpt, (1,3)) with_df = GDF(mf.cell, kpts) - with_df.max_memory = mf.max_memory with_df.stdout = mf.stdout with_df.verbose = mf.verbose with_df.auxbasis = auxbasis - if mesh is not None: - with_df.mesh = mesh - mf = mf.copy() + mf = mf.copy().reset() mf.with_df = with_df - mf._eri = None return mf def get_j_kpts(mydf, dm_kpts, hermi=1, kpts=np.zeros((1,3)), kpts_band=None): log = logger.new_logger(mydf) t0 = log.init_timer() - if mydf._cderi is None or not mydf.has_kpts(kpts_band): - if mydf._cderi is not None: - log.warn('DF integrals for band k-points were not found %s. ' - 'DF integrals will be rebuilt to include band k-points.', - mydf._cderi) + assert kpts_band is None or kpts_band is kpts + assert mydf.has_kpts(kpts) + if mydf._cderi is None: mydf.build(j_only=True, kpts_band=kpts_band) t0 = log.timer_debug1('Init get_j_kpts', *t0) @@ -83,11 +75,9 @@ def get_j_kpts(mydf, dm_kpts, hermi=1, kpts=np.zeros((1,3)), kpts_band=None): nband = len(kpts_band) rho = cp.zeros((nset,naux), dtype=np.complex128) - max_memory = max(2000, (mydf.max_memory - lib.current_memory()[0])) - for k, kpt in enumerate(kpts): - kptii = np.asarray((kpt,kpt)) + for k in range(nkpts): p1 = 0 - for Lpq, sign in mydf.sr_loop(kptii, max_memory, False): + for Lpq, sign in mydf.sr_loop(k, k, False): Lpq = Lpq.reshape(-1,nao,nao) p0, p1 = p1, p1+Lpq.shape[0] rho[:,p0:p1] += sign * contract('Lpq,xqp->xL', Lpq, dms[:,k]) @@ -102,9 +92,8 @@ def get_j_kpts(mydf, dm_kpts, hermi=1, kpts=np.zeros((1,3)), kpts_band=None): vj = cp.zeros((nset,nband,nao_pair), dtype=np.complex128) for k, kpt in enumerate(kpts_band): - kptii = np.asarray((kpt,kpt)) p1 = 0 - for Lpq, sign in mydf.sr_loop(kptii, max_memory, aos2symm): + for Lpq, sign in mydf.sr_loop(k, k, aos2symm): nrow = Lpq.shape[0] p0, p1 = p1, p1+nrow Lpq = Lpq.reshape(nrow, -1) @@ -137,11 +126,9 @@ def get_k_kpts(mydf, dm_kpts, hermi=1, kpts=np.zeros((1,3)), kpts_band=None, raise RuntimeError('GDF does not support exxdiv %s' % exxdiv) t0 = (logger.process_clock(), logger.perf_counter()) - if mydf._cderi is None or not mydf.has_kpts(kpts_band): - if mydf._cderi is not None: - log.warn('DF integrals for band k-points were not found %s. 
'
-                     'DF integrals will be rebuilt to include band k-points.',
-                     mydf._cderi)
+    assert kpts_band is None or kpts_band is kpts
+    assert mydf.has_kpts(kpts)
+    if mydf._cderi is None:
         mydf.build(kpts_band=kpts_band)
     t0 = log.timer_debug1('Init get_k_kpts', *t0)
@@ -186,12 +173,12 @@
     # K_pq = ( p{k1} i{k2} | i{k2} q{k1} )
     # input dm is not Hermitian/PSD --> build K from dm
     log.debug2('get_k_kpts: build K from dm')
-    max_memory = max(2000, mydf.max_memory-lib.current_memory()[0])
-    def make_kpt(ki, kj, swap_2e, inverse_idx=None):
-        kpti = kpts[ki]
-        kptj = kpts_band[kj]
-        #TODO: utilize kk_adapted_iter with time_reversal_symmetry, as that in aft_jk
-        for Lpq, sign in mydf.sr_loop((kpti,kptj), max_memory, compact=False):
+    if mydf._cderi is None:
+        mydf.build()
+    def make_kpt(ki, kj, swap_2e):
+        if (ki, kj) not in mydf._cderi:
+            kj, ki = ki, kj
+        for Lpq, sign in mydf.sr_loop(ki, kj, compact=False):
             Lpq = Lpq.reshape(-1, nao, nao)
             tmp = contract('njk,Lkl->nLjl', dms[:,ki], Lpq)
             if sign > 0:
@@ -207,23 +194,23 @@
             vk[:,ki] -= contract('nLki,Lji->nkj', tmp, Lpq.conj())
 
     t1 = log.init_timer()
-    if kpts_band is kpts: # normal k-points HF/DFT
-        for ki in range(nkpts):
-            for kj in range(ki):
-                make_kpt(ki, kj, True)
-            make_kpt(ki, ki, False)
-            t1 = log.timer_debug1('get_k_kpts: make_kpt ki>=kj (%d,*)'%ki, *t1)
-    else:
+    if kpts_band is not kpts: # band k-points beyond the SCF k-point mesh are not supported
         raise NotImplementedError
-
-    if exxdiv == 'ewald':
-        _ewald_exxdiv_for_G0(cell, kpts, dms, vk, kpts_band)
+    #TODO: utilize kk_adapted_iter with time_reversal_symmetry, as that in aft_jk
+    for ki in range(nkpts):
+        for kj in range(ki):
+            make_kpt(ki, kj, True)
+        make_kpt(ki, ki, False)
+        t1 = log.timer_debug1('get_k_kpts: make_kpt ki>=kj (%d,*)'%ki, *t1)
 
     if (is_zero(kpts) and is_zero(kpts_band) and
         not np.iscomplexobj(dm_kpts)):
         vk = vk.real
     vk *= 1./nkpts
 
+    if exxdiv == 'ewald':
+        _ewald_exxdiv_for_G0(cell, kpts, dms, vk, kpts_band)
+
     log.timer('get_k_kpts', *t0)
     return _format_jks(vk, dm_kpts, input_band, kpts)
 
@@ -243,29 +230,17 @@
     '''JK for given k-point'''
     log = logger.new_logger(mydf)
     t0 = log.init_timer()
-    if mydf._cderi is None or not mydf.has_kpts(kpts_band):
-        if mydf._cderi is not None:
-            log.warn('DF integrals for band k-points were not found %s. 
' - 'DF integrals will be rebuilt to include band k-points.', - mydf._cderi) + assert is_zero(kpt) + assert kpts_band is None + if mydf._cderi is None: mydf.build(j_only=not with_k, kpts_band=kpts_band) t0 = log.timer_debug1('Init get_jk', *t0) - vj = vk = None - if kpts_band is not None and abs(kpt-kpts_band).sum() > 1e-9: - kpt = np.reshape(kpt, (1,3)) - if with_k: - vk = get_k_kpts(mydf, dm, hermi, kpt, kpts_band, exxdiv) - if with_j: - vj = get_j_kpts(mydf, dm, hermi, kpt, kpts_band) - return vj, vk - cell = mydf.cell - dm = np.asarray(dm, order='C') + dm = cp.asarray(dm, order='C') dms = _format_dms(dm, [kpt]) nset, _, nao = dms.shape[:3] dms = dms.reshape(nset,nao,nao) - kptii = np.asarray((kpt,kpt)) if with_j: vj = cp.zeros((nset,nao,nao), dtype=np.complex128) if with_k: @@ -294,9 +269,7 @@ def get_jk(mydf, dm, hermi=1, kpt=np.zeros(3), ''' vk = cp.zeros((nset,nao,nao), dtype=np.complex128) - mem_now = lib.current_memory()[0] - max_memory = max(2000, (mydf.max_memory - mem_now)) - for Lpq, sign in mydf.sr_loop(kptii, max_memory, False): + for Lpq, sign in mydf.sr_loop(0, 0, False): if with_j: #:rho_coeff = np.einsum('Lpq,xqp->xL', Lpq, dms) #:vj += np.dot(rho_coeff, Lpq.reshape(-1,nao**2)) diff --git a/gpu4pyscf/pbc/df/fft.py b/gpu4pyscf/pbc/df/fft.py index 9d54b118..d074d9b3 100644 --- a/gpu4pyscf/pbc/df/fft.py +++ b/gpu4pyscf/pbc/df/fft.py @@ -1,4 +1,4 @@ -# Copyright 2021-2024 The PySCF Developers. All Rights Reserved. +# Copyright 2024-2025 The PySCF Developers. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/gpu4pyscf/pbc/df/fft_jk.py b/gpu4pyscf/pbc/df/fft_jk.py index dbf64378..1d17ed6d 100644 --- a/gpu4pyscf/pbc/df/fft_jk.py +++ b/gpu4pyscf/pbc/df/fft_jk.py @@ -1,4 +1,4 @@ -# Copyright 2021-2024 The PySCF Developers. All Rights Reserved. +# Copyright 2024-2025 The PySCF Developers. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/gpu4pyscf/pbc/df/ft_ao.py b/gpu4pyscf/pbc/df/ft_ao.py index d93678d6..cdd59951 100644 --- a/gpu4pyscf/pbc/df/ft_ao.py +++ b/gpu4pyscf/pbc/df/ft_ao.py @@ -1,4 +1,4 @@ -# Copyright 2021-2024 The PySCF Developers. All Rights Reserved. +# Copyright 2024-2025 The PySCF Developers. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -26,7 +26,6 @@ from pyscf.gto.mole import ANG_OF, ATOM_OF, PTR_COORD from pyscf.scf import _vhf from pyscf.pbc import tools as pbctools -from pyscf.pbc.gto.cell import _extract_pgto_params from pyscf.pbc.tools import k2gamma from pyscf.pbc.lib.kpts_helper import is_zero from gpu4pyscf.pbc.tools.k2gamma import kpts_to_kmesh @@ -36,6 +35,7 @@ from gpu4pyscf.scf.jk import ( g_pair_idx, _nearest_power2, _scale_sp_ctr_coeff, SHM_SIZE) from gpu4pyscf.pbc.lib.kpts_helper import conj_images_in_bvk_cell +from gpu4pyscf.pbc.gto.cell import extract_pgto_params from gpu4pyscf.__config__ import props as gpu_specs __all__ = [ @@ -43,8 +43,8 @@ ] libpbc = load_library('libpbc') -libpbc.PBC_build_ft_ao.restype = ctypes.c_int -libpbc.PBC_FT_init_constant.restype = ctypes.c_int +libpbc.build_ft_ao.restype = ctypes.c_int +libpbc.init_constant.restype = ctypes.c_int LMAX = 4 GOUT_WIDTH = 19 @@ -71,27 +71,25 @@ def ft_ao(cell, Gv, shls_slice=None, b=None, gxyz=None, Gvbase=None, kpt=np.zeros(3), verbose=None): from pyscf.pbc.df.ft_ao import ft_ao out = ft_ao(cell, Gv, shls_slice, b, gxyz, Gvbase, kpt, verbose) - return cp.asarray(out) + if out.flags.c_contiguous: + return cp.asarray(out) + else: + return cp.asarray(out, order='F') -def _bas_overlap_mask(cell, bvkmesh_Ls, Ls, cutoff=None): +def _bas_overlap_mask(cell, bvkmesh_Ls, Ls): '''integral screening mask for basis product between cell and supmol''' # consider only the most diffused component of a basis - exps, cs = _extract_pgto_params(cell, 'min') + exps, cs = extract_pgto_params(cell, 'diffused') ls = cell._bas[:,ANG_OF] bas_coords = cp.asarray(cell.atom_coords()[cell._bas[:,ATOM_OF]]) - vol = cell.vol - if cutoff is None: - theta_ij = exps.min() / 2 - lattice_sum_factor = max(2*np.pi*cell.rcut/(vol*theta_ij), 1) - cutoff = cell.precision/lattice_sum_factor * .1 - logger.debug(cell, 'Set ft_ao cutoff to %g', cutoff) - ls = cp.asarray(ls) exps = cp.asarray(exps) norm = cp.asarray(cs) * ((2*ls+1)/(4*np.pi))**.5 aij = exps[:,None] + exps - theta = exps[:,None] * exps / aij + fi = exps[:,None] / aij + fj = exps[None,:] / aij + theta = exps[:,None] * fj Ls = cp.asarray(Ls) # rj format: (bvk_cell_id, bas_id, lattice_img_id) @@ -100,16 +98,18 @@ def _bas_overlap_mask(cell, bvkmesh_Ls, Ls, cutoff=None): dr = cp.linalg.norm(rirj, axis=4) - dri = exps[None,None,:,None]/aij[:,None,:,None] * dr - drj = exps[:,None,None,None]/aij[:,None,:,None] * dr + dri = fj[:,None,:,None] * dr + drj = fi[:,None,:,None] * dr li = ls[:,None,None,None] lj = ls[None,None,:,None] fac_dri = (li * .5/aij[:,None,:,None] + dri**2) ** (li*.5) fac_drj = (lj * .5/aij[:,None,:,None] + drj**2) ** (lj*.5) - fl = 2*np.pi/vol * (dr/theta[:,None,:,None]) + 1. 
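+    # Estimate the lattice-sum weight from the number of images on a shell of
+    # radius rad ~ dr/vol**(1/3) + 1, i.e. fl = max(4*pi*rad**2, 1), replacing
+    # the previous 2*pi*dr/(vol*theta) heuristic.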
+ rad = cell.vol**(-1./3) * dr + 1 + surface = 4*np.pi * rad**2 + fl = cp.where(surface > 1, surface, 1) fac_norm = norm[:,None]*norm * (np.pi/aij)**1.5 ovlp = fac_norm[:,None,:,None] * cp.exp(-theta[:,None,:,None]*dr**2) * fac_dri * fac_drj * fl - return ovlp > cutoff + return ovlp > cell.precision def gen_ft_kernel(cell, kpts=None, verbose=None): r''' @@ -132,11 +132,12 @@ def __init__(self, cell, kpts=None, bvk_kmesh=None): self.l_ctr_offsets = np.append(0, np.cumsum(l_ctr_counts)) self.coeff = cp.asarray(coeff, dtype=np.complex128) - if kpts is not None and bvk_kmesh is None: - bvk_kmesh = kpts_to_kmesh(cell, kpts) - - # create BVK super-cell if bvk_kmesh is None: + if kpts is None or is_zero(kpts): + bvk_kmesh = np.ones(3, dtype=int) + else: + bvk_kmesh = kpts_to_kmesh(cell, kpts) + if np.prod(bvk_kmesh) == 1: bvkcell = cell else: bvkcell = pbctools.super_cell(cell, bvk_kmesh, wrap_around=True) @@ -169,7 +170,7 @@ def gen_ft_kernel(self, verbose=None): Ls = Ls[cp.linalg.norm(Ls-.5, axis=1).argsort()] if bvk_kmesh is None: - bvkmesh_Ls = cp.zeros(3) + bvkmesh_Ls = cp.zeros((1, 3)) else: bvkmesh_Ls = cp.asarray( k2gamma.translation_vectors_for_kmesh(cell, bvk_kmesh, True)) @@ -209,7 +210,7 @@ def gen_ft_kernel(self, verbose=None): conj_mapping = cp.asarray(conj_images_in_bvk_cell(bvk_kmesh), dtype=np.int32) init_constant(cell) - kern = libpbc.PBC_build_ft_ao + kern = libpbc.build_ft_ao cp.cuda.Stream.null.synchronize() log.timer_debug1('initialize ft_kern', *cput0) @@ -270,7 +271,7 @@ def _ft_sub(Gv, q, kptjs, transform_ao=True): cell._atm.ctypes, ctypes.c_int(cell.natm), cell._bas.ctypes, ctypes.c_int(cell.nbas), cell._env.ctypes) if err != 0: - raise RuntimeError(f'PBC_build_ft_ao kernel for {ll_pattern} failed') + raise RuntimeError(f'build_ft_ao kernel for {ll_pattern} failed') if log.verbose >= logger.DEBUG1: t1, t1p = log.timer_debug1(f'processing {ll_pattern}', *t1), t1 if ll_pattern not in timing_collection: @@ -290,24 +291,25 @@ def _ft_sub(Gv, q, kptjs, transform_ao=True): #ix, iy = cp.tril_indices(nao, -1) #for k, ck in enumerate(conj_mapping): # out[iy,ix,ck] = out[ix,iy,k] - err = libpbc.PBC_ft_aopair_fill_triu( + err = libpbc.ft_aopair_fill_triu( ctypes.cast(out.data.ptr, ctypes.c_void_p), ctypes.cast(conj_mapping.data.ptr, ctypes.c_void_p), ctypes.c_int(nao), ctypes.c_int(bvk_ncells), ctypes.c_int(nGv)) if err != 0: - raise RuntimeError('PBC_ft_aopair_fill_triu kernel failed') + raise RuntimeError('ft_aopair_fill_triu kernel failed') log.debug1('transform BvK-cell to k-points') - if kptjs is not None: + gamma_point_only = kptjs is None or is_zero(kptjs) + if not gamma_point_only: kptjs = cp.asarray(kptjs, order='C').reshape(-1,3) expLk = cp.exp(1j*cp.dot(bvkmesh_Ls, kptjs.T)) - out = contract('Lk,LpqG->kGpq', expLk, out) + out = contract('Lk,LpqG->kpqG', expLk, out) if transform_ao: log.debug1('transform basis') #:out = einsum('pqLG,pi,qj->LGij', out, coeff, coeff) - out = contract('kGpq,qj->kGpj', out, coeff) - out = contract('kGpj,pi->kGij', out, coeff) + out = contract('kpqG,pi->kiqG', out, coeff) + out = contract('kiqG,qj->kijG', out, coeff) log.timer('ft_aopair', *cput0) return out @@ -323,7 +325,7 @@ def ft_kernel(Gv, q=np.zeros(3), kptjs=kpts, transform_ao=True): avail_mem = get_avail_mem() if 2*out_size < avail_mem * .8: - return _ft_sub(Gv, q, kptjs, transform_ao) + return _ft_sub(Gv, q, kptjs, transform_ao).transpose(0,3,1,2) elif out_size < avail_mem * .8: if kptjs is None: @@ -332,16 +334,16 @@ def ft_kernel(Gv, q=np.zeros(3), kptjs=kpts, transform_ao=True): 
kptjs = kptjs.reshape(-1, 3)
             nkpts = len(kptjs)
             if transform_ao:
-                out = cp.empty((nkpts, nGv, nao_orig, nao_orig), dtype=np.complex128)
+                out = cp.empty((nkpts, nao_orig, nao_orig, nGv), dtype=np.complex128)
             else:
-                out = cp.empty((nkpts, nGv, nao, nao), dtype=np.complex128)
+                out = cp.empty((nkpts, nao, nao, nGv), dtype=np.complex128)
             Gv_block = int((avail_mem * .95 - out_size) / (2*nao**2*bvk_ncells*16))
             Gv_block &= 0xfffffc
             if Gv_block >= 4:
                 logger.debug1(cell, 'Processing ft_kernel in sub-blocks, Gv_block = %d', Gv_block)
                 for p0, p1 in lib.prange(0, nGv, Gv_block):
-                    out[:,p0:p1] = _ft_sub(Gv[p0:p1], q, kptjs, transform_ao)
-                return out
+                    out[:,:,:,p0:p1] = _ft_sub(Gv[p0:p1], q, kptjs, transform_ao)
+                return out.transpose(0,3,1,2)
 
         raise RuntimeError('Not enough GPU memory. '
                            f'Available: {avail_mem*1e-9:.2f} GB. '
@@ -365,7 +367,7 @@ class AFTIntEnvVars(ctypes.Structure):
 
 def init_constant(cell):
     g_idx, offsets = g_pair_idx()
-    err = libpbc.PBC_FT_init_constant(
+    err = libpbc.init_constant(
         g_idx.ctypes, offsets.ctypes, cell._env.ctypes,
         ctypes.c_int(cell._env.size), ctypes.c_int(SHM_SIZE))
     if err != 0:
diff --git a/gpu4pyscf/pbc/df/int3c2e.py b/gpu4pyscf/pbc/df/int3c2e.py
new file mode 100644
index 00000000..f92b6ef6
--- /dev/null
+++ b/gpu4pyscf/pbc/df/int3c2e.py
@@ -0,0 +1,482 @@
+# Copyright 2024-2025 The PySCF Developers. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+'''
+Periodic 3-center 2-electron short-range Coulomb integral helper functions
+'''
+
+import ctypes
+import math
+import numpy as np
+import cupy as cp
+from pyscf import lib
+from pyscf.lib.parameters import ANGULAR
+from pyscf.gto.mole import ANG_OF, ATOM_OF, PTR_COORD, PTR_EXP, conc_env
+from pyscf.pbc import tools as pbctools
+from pyscf.pbc.tools import k2gamma
+from pyscf.pbc.lib.kpts_helper import is_zero
+from gpu4pyscf.pbc.tools.k2gamma import kpts_to_kmesh
+from gpu4pyscf.lib import logger
+from gpu4pyscf.lib.cupy_helper import contract
+from gpu4pyscf.gto.mole import group_basis, PTR_BAS_COORD
+from gpu4pyscf.scf.jk import _nearest_power2, _scale_sp_ctr_coeff, SHM_SIZE
+from gpu4pyscf.pbc.gto.cell import extract_pgto_params
+from gpu4pyscf.pbc.df.ft_ao import libpbc, init_constant
+
+__all__ = [
+    'sr_aux_e2',
+]
+
+libpbc.fill_int3c2e.restype = ctypes.c_int
+
+LMAX = 4
+L_AUX_MAX = 6
+GOUT_WIDTH = 45
+THREADS = 256
+BVK_CELL_SHELLS = 2400
+
+def sr_aux_e2(cell, auxcell, omega, kpts=None, bvk_kmesh=None, j_only=False):
+    r'''
+    Short-range 3-center integrals (ij|k). The auxiliary basis functions are
+    placed at the second electron.
+    '''
+    if bvk_kmesh is None and kpts is not None:
+        if j_only:
+            # Coulomb integrals require a smaller kmesh to converge finite-size effects
+            bvk_kmesh = kpts_to_kmesh(cell, kpts)
+        else:
+            # Remote images can still contribute for a given k-point mesh,
+            # entering the exchange matrix as finite-size effects.
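+            # Exchange couples shell pairs across images out to the integral
+            # cutoff, so the BvK mesh is derived from rcut rather than from
+            # the k-point mesh alone.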
+            rcut = estimate_rcut(cell, auxcell, omega).max()
+            bvk_kmesh = kpts_to_kmesh(cell, kpts, rcut=rcut)
+    bvk_kmesh, bvk_kmesh_inp = guess_bvk_kmesh(cell, bvk_kmesh), bvk_kmesh
+    logger.debug(cell, 'BvK input %s, set to %s for sr_aux_e2', bvk_kmesh_inp, bvk_kmesh)
+    int3c2e_opt = SRInt3c2eOpt(cell, auxcell, omega, bvk_kmesh)
+    nao, nao_orig = int3c2e_opt.coeff.shape
+    naux = int3c2e_opt.aux_coeff.shape[0]
+
+    gamma_point = kpts is None or (kpts.ndim == 1 and is_zero(kpts))
+    if gamma_point:
+        out = cp.zeros((nao, nao, naux))
+    else:
+        kpts = np.asarray(kpts).reshape(-1, 3)
+        expLk = cp.exp(1j*cp.asarray(int3c2e_opt.bvkmesh_Ls.dot(kpts.T)))
+        nL, nkpts = expLk.shape
+        if j_only:
+            expLLk = contract('Lk,Mk->LMk', expLk.conj(), expLk)
+            expLLk = expLLk.view(np.float64).reshape(nL,nL,nkpts,2)
+            out = cp.zeros((nkpts, nao, nao, naux), dtype=np.complex128)
+        else:
+            out = cp.zeros((nkpts, nkpts, nao, nao, naux), dtype=np.complex128)
+
+    ao_loc = int3c2e_opt.sorted_cell.ao_loc
+    aux_loc = int3c2e_opt.sorted_auxcell.ao_loc
+
+    for shls_slice, eri3c in int3c2e_opt.int3c2e_kernel():
+        i0, i1, j0, j1 = ao_loc[list(shls_slice[:4])]
+        k0, k1 = aux_loc[list(shls_slice[4:])]
+        if gamma_point:
+            out[i0:i1,j0:j1,k0:k1] = tmp = eri3c.sum(axis=(0,2))
+            if i0 != j0:
+                out[j0:j1,i0:i1,k0:k1] = tmp.transpose(1,0,2)
+        elif j_only:
+            tmp = contract('LMkz,LpMqr->kpqrz', expLLk, eri3c)
+            tmp = tmp.view(np.complex128)[...,0]
+            out[:,i0:i1,j0:j1,k0:k1] = tmp
+            if i0 != j0:
+                out[:,j0:j1,i0:i1,k0:k1] = tmp.transpose(0,2,1,3).conj()
+        else:
+            expLkz = expLk.view(np.float64).reshape(nL,nkpts,2)
+            tmp = contract('Lkz,MpLqr->Mkpqrz', expLkz, eri3c)
+            tmp = tmp.view(np.complex128)[...,0]
+            tmp = contract('Mk,Mlpqr->klpqr', expLk.conj(), tmp)
+            out[:,:,i0:i1,j0:j1,k0:k1] = tmp
+            if i0 != j0:
+                out[:,:,j0:j1,i0:i1,k0:k1] = tmp.transpose(1,0,3,2,4).conj()
+            tmp = None
+
+    # gamma_point also covers a single zero k-point, where out is 3-dimensional
+    if gamma_point:
+        out = contract('pqr,rk->pqk', out, int3c2e_opt.aux_coeff)
+        out = contract('pqk,qj->pjk', out, int3c2e_opt.coeff)
+        out = contract('pjk,pi->ijk', out, int3c2e_opt.coeff)
+    elif j_only:
+        #:out = einsum('MpNqr,pi,qj,rk->MiNjk', out, coeff, coeff, auxcoeff)
+        out = contract('Npqr,rk->Npqk', out, int3c2e_opt.aux_coeff)
+        out = contract('Npqk,qj->Npjk', out, int3c2e_opt.coeff)
+        out = contract('Npjk,pi->Nijk', out, int3c2e_opt.coeff)
+    else:
+        #:out = einsum('MpNqr,pi,qj,rk->MiNjk', out, coeff, coeff, auxcoeff)
+        out = contract('MNpqr,rk->MNpqk', out, int3c2e_opt.aux_coeff)
+        out = contract('MNpqk,qj->MNpjk', out, int3c2e_opt.coeff)
+        out = contract('MNpjk,pi->MNijk', out, int3c2e_opt.coeff)
+    return out
+
+def create_img_idx(cell, bvkcell, auxcell, Ls, int3c2e_envs):
+    '''integral screening'''
+    # consider only the most diffused component of a basis
+    exps, cs = extract_pgto_params(cell, 'diffused')
+    ls = cell._bas[:,ANG_OF]
+    exps = cp.asarray(exps, dtype=np.float32)
+    log_cs = np.log(np.abs(cs * ((2*ls+1)/(4*np.pi))**.5))
+    log_cs = cp.asarray(log_cs, np.float32)
+    nbas = cell.nbas
+    nk = bvkcell.nbas // nbas
+
+    # Search the most diffused functions on each atom
+    aux_exps, aux_cs = extract_pgto_params(auxcell, 'diffused')
+    aux_ls = auxcell._bas[:,ANG_OF]
+    r2_aux = np.log(aux_cs**2 / cell.precision * 10**aux_ls) / aux_exps
+    atoms = auxcell._bas[:,ATOM_OF]
+    atom_aux_exps = cp.full(cell.natm, 1e8, dtype=np.float32)
+    for ia in range(cell.natm):
+        bas_mask = atoms == ia
+        es = aux_exps[bas_mask]
+        if len(es) > 0:
+            atom_aux_exps[ia] = es[r2_aux[bas_mask].argmax()]
+
+    def gen_img_idx(ish0, ish1, jsh0, jsh1):
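+        # Two-pass screening: int3c2e_img_counts first counts surviving
+        # images per (Ki,i,Kj,j) shell pair; a prefix sum over those counts
+        # gives img_offsets, and int3c2e_img_idx then fills img_idx.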
ish1 - ish0
+        njsh = jsh1 - jsh0
+        #TODO: only tril part when i == j
+        ij_pairs = nk * nish * nk * njsh
+        img_counts = cp.zeros(ij_pairs, dtype=np.int32)
+        err = libpbc.int3c2e_img_counts(
+            ctypes.cast(img_counts.data.ptr, ctypes.c_void_p),
+            ctypes.byref(int3c2e_envs),
+            (ctypes.c_int*4)(ish0, ish1, jsh0, jsh1),
+            ctypes.cast(exps.data.ptr, ctypes.c_void_p),
+            ctypes.cast(log_cs.data.ptr, ctypes.c_void_p),
+            ctypes.cast(atom_aux_exps.data.ptr, ctypes.c_void_p),
+            ctypes.c_int(nk), ctypes.c_int(cell.natm))
+        if err != 0:
+            raise RuntimeError('int3c2e_img_counts failed')
+
+        remaining_idx = np.nonzero(img_counts > 0)[0]
+        remaining_idx = remaining_idx[img_counts[remaining_idx].argsort()[::-1]]
+        remaining_idx = cp.asarray(remaining_idx, dtype=np.int32, order='C')
+        ij_pairs = remaining_idx.size
+        img_offsets = cp.empty(ij_pairs+1, dtype=np.int32)
+        cp.cumsum(img_counts[remaining_idx], out=img_offsets[1:])
+        img_offsets[0] = 0
+
+        img_idx = cp.empty(int(img_offsets[-1]), dtype=np.int32)
+        err = libpbc.int3c2e_img_idx(
+            ctypes.cast(img_idx.data.ptr, ctypes.c_void_p),
+            ctypes.cast(img_offsets.data.ptr, ctypes.c_void_p),
+            ctypes.cast(remaining_idx.data.ptr, ctypes.c_void_p),
+            ctypes.c_int(ij_pairs),
+            ctypes.byref(int3c2e_envs),
+            (ctypes.c_int*4)(ish0, ish1, jsh0, jsh1),
+            ctypes.cast(exps.data.ptr, ctypes.c_void_p),
+            ctypes.cast(log_cs.data.ptr, ctypes.c_void_p),
+            ctypes.cast(atom_aux_exps.data.ptr, ctypes.c_void_p),
+            ctypes.c_int(nk), ctypes.c_int(cell.natm))
+        if err != 0:
+            raise RuntimeError('int3c2e_img_idx failed')
+
+        Ki, i, Kj, j = cp.unravel_index(remaining_idx, (nk, nish, nk, njsh))
+        i += ish0
+        j += jsh0
+        # one-dimensional indices corresponding to [Ki,i,Kj,j]
+        bas_ij = cp.ravel_multi_index((Ki, i, Kj, j), (nk, nbas, nk, nbas))
+        bas_ij = cp.asarray(bas_ij, dtype=np.int32)
+        return img_idx, img_offsets, bas_ij
+    return gen_img_idx
+
+class SRInt3c2eOpt:
+    def __init__(self, cell, auxcell, omega, bvk_kmesh=None):
+        assert omega < 0
+        self.omega = omega
+
+        self.cell = cell
+        cell, coeff, uniq_l_ctr, l_ctr_counts = group_basis(cell, tile=1)
+        self.sorted_cell = cell
+        self.uniq_l_ctr = uniq_l_ctr
+        self.l_ctr_offsets = np.append(0, np.cumsum(l_ctr_counts))
+        self.coeff = cp.asarray(coeff)
+        self.sorted_cell.omega = omega
+
+        self.auxcell = auxcell
+        auxcell, coeff, uniq_l_ctr, l_ctr_counts = group_basis(auxcell, tile=1)
+        self.sorted_auxcell = auxcell
+        self.uniq_l_ctr_aux = uniq_l_ctr
+        self.l_ctr_aux_offsets = np.append(0, np.cumsum(l_ctr_counts))
+        self.aux_coeff = cp.asarray(coeff)
+        self.sorted_auxcell.omega = omega
+
+        if bvk_kmesh is None:
+            bvk_kmesh = np.ones(3, dtype=int)
+        self.bvk_kmesh = bvk_kmesh
+        self.bvkmesh_Ls = k2gamma.translation_vectors_for_kmesh(cell, bvk_kmesh, True)
+
+        if np.prod(bvk_kmesh) == 1:
+            bvkcell = cell
+        else:
+            bvkcell = pbctools.super_cell(cell, bvk_kmesh, wrap_around=True)
+            # PTR_BAS_COORD was not initialized in pbctools.super_cell
+            bvkcell._bas[:,PTR_BAS_COORD] = bvkcell._atm[bvkcell._bas[:,ATOM_OF],PTR_COORD]
+        self.bvkcell = bvkcell
+
+    def int3c2e_kernel(self, cutoff=None, verbose=None):
+        cell = self.sorted_cell
+        auxcell = self.sorted_auxcell
+        uniq_l_ctr = self.uniq_l_ctr
+        l_ctr_offsets = self.l_ctr_offsets
+        l_ctr_aux_offsets = self.l_ctr_aux_offsets
+        bvkcell = self.bvkcell
+
+        log = logger.new_logger(cell, verbose)
+        cput0 = log.init_timer()
+        rcut = estimate_rcut(cell, auxcell, self.omega).max()
+        Ls = cp.asarray(bvkcell.get_lattice_Ls(rcut=rcut))
+        Ls = Ls[cp.linalg.norm(Ls-.5, axis=1).argsort()]
+        nimgs = len(Ls)
log.debug('int3c2e_kernel rcut = %g, nimgs = %d', rcut, nimgs) + + if cutoff is None: + omega = cell.omega + aux_exp, _, aux_l = most_diffused_pgto(auxcell) + cell_exp, _, cell_l = most_diffused_pgto(cell) + if omega == 0: + theta = 1./(1./cell_exp + 1./aux_exp) + else: + theta = 1./(1./cell_exp + 1./aux_exp + omega**-2) + lsum = cell_l * 2 + aux_l + 1 + rad = cell.vol**(-1./3) * rcut + 1 + surface = 4*np.pi * rad**2 + lattice_sum_factor = 2*np.pi*rcut*lsum/(cell.vol*theta) + surface + cutoff = cell.precision / lattice_sum_factor + log.debug1('int3c_kernel integral omega=%g theta=%g cutoff=%g', + omega, theta, cutoff) + + _atm_cpu, _bas_cpu, _env_cpu = conc_env( + bvkcell._atm, bvkcell._bas, _scale_sp_ctr_coeff(bvkcell), + auxcell._atm, auxcell._bas, _scale_sp_ctr_coeff(auxcell)) + #NOTE: PTR_BAS_COORD is not updated in conc_env() + off = _bas_cpu[bvkcell.nbas,PTR_EXP] - auxcell._bas[0,PTR_EXP] + _bas_cpu[bvkcell.nbas:,PTR_BAS_COORD] += off + + bvk_ao_loc = bvkcell.ao_loc + aux_loc = auxcell.ao_loc + + _atm = cp.array(_atm_cpu, dtype=np.int32) + _bas = cp.array(_bas_cpu, dtype=np.int32) + _env = cp.array(_env_cpu, dtype=np.float64) + ao_loc = _conc_locs(bvk_ao_loc, aux_loc) + bvk_ncells = bvkcell.nbas // cell.nbas + int3c2e_envs = Int3c2eEnvVars( + cell.natm, cell.nbas, bvk_ncells, nimgs, + _atm.data.ptr, _bas.data.ptr, _env.data.ptr, ao_loc.data.ptr, + Ls.data.ptr, math.log(cutoff), + ) + # Keep a reference to these arrays, prevent releasing them upon returning the closure + int3c2e_envs._env_ref_holder = (_atm, _bas, _env, ao_loc, Ls) + + gen_img_idx = create_img_idx(cell, bvkcell, auxcell, Ls, int3c2e_envs) + + uniq_l = uniq_l_ctr[:,0] + n_groups = np.count_nonzero(uniq_l <= LMAX) + init_constant(cell) + kern = libpbc.fill_int3c2e + cp.cuda.Stream.null.synchronize() + t1 = log.timer_debug1('initialize int3c2e_kernel', *cput0) + timing_collection = {} + kern_counts = 0 + + cell_ao_loc = cell.ao_loc + di = (cell_ao_loc[l_ctr_offsets[1:]] - cell_ao_loc[l_ctr_offsets[:-1]]).max() + dk = (aux_loc[l_ctr_aux_offsets[1:]] - aux_loc[l_ctr_aux_offsets[:-1]]).max() + buf = cp.empty((bvk_ncells,di, bvk_ncells,di, dk)) + + ij_tasks = ((i, j) for i in range(n_groups) for j in range(i+1)) + for i, j in ij_tasks: + li = uniq_l[i] + lj = uniq_l[j] + ish0, ish1 = l_ctr_offsets[i], l_ctr_offsets[i+1] + jsh0, jsh1 = l_ctr_offsets[j], l_ctr_offsets[j+1] + nrow = bvk_ao_loc[ish1] - bvk_ao_loc[ish0] + ncol = bvk_ao_loc[jsh1] - bvk_ao_loc[jsh0] + img_idx, img_offsets, bas_ij_idx = gen_img_idx(ish0, ish1, jsh0, jsh1) + + for k, lk in enumerate(self.uniq_l_ctr_aux[:,0]): + ksh0, ksh1 = l_ctr_aux_offsets[k:k+2] + naux = aux_loc[ksh1] - aux_loc[ksh0] + shls_slice = ish0, ish1, jsh0, jsh1, ksh0, ksh1 + eri3c = cp.ndarray((bvk_ncells, nrow, bvk_ncells, ncol, naux), + dtype=np.float64, memptr=buf.data) + eri3c.fill(0.) 
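+                # eri3c is a view into the preallocated buffer `buf`; it is
+                # zeroed explicitly because shell pairs discarded by the
+                # image-count screening receive no writes from the kernel.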
+ lll = f'({ANGULAR[li]}{ANGULAR[lj]}|{ANGULAR[lk]})' + scheme = int3c2e_scheme(li, lj, lk) + log.debug2('int3c2e_scheme for %s: %s', lll, scheme) + err = kern( + ctypes.cast(eri3c.data.ptr, ctypes.c_void_p), + ctypes.byref(int3c2e_envs), (ctypes.c_int*3)(*scheme), + (ctypes.c_int*6)(*shls_slice), + ctypes.c_int(bvk_ncells), ctypes.c_int(nrow), + ctypes.c_int(ncol), ctypes.c_int(naux), + ctypes.c_int(bas_ij_idx.size), + ctypes.cast(bas_ij_idx.data.ptr, ctypes.c_void_p), + ctypes.cast(img_idx.data.ptr, ctypes.c_void_p), + ctypes.cast(img_offsets.data.ptr, ctypes.c_void_p), + _atm_cpu.ctypes, ctypes.c_int(bvkcell.natm), + _bas_cpu.ctypes, ctypes.c_int(bvkcell.nbas), _env_cpu.ctypes) + if err != 0: + raise RuntimeError(f'fill_int3c2e kernel for {lll} failed') + if log.verbose >= logger.DEBUG1: + t1, t1p = log.timer_debug1(f'processing {lll}', *t1), t1 + if lll not in timing_collection: + timing_collection[lll] = 0 + timing_collection[lll] += t1[1] - t1p[1] + kern_counts += 1 + yield shls_slice, eri3c + + if log.verbose >= logger.DEBUG1: + log.timer('int3c2e', *cput0) + log.debug1('kernel launches %d', kern_counts) + for lll, t in timing_collection.items(): + log.debug1('%s wall time %.2f', lll, t) + +class Int3c2eEnvVars(ctypes.Structure): + _fields_ = [ + ('cell0_natm', ctypes.c_uint16), + ('cell0_nbas', ctypes.c_uint16), + ('bvk_ncells', ctypes.c_uint16), + ('nimgs', ctypes.c_uint16), + ('atm', ctypes.c_void_p), + ('bas', ctypes.c_void_p), + ('env', ctypes.c_void_p), + ('ao_loc', ctypes.c_void_p), + ('img_coords', ctypes.c_void_p), + ('log_cutoff', ctypes.c_float), + ] + +def _conc_locs(ao_loc1, ao_loc2): + comp_loc = np.append(ao_loc1[:-1], ao_loc1[-1] + ao_loc2) + return cp.array(comp_loc, dtype=np.int32) + +def int3c2e_scheme(li, lj, lk, shm_size=SHM_SIZE): + order = li + lj + lk + nroots = (order//2 + 1) * 2 + + g_size = (li+1)*(lj+1)*(lk+1) + unit = g_size*3 + nroots*2 + 6 + nksp_max = shm_size//(unit*8) + nksp_max = _nearest_power2(nksp_max) + + nfi = (li + 1) * (li + 2) // 2 + nfj = (lj + 1) * (lj + 2) // 2 + nfk = (lk + 1) * (lk + 2) // 2 + gout_size = nfi * nfj * nfk + gout_stride = (gout_size + GOUT_WIDTH-1) // GOUT_WIDTH + # Round up to the next 2^n + gout_stride = _nearest_power2(gout_stride, return_leq=False) + + # Align nksh*gout_stride to warp size + if gout_stride < 32: + nksh_per_block = 32 // gout_stride + nsp_per_block = min(THREADS // 32, nksp_max // nksh_per_block) + else: + nksh_per_block = THREADS // gout_stride + nsp_per_block = 1 + if nksp_max < nksh_per_block: + raise RuntimeError('GOUT_WIDTH too small or not enough shared memory') + + gout_stride = THREADS // (nksh_per_block*nsp_per_block) + return nksh_per_block, gout_stride, nsp_per_block + +def most_diffused_pgto(cell): + exps, cs = extract_pgto_params(cell, 'diffused') + ls = cell._bas[:,ANG_OF] + r2 = np.log(cs**2 / cell.precision * 10**ls) / exps + idx = r2.argmax() + return exps[idx], cs[idx], ls[idx] + +# This modified rcut estimation function will be available in pyscf-2.8 or newer +def estimate_rcut(cell, auxcell, omega): + '''Estimate rcut for 3c2e SR-integrals''' + if cell.nbas == 0 or auxcell.nbas == 0: + return np.zeros(1) + + if omega == 0: + # No SR integrals in int3c2e if omega=0 + assert cell.dimension == 0 + return np.zeros(1) + + precision = cell.precision + ak, ck, lk = most_diffused_pgto(auxcell) + + # the most diffused orbital basis + cell_exps, cs = extract_pgto_params(cell, 'diffused') + ls = cell._bas[:,ANG_OF] + r2_cell = np.log(cs**2 / precision * 10**ls) / cell_exps + ai_idx = 
r2_cell.argmax()
+    ai = cell_exps[ai_idx]
+    aj = cell_exps
+    li = ls[ai_idx]
+    lj = ls
+    ci = cs[ai_idx]
+    cj = cs
+
+    aij = ai + aj
+    lij = li + lj
+    l3 = lij + lk
+    theta = 1./(omega**-2 + 1./aij + 1./ak)
+    norm_ang = ((2*li+1)*(2*lj+1))**.5/(4*np.pi)
+    c1 = ci * cj * ck * norm_ang
+    sfac = aij*aj/(aij*aj + ai*theta)
+    fl = 2
+    fac = 2**li*np.pi**2.5*c1 * theta**(l3-.5)
+    rad = cell.vol**(-1./3) * cell.rcut + 1
+    surface = 4*np.pi * rad**2
+    lattice_sum_factor = 2*np.pi*cell.rcut/(cell.vol*theta) + surface
+    fac *= lattice_sum_factor
+    fac /= aij**(li+1.5) * ak**(lk+1.5) * aj**lj
+    fac *= fl / precision
+
+    r0 = cell.rcut  # initial guess
+    r0 = (np.log(fac * (sfac*r0)**(l3-1) + 1.) / (sfac*theta))**.5
+    r0 = (np.log(fac * (sfac*r0)**(l3-1) + 1.) / (sfac*theta))**.5
+    rcut = r0
+    return rcut
+
+def guess_bvk_kmesh(cell, bvk_kmesh, target_size=BVK_CELL_SHELLS):
+    '''Generate a sufficiently large bvk cell for the fill_int3c2e kernel to
+    achieve better load balance'''
+    if bvk_kmesh is None:
+        bvk_kmesh = np.ones(3, dtype=int)
+    else:
+        bvk_kmesh = bvk_kmesh.copy()
+    bvk_ncells = np.prod(bvk_kmesh)
+
+    # produce a bvk cell with roughly target_size (~2400) shells
+    replica = target_size / (bvk_ncells * cell.nbas)
+    if replica < 1:
+        return bvk_kmesh
+
+    mesh_max = cell.nimgs * 2 + 1
+    bvk_multiplier = mesh_max / bvk_kmesh
+    if cell.dimension == 2:
+        fac = (replica / np.prod(bvk_multiplier[:2]))**.5
+        fac = min(fac, 1)
+        bvk_kmesh[:2] *= (fac * bvk_multiplier[:2]).astype(int)
+    else:
+        # The number of replicas along each axis should be proportional to
+        # the number of images required in that direction.
+        fac = (replica / np.prod(bvk_multiplier))**(1./3)
+        # The replication need not exceed the required number of images.
+        fac = min(fac, 1)
+        bvk_kmesh *= (fac * bvk_multiplier).astype(int)
+
+    return bvk_kmesh
diff --git a/gpu4pyscf/pbc/df/rsdf_builder.py b/gpu4pyscf/pbc/df/rsdf_builder.py
new file mode 100644
index 00000000..9f892504
--- /dev/null
+++ b/gpu4pyscf/pbc/df/rsdf_builder.py
@@ -0,0 +1,427 @@
+# Copyright 2024-2025 The PySCF Developers. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+'''
+Build GDF tensor using the range-separation integral algorithm.
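+
+A minimal usage sketch (cell/auxcell construction not shown):
+
+    cderi, cderi_neg = build_cderi(cell, auxcell, kpts)
+
+build_cderi dispatches to the gamma-point, j-only, or k-point builders below
+and returns dictionaries of Cholesky-decomposed ERIs keyed by (ki, kj).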
+'''
+
+import os
+import ctypes
+import warnings
+import numpy as np
+import cupy as cp
+from cupyx.scipy.linalg import solve_triangular
+from pyscf import lib
+#from pyscf.pbc import gto as pbcgto
+#from pyscf.pbc.gto import pseudo
+from pyscf.pbc.tools import pbc as pbctools
+from pyscf.pbc.lib.kpts_helper import is_zero
+from pyscf.pbc.df.rsdf_builder import (
+    RCUT_THRESHOLD, estimate_ke_cutoff_for_omega)
+from pyscf.pbc.df import aft as aft_cpu
+from gpu4pyscf.lib import logger
+from gpu4pyscf.lib.cupy_helper import contract, get_avail_mem
+from gpu4pyscf.pbc.df import ft_ao
+from gpu4pyscf.pbc.lib.kpts_helper import kk_adapted_iter
+from gpu4pyscf.pbc.tools.k2gamma import kpts_to_kmesh
+from gpu4pyscf.pbc.gto.cell import extract_pgto_params
+from gpu4pyscf.pbc.df.int3c2e import sr_aux_e2, estimate_rcut
+
+OMEGA_MIN = 0.3
+
+# In the ED of the j2c2e metric, the default LINEAR_DEP_THR setting in pyscf-2.8
+# is too loose; the linear-dependency truncation often leads to serious errors.
+# PBC GDF differs greatly from the molecular GDF approximation, where diffuse
+# functions typically make insignificant contributions. Diffuse auxiliary
+# crystal orbitals have a large impact on the accuracy of the Coulomb
+# integrals. A tight linear-dependency threshold has to be applied to control
+# the error, even though this may cause more numerical stability issues.
+LINEAR_DEP_THR = 1e-11
+# Use eigenvalue decomposition in decompose_j2c
+PREFER_ED = False
+
+def build_cderi(cell, auxcell, kpts=None, j_only=False,
+                omega=None, linear_dep_threshold=LINEAR_DEP_THR):
+    assert cell.low_dim_ft_type != 'inf_vacuum'
+    assert cell.dimension >= 2
+    if cell.omega != 0:
+        assert cell.omega < 0
+        omega = abs(cell.omega)
+        with_long_range = False
+    else:
+        if omega is None:
+            cell_exps, cs = extract_pgto_params(cell, 'diffused')
+            omega = cell_exps.min()**.5
+            logger.debug(cell, 'omega guess in rsdf_builder = %g', omega)
+        omega = abs(omega)
+        with_long_range = True
+
+    if kpts is None or is_zero(kpts):
+        return build_cderi_gamma_point(
+            cell, auxcell, omega, with_long_range, linear_dep_threshold)
+    elif j_only:
+        return build_cderi_j_only(
+            cell, auxcell, kpts, omega, with_long_range, linear_dep_threshold)
+    else:
+        return build_cderi_kk(
+            cell, auxcell, kpts, omega, with_long_range, linear_dep_threshold)
+
+def build_cderi_kk(cell, auxcell, kpts, omega=OMEGA_MIN, with_long_range=True,
+                   linear_dep_threshold=LINEAR_DEP_THR):
+    log = logger.new_logger(cell)
+    t0 = log.init_timer()
+    if kpts is None:
+        kpts = np.zeros((1, 3))
+        bvk_kmesh = kmesh = np.ones(3, dtype=int)
+    else:
+        # Remote images may contribute for certain k-point meshes, affecting
+        # the finite-size behavior of HFX. For a sufficiently large number of
+        # kpts, the truncation radius cell.rcut may cause finite-size errors.
+        kpts = kpts.reshape(-1, 3)
+        rcut = estimate_rcut(cell, auxcell, omega).max()
+        bvk_kmesh = kmesh = kpts_to_kmesh(cell, kpts, rcut=rcut)
+        if len(kpts) != np.prod(kmesh):
+            # When targeting many kpts, the number of kpts can exceed the
+            # number of BvK images. Use a larger radius to regenerate the MP
+            # kmesh; the new MP kmesh should cover all kpts.
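+            # The factor of 20 below is a heuristic; any radius large enough
+            # to make the regenerated Monkhorst-Pack mesh a superset of the
+            # requested kpts would serve.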
+ kmesh = kpts_to_kmesh(cell, kpts, rcut=rcut*20) + j3c = sr_aux_e2(cell, auxcell, -omega, kpts, bvk_kmesh) + t1 = log.timer('pass1: int3c2e', *t0) + + kpt_iters = list(kk_adapted_iter(kmesh)) + uniq_kpts = kpts[[x[0] for x in kpt_iters]] + log.debug('Generate auxcell 2c2e integrals') + j2c = _get_2c2e(auxcell, uniq_kpts, omega, with_long_range) # on CPU + t1 = log.timer('int2c2e', *t1) + + if with_long_range: + ft_ao_iter = _ft_ao_iter_generator(cell, auxcell, bvk_kmesh, omega, log) + + prefer_ed = PREFER_ED + if cell.dimension == 2: + prefer_ed = True + cderi = {} + cderip = {} + for j2c_idx, (kp, kp_conj, ki_idx, kj_idx) in enumerate(kpt_iters): + log.debug1('make_cderi for k-point %d %s', kp, kpts[kp]) + log.debug1('ki_idx = %s', ki_idx) + log.debug1('kj_idx = %s', kj_idx) + + if with_long_range: + '''exp(-i*(G + k) dot r) * Coulomb_kernel''' + for pqG, auxG_conj in ft_ao_iter(kpts[kp], kpts[kj_idx]): + # \sum_G coulG * ints(ij * exp(-i G * r)) * ints(P * exp(i G * r)) + # = \sum_G FT(ij, G) conj(FT(aux, G)) , where aux + # functions |P> are assumed to be real + j3c[ki_idx,kj_idx] += contract('kpqG,Gr->kpqr', pqG, auxG_conj) + + j2c_k = j2c[j2c_idx] + if kp == kp_conj: # self conjugated + # DF metric for self-conjugated k-point should be real + j2c_k = j2c_k.real + cd_j2c, cd_j2c_negative, j2ctag = decompose_j2c( + j2c_k, prefer_ed, linear_dep_threshold) + if cd_j2c.dtype != j3c.dtype: + cd_j2c = cd_j2c.astype(j3c.dtype) + + for ki, kj in zip(ki_idx, kj_idx): + j3c_k = j3c[ki,kj] + cderi[ki,kj] = _solve_cderi(cd_j2c, j3c_k, j2ctag) + if cd_j2c_negative is not None: + assert cell.dimension == 2 + cderip[ki,kj] = _solve_cderi(cd_j2c_negative, j3c_k, j2ctag) + t1 = log.timer('pass2: solve cderi', *t1) + return cderi, cderip + +def build_cderi_gamma_point(cell, auxcell, omega=OMEGA_MIN, with_long_range=True, + linear_dep_threshold=LINEAR_DEP_THR): + log = logger.new_logger(cell) + t0 = log.init_timer() + kmesh = None + kpts = None + + j3c = sr_aux_e2(cell, auxcell, -omega) + t1 = log.timer('pass1: int3c2e', *t0) + + log.debug('Generate auxcell 2c2e integrals') + j2c = _get_2c2e(auxcell, kpts, omega, with_long_range) # on CPU + j2c = j2c[0].real + t1 = log.timer('int2c2e', *t1) + + cderi = {} + cderip = {} + if with_long_range: + ft_ao_iter = _ft_ao_iter_generator(cell, auxcell, kmesh, omega, log) + for pqG, auxG_conj in ft_ao_iter(): + # \sum_G coulG * ints(ij * exp(-i G * r)) * ints(P * exp(i G * r)) + # = \sum_G FT(ij, G) conj(FT(aux, G)) , where aux + # functions |P> are assumed to be real + j3c += contract('pqG,Gr->pqr', pqG[0], auxG_conj).real + + prefer_ed = PREFER_ED + if cell.dimension == 2: + prefer_ed = True + cd_j2c, cd_j2c_negative, j2ctag = decompose_j2c( + j2c, prefer_ed, linear_dep_threshold) + + cderi[0,0] = _solve_cderi(cd_j2c, j3c, j2ctag) + if cd_j2c_negative is not None: + assert cell.dimension == 2 + cderip[0,0] = _solve_cderi(cd_j2c_negative, j3c, j2ctag) + t1 = log.timer('pass2: solve cderi', *t1) + return cderi, cderip + +def build_cderi_j_only(cell, auxcell, kpts, omega=OMEGA_MIN, with_long_range=True, + linear_dep_threshold=LINEAR_DEP_THR): + log = logger.new_logger(cell) + t0 = log.init_timer() + if kpts is None: + kpts = np.zeros((1, 3)) + bvk_kmesh = np.ones(3, dtype=int) + else: + # Coulomb integrals requires smaller kmesh to converge finite-size effects. + # A relatively small bvk_kmesh can be used for Coulomb integrals. 
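+        # Unlike build_cderi_kk, no rcut-based enlargement of the kmesh is
+        # applied here; the default mesh from kpts_to_kmesh should be
+        # sufficient for the j-only (Coulomb) integrals.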
+        kpts = kpts.reshape(-1, 3)
+        bvk_kmesh = kpts_to_kmesh(cell, kpts)
+    # TODO: time-reversal symmetry in j3c, j2c
+    j3c = sr_aux_e2(cell, auxcell, -omega, kpts, bvk_kmesh, j_only=True)
+    t1 = log.timer('pass1: int3c2e', *t0)
+
+    log.debug('Generate auxcell 2c2e integrals')
+    j2c = _get_2c2e(auxcell, None, omega, with_long_range)  # on CPU
+    j2c = j2c[0].real
+    t1 = log.timer('int2c2e', *t1)
+
+    # TODO: consider time-reversal symmetry
+    cderi = {}
+    cderip = {}
+    if with_long_range:
+        ft_ao_iter = _ft_ao_iter_generator(cell, auxcell, bvk_kmesh, omega, log)
+        kpt = np.zeros(3)
+        for pqG, auxG_conj in ft_ao_iter(kpt, kpts):
+            # \sum_G coulG * ints(ij * exp(-i G * r)) * ints(P * exp(i G * r))
+            # = \sum_G FT(ij, G) conj(FT(aux, G)) , where aux
+            # functions |P> are assumed to be real
+            j3c += contract('kpqG,Gr->kpqr', pqG, auxG_conj)
+
+    prefer_ed = PREFER_ED
+    if cell.dimension == 2:
+        prefer_ed = True
+    cd_j2c, cd_j2c_negative, j2ctag = decompose_j2c(
+        j2c, prefer_ed, linear_dep_threshold)
+    if cd_j2c.dtype != j3c.dtype:
+        cd_j2c = cd_j2c.astype(j3c.dtype)
+
+    nkpts = len(kpts)
+    for k in range(nkpts):
+        cderi[k, k] = _solve_cderi(cd_j2c, j3c[k], j2ctag)
+        if cd_j2c_negative is not None:
+            assert cell.dimension == 2
+            cderip[k, k] = _solve_cderi(cd_j2c_negative, j3c[k], j2ctag)
+    t1 = log.timer('pass2: solve cderi', *t1)
+    return cderi, cderip
+
+def _weighted_coulG_LR(cell, Gv, omega, kws, kpt=np.zeros(3)):
+    coulG = pbctools.get_coulG(cell, kpt, exx=False, Gv=Gv, omega=abs(omega))
+    coulG *= kws
+    if is_zero(kpt):
+        assert Gv[0].dot(Gv[0]) == 0
+        coulG[0] -= np.pi / omega**2 / cell.vol
+    return cp.asarray(coulG)
+
+def _ft_ao_iter_generator(cell, auxcell, bvk_kmesh, omega, verbose=None):
+    ke_cutoff = estimate_ke_cutoff_for_omega(cell, omega)
+    mesh = cell.cutoff_to_mesh(ke_cutoff)
+    mesh = cell.symmetrize_mesh(mesh)
+    Gv, Gvbase, kws = cell.get_Gv_weights(mesh)
+    ngrids = len(Gv)
+    nao = cell.nao
+
+    ft_opt = ft_ao.FTOpt(cell, bvk_kmesh=bvk_kmesh)
+    ft_kern = ft_opt.gen_ft_kernel(verbose=verbose)
+    if bvk_kmesh is None:
+        bvk_ncells = 1
+    else:
+        bvk_ncells = np.prod(bvk_kmesh)
+    avail_mem = get_avail_mem() * .8
+    Gblksize = max(16, int(avail_mem/(2*16*nao**2*bvk_ncells))//8*8)
+    Gblksize = min(Gblksize, ngrids, 16384)
+    #logger.debug1(cell, 'Gblksize = %d', Gblksize)
+    def ft_ao_iter(kpt=np.zeros(3), kpts=None):
+        coulG = _weighted_coulG_LR(auxcell, Gv, omega, kws, kpt)
+        auxG_conj = cp.asarray(ft_ao.ft_ao(auxcell, Gv, kpt=kpt).conj(), order='C')
+        auxG_conj *= cp.asarray(coulG[:,None])
+        for p0, p1 in lib.prange(0, ngrids, Gblksize):
+            pqG = ft_kern(Gv[p0:p1], kpt, kpts).transpose(0,2,3,1)
+            yield pqG, auxG_conj[p0:p1]
+    return ft_ao_iter
+
+def decompose_j2c(j2c, prefer_ed=PREFER_ED, linear_dep_threshold=LINEAR_DEP_THR):
+    if prefer_ed:
+        return eigenvalue_decomposed_metric(j2c, linear_dep_threshold)
+    else:
+        return cholesky_decomposed_metric(j2c)
+
+def cholesky_decomposed_metric(j2c):
+    '''Return L for j2c = L L^T'''
+    j2c_negative = None
+    j2ctag = 'CD'
+    # CuPy's cholesky does not check positive definiteness; it appears to
+    # silently return NaN entries in the resulting Cholesky factor.
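+    # The isnan check on the last diagonal element after the factorization
+    # below therefore serves as a cheap positive-definiteness test.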
+ j2c = cp.asarray(j2c) + j2c = cp.linalg.cholesky(j2c) + if cp.isnan(j2c[-1,-1]): + raise RuntimeError('j2c is not positive definite') + return j2c, j2c_negative, j2ctag + +def eigenvalue_decomposed_metric(j2c, linear_dep_threshold=LINEAR_DEP_THR): + j2c = cp.asarray(j2c) + w, v = cp.linalg.eigh(j2c) + mask = w > linear_dep_threshold + v1 = v[:,mask].conj().T + v1 *= w[mask, None]**-.5 + j2c = v1 + idx = cp.where(w < -linear_dep_threshold)[0] + j2c_negative = None + if len(idx) > 0: + j2c_negative = (v[:,idx] * (-w[idx])**-.5).conj().T + j2ctag = 'ED' + return j2c, j2c_negative, j2ctag + +# Create 2c2e, store on CPU +def _get_2c2e(auxcell, uniq_kpts, omega, with_long_range=True): + # j2c ~ (-kpt_ji | kpt_ji) => hermi=1 + precision = auxcell.precision ** 1.5 + aux_exps, aux_cs = extract_pgto_params(auxcell, 'diffused') + aux_exp = aux_exps.min() + theta = 1./(2./aux_exp + omega**-2) + rad = auxcell.vol**(-1./3) * auxcell.rcut + 1 + surface = 4*np.pi * rad**2 + lattice_sum_factor = 2*np.pi*auxcell.rcut/(auxcell.vol*theta) + surface + rcut_sr = (np.log(lattice_sum_factor / precision + 1.) / theta)**.5 + logger.debug1(auxcell, 'auxcell rcut_sr = %g', rcut_sr) + auxcell_sr = auxcell.copy() + auxcell_sr.rcut = rcut_sr + with auxcell_sr.with_short_range_coulomb(omega): + j2c = auxcell_sr.pbc_intor('int2c2e', hermi=1, kpts=uniq_kpts) + + if not with_long_range: + return j2c + + ke = estimate_ke_cutoff_for_omega(auxcell, omega, precision) + mesh = auxcell.cutoff_to_mesh(ke) + mesh = auxcell.symmetrize_mesh(mesh) + logger.debug(auxcell, 'Set 2c2e integrals precision %g, mesh %s', precision, mesh) + + Gv, Gvbase, kws = auxcell.get_Gv_weights(mesh) + b = auxcell.reciprocal_vectors() + gxyz = lib.cartesian_prod([np.arange(len(x)) for x in Gvbase]) + ngrids = Gv.shape[0] + naux = auxcell.nao + max_memory = max(1000, auxcell.max_memory - lib.current_memory()[0]) + blksize = min(ngrids, int(max_memory*.4e6/16/naux), 200000) + logger.debug2(auxcell, 'max_memory %s (MB) blocksize %s', max_memory, blksize) + + if uniq_kpts is None: + j2c = cp.asarray(j2c) + coulG_LR = _weighted_coulG_LR(auxcell, Gv, omega, kws) + for p0, p1 in lib.prange(0, ngrids, blksize): + auxG = ft_ao.ft_ao(auxcell, Gv[p0:p1], None, b, gxyz[p0:p1], Gvbase).T + j2c += (auxG.conj() * coulG_LR[p0:p1]).dot(auxG.T).real + auxG = None + j2c = [j2c.real.get()] + else: + for k, kpt in enumerate(uniq_kpts): + j2c_k = cp.asarray(j2c[k]) + coulG_LR = _weighted_coulG_LR(auxcell, Gv, omega, kws, kpt) + gamma_point = is_zero(kpt) + + for p0, p1 in lib.prange(0, ngrids, blksize): + auxG = ft_ao.ft_ao(auxcell, Gv[p0:p1], None, b, gxyz[p0:p1], Gvbase, kpt).T + if gamma_point: + j2c_k += (auxG.conj() * coulG_LR[p0:p1]).dot(auxG.T).real + else: + j2c_k += (auxG.conj() * coulG_LR[p0:p1]).dot(auxG.T) + auxG = None + j2c[k] = j2c_k.get() + return j2c + +def _solve_cderi(cd_j2c, j3c, j2ctag): + if j2ctag == 'ED': + return contract('Lr,pqr->Lpq', cd_j2c, j3c) + else: + nao, naux = j3c.shape[1:3] + j3c = solve_triangular(cd_j2c, j3c.reshape(-1,naux).T, lower=True) + return j3c.reshape(naux,nao,nao) + +def get_pp_loc_part1(cell, kpts=None, with_pseudo=True, verbose=None): + fakenuc = aft_cpu._fake_nuc(cell, with_pseudo=with_pseudo) + cell_exps, cs = extract_pgto_params(cell, 'diffused') + omega = (2*cell_exps.min())**.5 + logger.debug(cell, 'omega guess in get_pp_loc_part1 = %g', omega) + + if kpts is None or is_zero(kpts): + kpts = None + bvk_kmesh = np.ones(3, dtype=int) + else: + bvk_kmesh = kpts_to_kmesh(cell, kpts) + nuc = sr_aux_e2(cell, fakenuc, -omega, 
kpts, bvk_kmesh, j_only=True) + charges = -cp.asarray(cell.atom_charges()) + if kpts is None: + nuc = contract('pqr,r->pq', nuc, charges) + else: + nuc = contract('kpqr,r->kpq', nuc, charges) + + # TODO: consider time-reversal symmetry + ft_ao_iter = _ft_ao_iter_generator(cell, fakenuc, bvk_kmesh, omega, verbose) + kpt = np.zeros(3) + for i, (pqG, auxG_conj) in enumerate(ft_ao_iter(kpt, kpts)): + ZG = auxG_conj.dot(charges) + # contributions due to pseudo.pp_int.get_gth_vlocG_part1 + if (with_pseudo and i == 0 and + (cell.dimension == 3 or + (cell.dimension == 2 and cell.low_dim_ft_type != 'inf_vacuum'))): + exps = cp.asarray(np.hstack(fakenuc.bas_exps())) + ZG[0] -= charges.dot(np.pi/exps) / cell.vol + if kpts is None: + nuc += contract('pqG,G->pq', pqG[0], ZG).real + else: + nuc += contract('kpqG,G->kpq', pqG, ZG) + return nuc + +def get_nuc(cell, kpts=None): + '''Get the periodic nuc-el AO matrix, with G=0 removed. + ''' + log = logger.new_logger(cell) + t0 = log.init_timer() + nuc = get_pp_loc_part1(cell, kpts, with_pseudo=False, verbose=log) + log.timer('get_nuc', *t0) + return nuc + +def get_pp(cell, kpts=None): + '''Get the periodic pseudopotential nuc-el ao matrix, with G=0 removed. + ''' + from pyscf.pbc.gto import pseudo + log = logger.new_logger(cell) + t0 = log.init_timer() + pp2builder = aft_cpu._IntPPBuilder(cell, kpts) + vpp = cp.asarray(pp2builder.get_pp_loc_part2()) + t1 = log.timer_debug1('get_pp_loc_part2', *t0) + vpp += cp.asarray(pseudo.pp_int.get_pp_nl(cell, kpts)) + t1 = log.timer_debug1('get_pp_nl', *t1) + + vpp += get_pp_loc_part1(cell, kpts, with_pseudo=True, verbose=log) + t1 = log.timer_debug1('get_pp_loc_part1', *t1) + log.timer('get_pp', *t0) + return vpp diff --git a/gpu4pyscf/pbc/df/tests/test_pbc_aft.py b/gpu4pyscf/pbc/df/tests/test_pbc_aft.py index 6ca1d627..98ddad61 100644 --- a/gpu4pyscf/pbc/df/tests/test_pbc_aft.py +++ b/gpu4pyscf/pbc/df/tests/test_pbc_aft.py @@ -1,4 +1,4 @@ -# Copyright 2021-2024 The PySCF Developers. All Rights Reserved. +# Copyright 2024-2025 The PySCF Developers. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -29,10 +29,9 @@ def setUpModule(): 'C' :[[0, [1., 1]]],} cell.pseudo = {'C':'gth-pade'} cell.a = np.eye(3) * 2.5 + cell.precision = 1e-8 cell.build() - np.random.seed(1) - kpts = np.random.random((4,3)) - kpts[3] = kpts[0]-kpts[1]+kpts[2] + kpts = cell.make_kpts([13,1,1])[4:8] cell1 = pgto.Cell() cell1.atom = 'He 1. 
.5 .5; He .1 1.3 2.1' @@ -49,22 +48,22 @@ class KnownValues(unittest.TestCase): def test_aft_get_pp(self): ref = aft_cpu.AFTDF(cell, kpts[0]).get_pp() v1 = aft.AFTDF(cell, kpts[0]).get_pp().get() - assert abs(v1 - ref).max() < 1e-12 + assert abs(v1 - ref).max() < 1e-9 kpts4 = cell.make_kpts([4,1,1]) ref = aft_cpu.AFTDF(cell, kpts4).get_pp() v1 = aft.AFTDF(cell, kpts4).get_pp().get() - assert abs(v1 - ref).max() < 1e-12 + assert abs(v1 - ref).max() < 1e-9 def test_aft_get_nuc(self): ref = aft_cpu.AFTDF(cell, kpts[0]).get_nuc() v1 = aft.AFTDF(cell, kpts[0]).get_nuc().get() - assert abs(v1 - ref).max() < 1e-12 + assert abs(v1 - ref).max() < 1e-9 kpts4 = cell.make_kpts([4,1,1]) ref = aft_cpu.AFTDF(cell, kpts4).get_nuc() v1 = aft.AFTDF(cell, kpts4).get_nuc().get() - assert abs(v1 - ref).max() < 1e-12 + assert abs(v1 - ref).max() < 1e-9 def test_jk(self): mesh = [11]*3 @@ -76,15 +75,15 @@ def test_jk(self): dm = np.random.random((nao,nao)) jref, kref = mydf0.get_jk(dm, hermi=0, exxdiv='ewald') vj, vk = mydf.get_jk(dm, hermi=0, exxdiv='ewald') - assert abs(vj.get() - jref).max() < 1e-12 - assert abs(vk.get() - kref).max() < 1e-12 + assert abs(vj.get() - jref).max() < 1e-9 + assert abs(vk.get() - kref).max() < 1e-9 dm = dm + np.random.random((nao,nao)) * 1j dm = dm + dm.conj().T jref, kref = mydf0.get_jk(dm, hermi=1, exxdiv='ewald') vj, vk = mydf.get_jk(dm, hermi=1, exxdiv='ewald') - assert abs(vj.get() - jref).max() < 1e-12 - assert abs(vk.get() - kref).max() < 1e-12 + assert abs(vj.get() - jref).max() < 1e-9 + assert abs(vk.get() - kref).max() < 1e-9 def test_jk_complex_dm(self): scaled_center = [0.3728,0.5524,0.7672] @@ -98,14 +97,14 @@ def test_jk_complex_dm(self): dm = np.random.random((nao,nao)) + np.random.random((nao,nao)) * 1j jref, kref = mydf0.get_jk(dm, hermi=0, kpts=kpt, exxdiv='ewald') vj, vk = mydf.get_jk(dm, hermi=0, kpts=kpt, exxdiv='ewald') - assert abs(vj.get() - jref).max() < 1e-12 - assert abs(vk.get() - kref).max() < 1e-12 + assert abs(vj.get() - jref).max() < 1e-9 + assert abs(vk.get() - kref).max() < 1e-9 dm = dm + dm.conj().T jref, kref = mydf0.get_jk(dm, hermi=1, kpts=kpt, exxdiv='ewald') vj, vk = mydf.get_jk(dm, hermi=1, kpts=kpt, exxdiv='ewald') - assert abs(vj.get() - jref).max() < 1e-12 - assert abs(vk.get() - kref).max() < 1e-12 + assert abs(vj.get() - jref).max() < 1e-9 + assert abs(vk.get() - kref).max() < 1e-9 def test_aft_j(self): kpts = np.random.random((4,3)) @@ -120,7 +119,7 @@ def test_aft_j(self): dm = dm + dm.transpose(0,2,1) jref = mydf0.get_jk(dm, with_k=False)[0] vj = mydf.get_jk(dm, with_k=False)[0] - assert abs(vj.get() - jref).max() < 1e-12 + assert abs(vj.get() - jref).max() < 1e-9 def test_aft_k(self): kpts = cell.get_abs_kpts([[-.25,-.25,-.25], @@ -141,7 +140,7 @@ def test_aft_k(self): dm = np.random.random((nkpts,nao,nao)) kref = mydf0.get_jk(dm, hermi=0, with_j=False)[1] vk = mydf.get_jk(dm, hermi=0, with_j=False)[1] - assert abs(vk.get() - kref).max() < 1e-12 + assert abs(vk.get() - kref).max() < 1e-9 def test_aft_k1(self): kpts = cell.get_abs_kpts([[-.25,-.25,-.25], @@ -163,7 +162,7 @@ def test_aft_k1(self): dm = dm + dm.transpose(0,2,1) kref = mydf0.get_jk(dm, hermi=1, with_j=False)[1] vk = mydf.get_jk(dm, hermi=1, with_j=False)[1] - assert abs(vk.get() - kref).max() < 1e-12 + assert abs(vk.get() - kref).max() < 1e-9 def test_aft_k2(self): kpts = cell.make_kpts([2,1,1]) @@ -183,7 +182,7 @@ def test_aft_k2(self): kref = mydf0.get_jk(dm, hermi=1, with_j=False)[1] vk = mydf.get_jk(dm, hermi=1, with_j=False)[1] - assert abs(vk.get() - kref).max() 
< 1e-12 + assert abs(vk.get() - kref).max() < 1e-9 def test_aft_k3(self): kpts = cell.make_kpts([6,1,1]) @@ -205,7 +204,7 @@ def test_aft_k3(self): kref = mydf0.get_jk(dm, hermi=1, with_j=False)[1] vk = mydf.get_jk(dm, hermi=1, with_j=False)[1] - assert abs(vk.get() - kref).max() < 1e-12 + assert abs(vk.get() - kref).max() < 1e-9 if __name__ == '__main__': print("Full Tests for aft") diff --git a/gpu4pyscf/pbc/df/tests/test_pbc_df.py b/gpu4pyscf/pbc/df/tests/test_pbc_df.py index e89cc8a0..fcc22837 100644 --- a/gpu4pyscf/pbc/df/tests/test_pbc_df.py +++ b/gpu4pyscf/pbc/df/tests/test_pbc_df.py @@ -1,4 +1,4 @@ -# Copyright 2021-2024 The PySCF Developers. All Rights Reserved. +# Copyright 2024-2025 The PySCF Developers. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -28,6 +28,7 @@ def setUpModule(): 'C' :[[0, [1., 1]]],} cell.pseudo = {'C':'gth-pade'} cell.a = np.eye(3) * 2.5 + cell.precision = 1e-8 cell.build() def tearDownModule(): @@ -37,15 +38,19 @@ def tearDownModule(): class KnownValues(unittest.TestCase): def test_get_pp(self): - kpt = cell.make_kpts([9,6,5])[107] - ref = df_cpu.GDF(cell, kpt).get_pp() - v1 = GDF(cell, kpt).get_pp().get() - assert abs(v1 - ref).max() < 1e-12 + #kpt = cell.make_kpts([9,6,5])[107] + #ref = df_cpu.GDF(cell, kpt).get_pp() + #v1 = GDF(cell, kpt).get_pp().get() + #assert abs(v1 - ref).max() < 1e-8 + + ref = df_cpu.GDF(cell).get_pp() + v1 = GDF(cell).get_pp().get() + assert abs(v1 - ref).max() < 1e-8 kpts4 = cell.make_kpts([4,1,1]) ref = df_cpu.GDF(cell, kpts4).get_pp() v1 = GDF(cell, kpts4).get_pp().get() - assert abs(v1 - ref).max() < 1e-12 + assert abs(v1 - ref).max() < 1e-8 def test_get_nuc(self): L = 5. @@ -56,18 +61,18 @@ def test_get_nuc(self): cell1.atom = '''He 3. 2. 3. He 1. 1. 
1.''' cell1.basis = 'ccpvdz' - cell1.precision=1e-12 + cell1.precision=1e-8 cell1.verbose = 0 cell1.max_memory = 1000 cell1.build(0,0) ref = df_cpu.GDF(cell1).get_nuc() v1 = GDF(cell1).get_nuc().get() - assert abs(v1 - ref).max() < 1e-12 + assert abs(v1 - ref).max() < 1e-8 kpts4 = cell1.make_kpts([4,1,1]) ref = df_cpu.GDF(cell1, kpts4).get_nuc() v1 = GDF(cell1, kpts4).get_nuc().get() - assert abs(v1 - ref).max() < 1e-12 + assert abs(v1 - ref).max() < 1e-8 def test_jk(self): mydf0 = df_cpu.GDF(cell) @@ -78,16 +83,38 @@ def test_jk(self): dm = np.random.random((nao,nao)) jref, kref = mydf0.get_jk(dm, hermi=0, exxdiv='ewald') vj, vk = mydf.get_jk(dm, hermi=0, exxdiv='ewald') - assert abs(vj.get() - jref).max() < 1e-12 - assert abs(vk.get() - kref).max() < 1e-12 + assert abs(vj.get() - jref).max() < 1e-8 + assert abs(vk.get() - kref).max() < 1e-8 dm = dm + np.random.random((nao,nao)) * 1j dm = dm + dm.conj().T jref, kref = mydf0.get_jk(dm, hermi=1, exxdiv='ewald') vj, vk = mydf.get_jk(dm, hermi=1, exxdiv='ewald') - assert abs(vj.get() - jref).max() < 1e-12 - assert abs(vk.get() - kref).max() < 1e-12 + assert abs(vj.get() - jref).max() < 1e-8 + assert abs(vk.get() - kref).max() < 1e-8 + + def test_jk1(self): + kpts = cell.make_kpts([1,6,1]) + nkpts = len(kpts) + mydf0 = df_cpu.GDF(cell, kpts) + mydf = GDF(cell, kpts) + + nao = cell.nao + np.random.seed(12) + dm = (np.random.random((nkpts, nao, nao)) + + np.random.random((nkpts, nao, nao))*1j) + jref, kref = mydf0.get_jk(dm, hermi=0, exxdiv='ewald') + vj, vk = mydf.get_jk(dm, hermi=0, exxdiv='ewald') + assert abs(vj.get() - jref).max() < 1e-8 + assert abs(vk.get() - kref).max() < 1e-8 + + dm = dm + dm.conj().transpose(0,2,1) + jref, kref = mydf0.get_jk(dm, hermi=1, exxdiv='ewald') + vj, vk = mydf.get_jk(dm, hermi=1, exxdiv='ewald') + assert abs(vj.get() - jref).max() < 1e-8 + assert abs(vk.get() - kref).max() < 1e-8 + @unittest.skip('pbc-gdf only supports Monkhorst-Pack k-mesh') def test_jk_complex_dm(self): scaled_center = [0.3728,0.5524,0.7672] kpt = cell.make_kpts([1,1,1], scaled_center=scaled_center)[0] @@ -99,15 +126,16 @@ def test_jk_complex_dm(self): dm = np.random.random((nao,nao)) + np.random.random((nao,nao)) * 1j jref, kref = mydf0.get_jk(dm, hermi=0, kpts=kpt, exxdiv='ewald') vj, vk = mydf.get_jk(dm, hermi=0, kpts=kpt, exxdiv='ewald') - assert abs(vj.get() - jref).max() < 1e-12 - assert abs(vk.get() - kref).max() < 1e-12 + assert abs(vj.get() - jref).max() < 1e-8 + assert abs(vk.get() - kref).max() < 1e-8 dm = dm + dm.conj().T jref, kref = mydf0.get_jk(dm, hermi=1, kpts=kpt, exxdiv='ewald') vj, vk = mydf.get_jk(dm, hermi=1, kpts=kpt, exxdiv='ewald') - assert abs(vj.get() - jref).max() < 1e-12 - assert abs(vk.get() - kref).max() < 1e-12 + assert abs(vj.get() - jref).max() < 1e-8 + assert abs(vk.get() - kref).max() < 1e-8 + @unittest.skip('pbc-gdf only supports Monkhorst-Pack k-mesh') def test_get_j(self): kpts = np.random.random((4,3)) nkpts = len(kpts) @@ -120,8 +148,9 @@ def test_get_j(self): dm = dm + dm.transpose(0,2,1) jref = mydf0.get_jk(dm, with_k=False)[0] vj = mydf.get_jk(dm, with_k=False)[0] - assert abs(vj.get() - jref).max() < 1e-12 + assert abs(vj.get() - jref).max() < 1e-8 + @unittest.skip('pbc-gdf only supports Monkhorst-Pack k-mesh') def test_get_k(self): kpts = cell.get_abs_kpts([[-.25,-.25,-.25], [-.25,-.25, .25], @@ -140,8 +169,9 @@ def test_get_k(self): dm = np.random.random((nkpts,nao,nao)) kref = mydf0.get_jk(dm, hermi=0, with_j=False)[1] vk = mydf.get_jk(dm, hermi=0, with_j=False)[1] - assert abs(vk.get() - 
kref).max() < 1e-12 + assert abs(vk.get() - kref).max() < 1e-8 + @unittest.skip('pbc-gdf only supports Monkhorst-Pack k-mesh') def test_get_k1(self): kpts = cell.get_abs_kpts([[-.25,-.25,-.25], [-.25,-.25, .25], @@ -161,11 +191,10 @@ def test_get_k1(self): dm = dm + dm.transpose(0,2,1) kref = mydf0.get_jk(dm, hermi=1, with_j=False)[1] vk = mydf.get_jk(dm, hermi=1, with_j=False)[1] - assert abs(vk.get() - kref).max() < 1e-12 + assert abs(vk.get() - kref).max() < 1e-8 - @unittest.skip('build_k from MO coefficients') def test_get_k2(self): - kpts = cell.make_kpts([2,1,1]) + kpts = cell.make_kpts([3,1,1]) nkpts = len(kpts) mydf0 = df_cpu.GDF(cell, kpts=kpts) mydf = GDF(cell, kpts=kpts) @@ -176,14 +205,13 @@ def test_get_k2(self): mo = (np.random.random((nkpts,nao,nocc)) + np.random.random((nkpts,nao,nocc))*1j) mo_occ = np.ones((nkpts,nocc)) - dm = np.random.rand(nkpts, nao, nao) + dm = np.einsum('kpi,kqi->kpq', mo, mo.conj()) dm = lib.tag_array(dm, mo_coeff=mo, mo_occ=mo_occ) kref = mydf0.get_jk(dm, hermi=1, with_j=False)[1] vk = mydf.get_jk(dm, hermi=1, with_j=False)[1] - assert abs(vk.get() - kref).max() < 1e-12 + assert abs(vk.get() - kref).max() < 1e-8 - @unittest.skip('build_k from MO coefficients') def test_get_k3(self): kpts = cell.make_kpts([6,1,1]) nkpts = len(kpts) @@ -197,12 +225,12 @@ def test_get_k3(self): mo = (np.random.random((nkpts,nao,nocc)) + np.random.random((nkpts,nao,nocc))*1j) mo_occ = np.ones((nkpts,nocc)) - dm = np.random.rand(nkpts, nao, nao) + dm = np.einsum('kpi,kqi->kpq', mo, mo.conj()) dm = lib.tag_array(dm, mo_coeff=mo, mo_occ=mo_occ) kref = mydf0.get_jk(dm, hermi=1, with_j=False)[1] vk = mydf.get_jk(dm, hermi=1, with_j=False)[1] - assert abs(vk.get() - kref).max() < 1e-12 + assert abs(vk.get() - kref).max() < 1e-8 if __name__ == '__main__': print("Full Tests for PBC DF") diff --git a/gpu4pyscf/pbc/df/tests/test_pbc_ft_ao.py b/gpu4pyscf/pbc/df/tests/test_pbc_ft_ao.py index 55646945..ee77c401 100644 --- a/gpu4pyscf/pbc/df/tests/test_pbc_ft_ao.py +++ b/gpu4pyscf/pbc/df/tests/test_pbc_ft_ao.py @@ -1,4 +1,4 @@ -# Copyright 2021-2024 The PySCF Developers. All Rights Reserved. +# Copyright 2024-2025 The PySCF Developers. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/gpu4pyscf/pbc/df/tests/test_pbc_int3c2e.py b/gpu4pyscf/pbc/df/tests/test_pbc_int3c2e.py new file mode 100644 index 00000000..3238806a --- /dev/null +++ b/gpu4pyscf/pbc/df/tests/test_pbc_int3c2e.py @@ -0,0 +1,153 @@ +# Copyright 2024-2025 The PySCF Developers. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
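+
+'''
+Tests for the short-range PBC 3c2e GPU integrals (sr_aux_e2), validated
+against reference integrals from the CPU _RSGDFBuilder in pyscf.
+'''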
+ +import unittest +import numpy as np +import pyscf +from pyscf import lib +from pyscf.pbc.df import rsdf_builder +from gpu4pyscf.pbc.df.int3c2e import sr_aux_e2 + + +def test_int3c2e_gamma_point(): + cell = pyscf.M( + atom='''C1 1.3 .2 .3 + C2 .19 .1 1.1 + ''', + basis={'C1': [[3, [1.1, 1.]], + [4, [2., 1.]]], + 'C2': 'ccpvdz'}, + precision = 1e-8, + a=np.diag([2.5, 1.9, 2.2])*3) + + auxcell = cell.copy() + auxcell.basis = { + 'C1':''' +C P + 102.9917624900 1.0000000000 +C P + 28.1325940100 1.0000000000 +C P + 9.8364318200 1.0000000000 +C P + 3.3490545000 1.0000000000 +C P + 1.4947618600 1.0000000000 +C P + 0.5769010900 1.0000000000 +C D + 0.1995412500 1.0000000000 ''', + 'C2':[[0, [.5, 1.]]], + } + auxcell.build() + omega = -0.2 + dat = sr_aux_e2(cell, auxcell, omega).get() + + cell.precision=1e-10 + cell.build() + df = rsdf_builder._RSGDFBuilder(cell, auxcell).build(omega=abs(omega)) + int3c = df.gen_int3c_kernel('int3c2e', aosym='s1', return_complex=True) + ref = int3c().reshape(dat.shape) + assert abs(dat - ref).max() < 1e-8 + +def test_int3c2e_kpoints(): + cell = pyscf.M( + atom='''H1 1.3 .2 .3 + H2 .19 .1 1.1 + ''', + basis='ccpvdz', + precision = 1e-8, + a=np.diag([2.5, 1.9, 2.2])*4) + auxcell = cell.copy() + auxcell.basis = [[0, [3.5, 1.]], + [0, [1.1, 1.]], + [1, [0.7, 1.]], + [2, [1.5, 1.]]] + auxcell.build() + kpts = cell.make_kpts([5,1,1]) + omega = -0.2 + dat = sr_aux_e2(cell, auxcell, omega, kpts).get() + + cell.precision=1e-10 + cell.build() + df = rsdf_builder._RSGDFBuilder(cell, auxcell, kpts).build(omega=abs(omega)) + int3c = df.gen_int3c_kernel('int3c2e', aosym='s1', return_complex=True) + ref = int3c().reshape(dat.shape) + assert abs(dat - ref).max() < 1e-8 + +def test_minor_diffused_basis(): + cell = pyscf.M( + atom='''H 1.3 .2 .3 + H .19 .1 1.1 + ''', + basis=''' +C S + 7.5 0.40 + 2.6 0.90 + 0.5 0.08''', + precision = 1e-8, + a=np.diag([2.5, 1.9, 2.2])*3) + auxcell = cell.copy() + auxcell.basis = ''' +C P + 1.4947618600 1.0000000000 +C P + 0.5769010900 1.0000000000 +C D + 0.1995412500 1.0000000000 ''' + auxcell.build() + omega = -0.2 + dat = sr_aux_e2(cell, auxcell, omega).get() + + cell.precision=1e-12 + cell.build() + df = rsdf_builder._RSGDFBuilder(cell, auxcell).build(omega=abs(omega)) + int3c = df.gen_int3c_kernel('int3c2e', aosym='s1', return_complex=True) + ref = int3c().reshape(dat.shape) + assert abs(dat - ref).max() < 1e-8 + +def test_ignorable_diffused_basis(): + cell = pyscf.M( + atom='''H 1.3 .2 .3 + H .19 .1 1.1 + ''', + basis=''' +C S + 7.5 0.4000000 + 2.6 0.9000000 + 0.5 0.0000002''', + precision = 1e-8, + a=np.diag([2.5, 1.9, 2.2])*3) + auxcell = cell.copy() + auxcell.basis = ''' +C P + 1.4947618600 1.0000000000 +C P + 0.5769010900 1.0000000000 +C D + 0.1995412500 1.0000000000 ''' + auxcell.build() + omega = -0.2 + cell.verbose = 6 + dat = sr_aux_e2(cell, auxcell, omega).get() + + cell.basis=''' +C S + 7.5 0.4000000 + 2.6 0.9000000''' + cell.build() + df = rsdf_builder._RSGDFBuilder(cell, auxcell).build(omega=abs(omega)) + int3c = df.gen_int3c_kernel('int3c2e', aosym='s1', return_complex=True) + ref = int3c().reshape(dat.shape) + assert abs(dat - ref).max() < 1e-6 diff --git a/gpu4pyscf/pbc/df/tests/test_rsdf_builder.py b/gpu4pyscf/pbc/df/tests/test_rsdf_builder.py new file mode 100644 index 00000000..0d77cfb0 --- /dev/null +++ b/gpu4pyscf/pbc/df/tests/test_rsdf_builder.py @@ -0,0 +1,177 @@ +# Copyright 2024-2025 The PySCF Developers. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import tempfile +import numpy as np +import pyscf +from pyscf.pbc.df.rsdf_builder import _RSGDFBuilder +from pyscf.pbc.df.df import _load3c +from gpu4pyscf.pbc.df.rsdf_builder import build_cderi + +def test_gamma_point(): + cell = pyscf.M( + atom='''C1 1.3 .2 .3 + C2 .19 .1 1.1 + ''', + basis={'C1': [[0, [1.1, 1.]], + [1, [2., 1.]]], + 'C2': 'ccpvdz'}, + a=np.diag([2.5, 1.9, 2.2])*3) + + auxcell = cell.copy() + auxcell.basis = { + 'C1':''' +C S + 12.9917624900 1.0000000000 +C S + 2.1325940100 1.0000000000 +C P + 9.8364318200 1.0000000000 +C P + 3.3490545000 1.0000000000 +C P + 1.4947618600 1.0000000000 +C P + 0.5769010900 1.0000000000 +C D + 0.1995412500 1.0000000000 ''', + 'C2':[[0, [.5, 1.]]], + } + auxcell.build() + omega = 0.3 + gpu_dat, dat_neg = build_cderi(cell, auxcell, kpts=None, omega=omega) + + cell.precision = 1e-10 + auxcell.precision = 1e-10 + kpts = cell.make_kpts([1,1,1]) + dfbuilder = _RSGDFBuilder(cell, auxcell, kpts) + dfbuilder.omega = omega + dfbuilder.j2c_eig_always = False + dfbuilder.fft_dd_block = True + dfbuilder.exclude_d_aux = True + naux = auxcell.nao + nao = cell.nao + with tempfile.NamedTemporaryFile() as tmpf: + dfbuilder.make_j3c(tmpf.name, aosym='s1') + with _load3c(tmpf.name, 'j3c', kpts[[0,0]]) as cderi: + ref = abs(cderi[:].reshape(naux,nao,nao)) + dat = abs(gpu_dat[0,0].get()) + assert abs(dat - ref).max() < 1e-8 + +def test_kpts(): + cell = pyscf.M( + atom='''C1 1.3 .2 .3 + C2 .19 .1 1.1 + ''', + basis={'C1': [[0, [1.1, 1.]], + [1, [2., 1.]]], + 'C2': 'ccpvdz'}, + a=np.diag([2.5, 1.9, 2.2])*3) + + auxcell = cell.copy() + auxcell.basis = { + 'C1':''' +C S + 12.9917624900 1.0000000000 +C S + 2.1325940100 1.0000000000 +C P + 9.8364318200 1.0000000000 +C P + 3.3490545000 1.0000000000 +C P + 1.4947618600 1.0000000000 +C P + 0.5769010900 1.0000000000 +C D + 0.1995412500 1.0000000000 ''', + 'C2':[[0, [.5, 1.]]], + } + auxcell.build() + omega = 0.3 + kmesh = [6,1,1] + kpts = cell.make_kpts(kmesh) + gpu_dat, dat_neg = build_cderi(cell, auxcell, kpts, omega=omega) + + cell.precision = 1e-10 + auxcell.precision = 1e-10 + dfbuilder = _RSGDFBuilder(cell, auxcell, kpts) + dfbuilder.omega = omega + dfbuilder.j2c_eig_always = False + dfbuilder.fft_dd_block = True + dfbuilder.exclude_d_aux = True + naux = auxcell.nao + nao = cell.nao + with tempfile.NamedTemporaryFile() as tmpf: + dfbuilder.make_j3c(tmpf.name, aosym='s1') + for ki, kj in gpu_dat: + with _load3c(tmpf.name, 'j3c', kpts[[ki,kj]]) as cderi: + ref = abs(cderi[:].reshape(naux,nao,nao)) + dat = abs(gpu_dat[ki,kj].get()) + print(ki,kj) + assert abs(dat - ref).max() < 1e-8 + +def test_kpts_j_only(): + cell = pyscf.M( + atom='''C1 1.3 .2 .3 + C2 .19 .1 1.1 + ''', + basis={'C1': [[0, [1.1, 1.]], + [1, [2., 1.]]], + 'C2': 'ccpvdz'}, + a=np.diag([2.5, 1.9, 2.2])*3) + + auxcell = cell.copy() + auxcell.basis = { + 'C1':''' +C S + 12.9917624900 1.0000000000 +C S + 2.1325940100 1.0000000000 +C P + 9.8364318200 1.0000000000 +C P + 3.3490545000 1.0000000000 +C P + 
1.4947618600 1.0000000000 +C P + 0.5769010900 1.0000000000 +C D + 0.1995412500 1.0000000000 ''', + 'C2':[[0, [.5, 1.]]], + } + auxcell.build() + omega = 0.3 + kmesh = [1,3,4] + kpts = cell.make_kpts(kmesh) + gpu_dat, dat_neg = build_cderi(cell, auxcell, kpts, omega=omega, j_only=True) + + cell.precision = 1e-10 + auxcell.precision = 1e-10 + dfbuilder = _RSGDFBuilder(cell, auxcell, kpts) + dfbuilder.j_only = True + dfbuilder.omega = omega + dfbuilder.j2c_eig_always = False + dfbuilder.fft_dd_block = True + dfbuilder.exclude_d_aux = True + naux = auxcell.nao + nao = cell.nao + with tempfile.NamedTemporaryFile() as tmpf: + dfbuilder.make_j3c(tmpf.name, aosym='s1', j_only=True) + for ki, kj in gpu_dat: + with _load3c(tmpf.name, 'j3c', kpts[[ki,kj]]) as cderi: + ref = abs(cderi[:].reshape(naux,nao,nao)) + dat = abs(gpu_dat[ki,kj].get()) + print(ki,kj) + assert abs(dat - ref).max() < 1e-8 diff --git a/gpu4pyscf/pbc/dft/gen_grid.py b/gpu4pyscf/pbc/dft/gen_grid.py index 8cac0d01..66b362d2 100644 --- a/gpu4pyscf/pbc/dft/gen_grid.py +++ b/gpu4pyscf/pbc/dft/gen_grid.py @@ -16,10 +16,14 @@ import numpy as np import cupy as cp from pyscf import lib -from pyscf.lib import logger from pyscf.pbc.dft import gen_grid as gen_grid_cpu from pyscf.pbc.gto.cell import get_uniform_grids -from gpu4pyscf.lib import utils +from gpu4pyscf.dft import Grids +from gpu4pyscf.lib import utils, logger + +__all__ = [ + 'UniformGrids', 'BeckeGrids', 'AtomicGrids' +] class UniformGrids(lib.StreamObject): '''Uniform Grid class.''' @@ -66,8 +70,31 @@ def size(self): kernel = gen_grid_cpu.UniformGrids.kernel to_gpu = utils.to_gpu - device = utils.device to_cpu = utils.to_cpu -class BeckeGrids: - pass + +class BeckeGrids(Grids): + '''Atomic grids for all-electron calculation.''' + def __init__(self, cell): + self.cell = cell + Grids.__init__(self, cell) + + def build(self, cell=None, with_non0tab=False): + if cell is None: cell = self.cell + coords, weights = gen_grid_cpu.get_becke_grids( + self.cell, self.atom_grid, radi_method=self.radi_method, + level=self.level, prune=self.prune) + self.coords = cp.asarray(coords) + self.weights = cp.asarray(weights) + if with_non0tab: + raise NotImplementedError + self.non0tab = None + logger.info(self, 'tot grids = %d', len(self.weights)) + logger.info(self, 'cell vol = %.9g sum(weights) = %.9g', + cell.vol, self.weights.sum()) + return self + + to_gpu = utils.to_gpu + to_cpu = utils.to_cpu + +AtomicGrids = BeckeGrids diff --git a/gpu4pyscf/pbc/dft/krks.py b/gpu4pyscf/pbc/dft/krks.py index c5fefb7f..c4fa0245 100644 --- a/gpu4pyscf/pbc/dft/krks.py +++ b/gpu4pyscf/pbc/dft/krks.py @@ -47,7 +47,7 @@ def get_veff(ks, cell=None, dm=None, dm_last=0, vhf_last=0, hermi=1, n, exc, vxc = multigrid.nr_rks(ks.with_df, ks.xc, dm, hermi, kpts, kpts_band, with_j=True, return_j=False) - log.info('nelec by numeric integration = %s', n) + log.debug('nelec by numeric integration = %s', n) t0 = log.timer('vxc', *t0) return vxc @@ -61,7 +61,7 @@ def get_veff(ks, cell=None, dm=None, dm_last=0, vhf_last=0, hermi=1, max_memory = ks.max_memory - lib.current_memory()[0] n, exc, vxc = ni.nr_rks(cell, ks.grids, ks.xc, dm, 0, hermi, kpts, kpts_band, max_memory=max_memory) - log.info('nelec by numeric integration = %s', n) + log.debug('nelec by numeric integration = %s', n) if ks.do_nlc(): if ni.libxc.is_nlc(ks.xc): xc = ks.xc @@ -72,7 +72,7 @@ def get_veff(ks, cell=None, dm=None, dm_last=0, vhf_last=0, hermi=1, max_memory=max_memory) exc += enlc vxc += vnlc - log.info('nelec with nlc grids = %s', n) + log.debug('nelec 
with nlc grids = %s', n) t0 = log.timer('vxc', *t0) nkpts = len(kpts) @@ -140,6 +140,14 @@ def energy_elec(mf, dm_kpts=None, h1e_kpts=None, vhf=None): ecoul.imag) return tot_e.real, ecoul.real + exc.real +def get_rho(mf, dm=None, grids=None, kpts=None): + if dm is None: dm = mf.make_rdm1() + if grids is None: grids = mf.grids + if kpts is None: kpts = mf.kpts + assert dm.ndim == 3 + assert kpts.ndim == 2 + return mf._numint.get_rho(mf.cell, dm, grids, kpts) + class KRKS(rks.KohnShamDFT, khf.KRHF): '''RKS class adapted for PBCs with k-point sampling. ''' @@ -151,7 +159,7 @@ def __init__(self, cell, kpts=np.zeros((1,3)), xc='LDA,VWN', exxdiv='ewald'): dump_flags = krks_cpu.KRKS.dump_flags get_veff = get_veff energy_elec = energy_elec - get_rho = return_cupy_array(krks_cpu.get_rho) + get_rho = get_rho nuc_grad_method = NotImplemented to_hf = NotImplemented diff --git a/gpu4pyscf/pbc/dft/kuks.py b/gpu4pyscf/pbc/dft/kuks.py index 363bfefd..fad45cbd 100644 --- a/gpu4pyscf/pbc/dft/kuks.py +++ b/gpu4pyscf/pbc/dft/kuks.py @@ -28,7 +28,7 @@ from gpu4pyscf.lib import logger, utils from gpu4pyscf.lib.cupy_helper import return_cupy_array, tag_array from gpu4pyscf.pbc.scf import khf, kuhf -from gpu4pyscf.pbc.dft import rks +from gpu4pyscf.pbc.dft import rks, krks def get_veff(ks, cell=None, dm=None, dm_last=0, vhf_last=0, hermi=1, kpts=None, kpts_band=None): @@ -47,7 +47,7 @@ def get_veff(ks, cell=None, dm=None, dm_last=0, vhf_last=0, hermi=1, n, exc, vxc = multigrid.nr_uks(ks.with_df, ks.xc, dm, hermi, kpts, kpts_band, with_j=True, return_j=False) - log.info('nelec by numeric integration = %s', n) + log.debug('nelec by numeric integration = %s', n) t0 = log.timer('vxc', *t0) return vxc @@ -71,7 +71,7 @@ def get_veff(ks, cell=None, dm=None, dm_last=0, vhf_last=0, hermi=1, 0, hermi, kpts, max_memory=max_memory) exc += enlc vxc += vnlc - log.info('nelec by numeric integration = %s', n) + log.debug('nelec by numeric integration = %s', n) t0 = log.timer('vxc', *t0) nkpts = len(kpts) @@ -150,7 +150,10 @@ def __init__(self, cell, kpts=np.zeros((1,3)), xc='LDA,VWN', exxdiv='ewald'): get_veff = get_veff energy_elec = energy_elec - get_rho = return_cupy_array(kuks_cpu.get_rho) + + def get_rho(self, dm=None, grids=None, kpts=None): + if dm is None: dm = self.make_rdm1() + return krks.get_rho(self, dm[0]+dm[1], grids, kpts) nuc_grad_method = NotImplemented to_hf = NotImplemented diff --git a/gpu4pyscf/pbc/dft/numint.py b/gpu4pyscf/pbc/dft/numint.py index ea9e83cd..f064f664 100644 --- a/gpu4pyscf/pbc/dft/numint.py +++ b/gpu4pyscf/pbc/dft/numint.py @@ -90,17 +90,17 @@ def eval_rho(cell, ao, dm, non0tab=None, xctype='LDA', hermi=0, with_lapl=False, pyscf.dft.numint.eval_rho ''' - if np.iscomplexobj(ao) or np.iscomplexobj(dm): + if cp.iscomplexobj(ao) or cp.iscomplexobj(dm): ngrids, nao = ao.shape[-2:] ao_loc = cell.ao_loc_nr() assert nao == ao_loc[-1] dm = cp.asarray(dm, dtype=np.complex128) + ao = cp.asarray(ao, dtype=np.complex128) if hermi == 1: def dot_bra(bra, aodm): - rho = contract('pi,pi->p', bra.real, aodm.real) - rho += contract('pi,pi->p', bra.imag, aodm.imag) - return rho + rho = contract('pi,pi->p', bra.conj(), aodm).real + return cp.asarray(rho, order='C') dtype = np.float64 else: def dot_bra(bra, aodm): @@ -147,6 +147,7 @@ def dot_bra(bra, aodm): ngrids, nao = ao.shape[-2:] ao_loc = cell.ao_loc_nr() assert nao == ao_loc[-1] + assert ao.dtype == dm.dtype def dot_bra(bra, aodm): return contract('pi,pi->p', bra, aodm) @@ -378,13 +379,12 @@ def _tau_dot(bra, ket, wv): return mat -#TODO: put NumInt and 
KNumInt into one class KNumInt(lib.StreamObject, numint.LibXCMixin): eval_ao = staticmethod(eval_ao_kpts) make_mask = NotImplemented - def get_rho(self, cell, dm, grids, kpts=np.zeros((1,3)), max_memory=2000): + def get_rho(self, cell, dm, grids, kpts=np.zeros((1,3))): '''Density in real space ''' kpts = kpts.reshape(-1, 3) @@ -445,7 +445,7 @@ def block_loop(self, cell, grids, deriv=0, kpts=None): for ip0, ip1 in lib.prange(0, ngrids, blksize): coords = grids_coords[ip0:ip1] weight = grids_weights[ip0:ip1] - ao_ks = eval_ao_kpts(cell, coords, kpts, deriv=deriv) + ao_ks = self.eval_ao(cell, coords, kpts, deriv=deriv) yield ao_ks, weight, coords ao_ks = None diff --git a/gpu4pyscf/pbc/dft/rks.py b/gpu4pyscf/pbc/dft/rks.py index c6c93b24..fbc35f51 100644 --- a/gpu4pyscf/pbc/dft/rks.py +++ b/gpu4pyscf/pbc/dft/rks.py @@ -73,7 +73,7 @@ def get_veff(ks, cell=None, dm=None, dm_last=0, vhf_last=0, hermi=1, else: n, exc, vxc = ni.nr_rks(cell, ks.grids, ks.xc, dm, 0, hermi, kpt, kpts_band) - log.info('nelec by numeric integration = %s', n) + log.debug('nelec by numeric integration = %s', n) if ks.do_nlc(): if ni.libxc.is_nlc(ks.xc): xc = ks.xc @@ -83,7 +83,7 @@ def get_veff(ks, cell=None, dm=None, dm_last=0, vhf_last=0, hermi=1, n, enlc, vnlc = ni.nr_nlc_vxc(cell, ks.nlcgrids, xc, dm, 0, hermi, kpt) exc += enlc vxc += vnlc - log.info('nelec with nlc grids = %s', n) + log.debug('nelec with nlc grids = %s', n) t0 = log.timer('vxc', *t0) if not hybrid: @@ -122,8 +122,18 @@ def get_veff(ks, cell=None, dm=None, dm_last=0, vhf_last=0, hermi=1, vxc = tag_array(vxc, ecoul=ecoul, exc=exc, vj=None, vk=None) return vxc -def prune_small_rho_grids_(ks, cell, dm, grids, kpts): - raise NotImplementedError +NELEC_ERROR_TOL = getattr(__config__, 'pbc_dft_rks_prune_error_tol', 0.02) +def prune_small_rho_grids_(mf, cell, dm, grids, kpts): + rho = mf.get_rho(dm, grids, kpts) + n = rho.dot(grids.weights) + if abs(n-cell.nelectron) < NELEC_ERROR_TOL*n: + rho *= grids.weights + size0 = grids.weights.size + idx = abs(rho) > mf.small_rho_cutoff / size0 + grids.coords = grids.coords [idx] + grids.weights = grids.weights[idx] + logger.debug(mf, 'Drop grids %d', size0 - grids.weights.size) + return grids class KohnShamDFT(mol_ks.KohnShamDFT): '''PBC-KS''' @@ -148,9 +158,21 @@ def __init__(self, xc='LDA,VWN'): dump_flags = rks_cpu.KohnShamDFT.dump_flags get_veff = NotImplemented - get_rho = return_cupy_array(rks_cpu.get_rho) + get_rho = NotImplemented + + def density_fit(self, auxbasis=None, with_df=None): + from gpu4pyscf.pbc.df.df_jk import density_fit + cell = self.cell + mf = density_fit(self, auxbasis, with_df) + mf.with_df._j_only = not self._numint.libxc.is_hybrid_xc(self.xc) + mf.grids = gen_grid.BeckeGrids(cell) + mf.grids.level = getattr( + __config__, 'dft_rks_RKS_grids_level', mf.grids.level) + mf.nlcgrids = gen_grid.BeckeGrids(cell) + mf.nlcgrids.level = getattr( + __config__, 'dft_rks_RKS_nlcgrids_level', mf.nlcgrids.level) + return mf - density_fit = NotImplemented rs_density_fit = NotImplemented jk_method = NotImplemented @@ -164,7 +186,7 @@ def initialize_grids(self, cell, dm, kpts, ground_state=True): '''Initialize self.grids the first time call get_veff''' if self.grids.coords is None: t0 = (logger.process_clock(), logger.perf_counter()) - self.grids.build(with_non0tab=True) + self.grids.build() if (isinstance(self.grids, gen_grid.BeckeGrids) and self.small_rho_cutoff > 1e-20 and ground_state): self.grids = prune_small_rho_grids_( @@ -173,7 +195,7 @@ def initialize_grids(self, cell, dm, kpts, 
ground_state=True): is_nlc = self.do_nlc() if is_nlc and self.nlcgrids.coords is None: t0 = (logger.process_clock(), logger.perf_counter()) - self.nlcgrids.build(with_non0tab=True) + self.nlcgrids.build() if (isinstance(self.grids, gen_grid.BeckeGrids) and self.small_rho_cutoff > 1e-20 and ground_state): self.nlcgrids = prune_small_rho_grids_( @@ -185,6 +207,14 @@ def initialize_grids(self, cell, dm, kpts, ground_state=True): pbchf.KohnShamDFT = KohnShamDFT +def get_rho(mf, dm=None, grids=None, kpt=None): + if dm is None: dm = mf.make_rdm1() + if grids is None: grids = mf.grids + if kpt is None: kpt = mf.kpt + assert dm.ndim == 2 + assert kpt.ndim == 1 + return mf._numint.get_rho(mf.cell, dm[None], grids, kpt[None]) + class RKS(KohnShamDFT, pbchf.RHF): '''RKS class adapted for PBCs. @@ -203,6 +233,7 @@ def dump_flags(self, verbose=None): get_veff = get_veff energy_elec = mol_ks.energy_elec + get_rho = get_rho to_gpu = utils.to_gpu device = utils.device diff --git a/gpu4pyscf/pbc/dft/tests/test_pbc_rks.py b/gpu4pyscf/pbc/dft/tests/test_pbc_rks.py index cc60be8f..17e1451f 100644 --- a/gpu4pyscf/pbc/dft/tests/test_pbc_rks.py +++ b/gpu4pyscf/pbc/dft/tests/test_pbc_rks.py @@ -154,6 +154,31 @@ def test_kpts_rsh_fft(self): mf_ref = kmf.to_cpu().run() self.assertAlmostEqual(kmf.e_tot, mf_ref.e_tot, 7) + def test_kpts_gga_gdf(self): + from gpu4pyscf.pbc.df.df import GDF + L = 4. + cell = pbcgto.Cell() + cell.a = np.eye(3)*L + cell.atom =[['H' , ( L/2+0., L/2+0. , L/2+1.)], + ['H' , ( L/2+1., L/2+0. , L/2+1.)]] + cell.basis = [[0, (4.0, 1.0)], [0, (1.0, 1.0)]] + cell.build() + + mf = cell.RKS(xc='pbe0').to_gpu().density_fit().run() + self.assertTrue(isinstance(mf.with_df, GDF)) + self.assertAlmostEqual(mf.e_tot, -0.44834992009430463, 7) + mf_ref = mf.to_cpu().run() + self.assertAlmostEqual(mf.e_tot, mf_ref.e_tot, 7) + + nk = [2, 1, 1] + kpts = cell.make_kpts(nk) + kmf = pbcdft.KRKS(cell, xc='pbe0', kpts=kpts).density_fit().run() + self.assertTrue(isinstance(kmf.with_df, GDF)) + self.assertAlmostEqual(kmf.e_tot, -0.44429306, 7) + mf_ref = kmf.to_cpu() + mf_ref.run() + self.assertAlmostEqual(kmf.e_tot, mf_ref.e_tot, 7) + if __name__ == '__main__': print("Full Tests for pbc.dft.rks") unittest.main() diff --git a/gpu4pyscf/pbc/dft/tests/test_pbc_uks.py b/gpu4pyscf/pbc/dft/tests/test_pbc_uks.py index 5848038c..2b73dfb2 100644 --- a/gpu4pyscf/pbc/dft/tests/test_pbc_uks.py +++ b/gpu4pyscf/pbc/dft/tests/test_pbc_uks.py @@ -68,6 +68,7 @@ def test_gga_fft(self): def test_rsh_fft(self): mf = pbcdft.UKS(cell, xc='camb3lyp').run(conv_tol=1e-9) + self.assertAlmostEqual(mf.e_tot, -4.350842690091271, 7) mf_ref = mf.to_cpu().run() self.assertAlmostEqual(mf.e_tot, mf_ref.e_tot, 7) @@ -153,6 +154,32 @@ def test_kpts_rsh_fft(self): mf_ref = kmf.to_cpu().run() self.assertAlmostEqual(kmf.e_tot, mf_ref.e_tot, 7) + def test_kpts_gga_gdf(self): + from gpu4pyscf.pbc.df.df import GDF + L = 4. + cell = pbcgto.Cell() + cell.a = np.eye(3)*L + cell.atom =[['H' , ( L/2+0., L/2+0. , L/2+1.)], + ['H' , ( L/2+1., L/2+0. 
, L/2+1.)]] + cell.basis = [[0, (4.0, 1.0)], [0, (1.0, 1.0)]] + cell.spin = 2 + cell.build() + + mf = cell.UKS(xc='pbe0').to_gpu().density_fit().run() + self.assertTrue(isinstance(mf.with_df, GDF)) + self.assertAlmostEqual(mf.e_tot, -0.10443638, 7) + mf_ref = mf.to_cpu().run() + self.assertAlmostEqual(mf.e_tot, mf_ref.e_tot, 7) + + nk = [2, 1, 1] + kpts = cell.make_kpts(nk) + kmf = pbcdft.KUKS(cell, xc='pbe0', kpts=kpts).density_fit().run() + self.assertTrue(isinstance(kmf.with_df, GDF)) + self.assertAlmostEqual(kmf.e_tot, -0.19581151, 7) + mf_ref = kmf.to_cpu() + mf_ref.run() + self.assertAlmostEqual(kmf.e_tot, mf_ref.e_tot, 7) + if __name__ == '__main__': print("Full Tests for pbc.dft.uks") unittest.main() diff --git a/gpu4pyscf/pbc/dft/uks.py b/gpu4pyscf/pbc/dft/uks.py index 8ce4466e..1cd2f976 100644 --- a/gpu4pyscf/pbc/dft/uks.py +++ b/gpu4pyscf/pbc/dft/uks.py @@ -52,7 +52,7 @@ def get_veff(ks, cell=None, dm=None, dm_last=0, vhf_last=0, hermi=1, n, exc, vxc = multigrid.nr_uks(ks.with_df, ks.xc, dm, hermi, kpt.reshape(1,3), kpts_band, with_j=True, return_j=False) - log.info('nelec by numeric integration = %s', n) + log.debug('nelec by numeric integration = %s', n) t0 = log.timer('vxc', *t0) return vxc @@ -79,7 +79,7 @@ def get_veff(ks, cell=None, dm=None, dm_last=0, vhf_last=0, hermi=1, 0, hermi, kpt, max_memory=max_memory) exc += enlc vxc += vnlc - log.info('nelec by numeric integration = %s', n) + log.debug('nelec by numeric integration = %s', n) t0 = log.timer('vxc', *t0) if not hybrid: @@ -134,10 +134,13 @@ def __init__(self, cell, kpt=np.zeros(3), xc='LDA,VWN', exxdiv='ewald'): dump_flags = uks_cpu.UKS.dump_flags - get_rho = return_cupy_array(uks_cpu.get_rho) get_veff = get_veff energy_elec = mol_uks.energy_elec + def get_rho(self, dm=None, grids=None, kpt=None): + if dm is None: dm = self.make_rdm1() + return rks.get_rho(self, dm[0]+dm[1], grids, kpt) + nuc_grad_method = NotImplemented to_hf = NotImplemented diff --git a/gpu4pyscf/pbc/gto/__init__.py b/gpu4pyscf/pbc/gto/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/gpu4pyscf/pbc/gto/cell.py b/gpu4pyscf/pbc/gto/cell.py new file mode 100644 index 00000000..14df0ff9 --- /dev/null +++ b/gpu4pyscf/pbc/gto/cell.py @@ -0,0 +1,49 @@ +# Copyright 2025 The PySCF Developers. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import numpy as np + +# This function is only available in pyscf-2.8 or later +def extract_pgto_params(cell, op='diffused'): + '''A helper function to extract exponents and contraction coefficients for + estimate_xxx function + ''' + es = [] + cs = [] + if op == 'diffused': + precision = cell.precision + for i in range(cell.nbas): + e = cell.bas_exp(i) + c = abs(cell._libcint_ctr_coeff(i)).max(axis=1) + l = cell.bas_angular(i) + # A quick estimation for the radius that each primitive GTO vanishes + r2 = np.log(c**2 / precision * 10**l) / e + idx = r2.argmax() + es.append(e[idx]) + cs.append(c[idx].max()) + elif op == 'compact': + precision = cell.precision + for i in range(cell.nbas): + e = cell.bas_exp(i) + c = abs(cell._libcint_ctr_coeff(i)).max(axis=1) + l = cell.bas_angular(i) + # A quick estimation for the resolution of planewaves that each + # primitive GTO requires + ke = np.log(c**2 / precision * 50**l) * e + idx = ke.argmax() + es.append(e[idx]) + cs.append(c[idx].max()) + else: + raise RuntimeError(f'Unsupported operation {op}') + return np.array(es), np.array(cs) diff --git a/gpu4pyscf/pbc/lib/kpts_helper.py b/gpu4pyscf/pbc/lib/kpts_helper.py index 9b85184b..6a3d0334 100644 --- a/gpu4pyscf/pbc/lib/kpts_helper.py +++ b/gpu4pyscf/pbc/lib/kpts_helper.py @@ -13,6 +13,7 @@ # limitations under the License. import numpy as np +from pyscf import lib def conj_images_in_bvk_cell(kmesh, return_pair=False): ''' @@ -42,3 +43,40 @@ def conj_images_in_bvk_cell(kmesh, return_pair=False): mask = Ls_idx <= Ls_idx_conj return np.column_stack((Ls_idx[mask], Ls_idx_conj[mask])) +def kk_adapted_iter(kmesh): + '''Generates kpt which is adapted to the kpt_p in (ij|p) + + This function provides the similar functionality as the + pyscf.pbc.lib.kpts_helper.kk_adapted_iter . 
+ ''' + kmesh = np.asarray(kmesh) + nkpts = np.prod(kmesh) + nx, ny, nz = kmesh + kx = np.fft.fftfreq(nx, 1./nx).astype(int) + ky = np.fft.fftfreq(ny, 1./ny).astype(int) + kz = np.fft.fftfreq(nz, 1./nz).astype(int) + + kxyz = lib.cartesian_prod([kx, ky, kz]) + dk = (kxyz[None,:,:] - kxyz[:,None,:]).reshape(-1, 3) + + dk %= kmesh + wrap_around_mask = dk >= (kmesh+1)//2 + dk[wrap_around_mask[:,0],0] -= nx + dk[wrap_around_mask[:,1],1] -= ny + dk[wrap_around_mask[:,2],2] -= nz + uniq_ks, uniq_index, uniq_inverse = np.unique( + dk, axis=0, return_index=True, return_inverse=True) + + ks_conj = -uniq_ks + strides = np.array((ny*nz, nz, 1)) + ks_idx = (uniq_ks % kmesh).dot(strides) + ks_idx_conj = (ks_conj % kmesh).dot(strides) + + independent_idx = np.sort(np.nonzero(ks_idx <= ks_idx_conj)[0]) + for x in independent_idx: + kp = ks_idx[x] + kp_conj = ks_idx_conj[x] + kpt_ij_idx = np.where(uniq_inverse == x)[0] + kpti_idx = kpt_ij_idx // nkpts + kptj_idx = kpt_ij_idx % nkpts + yield kp, kp_conj, kpti_idx, kptj_idx diff --git a/gpu4pyscf/pbc/scf/hf.py b/gpu4pyscf/pbc/scf/hf.py index 740f76a5..3aec403d 100644 --- a/gpu4pyscf/pbc/scf/hf.py +++ b/gpu4pyscf/pbc/scf/hf.py @@ -240,11 +240,16 @@ class RHF(SCF): to_gpu = utils.to_gpu device = utils.device + def density_fit(self, auxbasis=None, with_df=None): + from gpu4pyscf.pbc.df.df_jk import density_fit + return density_fit(self, auxbasis, with_df) + def to_cpu(self): mf = hf_cpu.RHF(self.cell) utils.to_cpu(self, out=mf) return mf + def _format_jks(vj, dm, kpts_band): if kpts_band is None: vj = vj.reshape(dm.shape) diff --git a/gpu4pyscf/pbc/scf/khf.py b/gpu4pyscf/pbc/scf/khf.py index d4c7855e..4ec72d98 100644 --- a/gpu4pyscf/pbc/scf/khf.py +++ b/gpu4pyscf/pbc/scf/khf.py @@ -399,6 +399,8 @@ def get_init_guess(self, cell=None, key='minao', s1e=None): dm_kpts *= (nelectron / ne).reshape(-1,1,1) return dm_kpts + density_fit = pbchf.RHF.density_fit + to_gpu = utils.to_gpu device = utils.device diff --git a/gpu4pyscf/pbc/scf/kuhf.py b/gpu4pyscf/pbc/scf/kuhf.py index 7e82d932..d63396c7 100644 --- a/gpu4pyscf/pbc/scf/kuhf.py +++ b/gpu4pyscf/pbc/scf/kuhf.py @@ -38,8 +38,9 @@ def make_rdm1(mo_coeff_kpts, mo_occ_kpts, **kwargs): Returns: dm_kpts : (2, nkpts, nao, nao) ndarray ''' - assert isinstance(mo_occ_kpts, cp.ndarray) - assert isinstance(mo_coeff_kpts, cp.ndarray) + mo_occ_kpts = cp.asarray(mo_occ_kpts) + mo_coeff_kpts = cp.asarray(mo_coeff_kpts) + assert mo_occ_kpts.dtype == np.float64 c = mo_coeff_kpts * mo_occ_kpts[:,:,None,:] dm = contract('nkpi,nkqi->nkpq', mo_coeff_kpts, c.conj()) return tag_array(dm, mo_coeff=mo_coeff_kpts, mo_occ=mo_occ_kpts) @@ -312,6 +313,8 @@ def get_bands(self, kpts_band, cell=None, dm_kpts=None, kpts=None): to_ks = NotImplemented convert_from_ = NotImplemented + density_fit = khf.KRHF.density_fit + to_gpu = utils.to_gpu device = utils.device diff --git a/gpu4pyscf/pbc/scf/tests/test_pbc_scf_hf.py b/gpu4pyscf/pbc/scf/tests/test_pbc_scf_hf.py index ca0810c5..71ae0ef1 100644 --- a/gpu4pyscf/pbc/scf/tests/test_pbc_scf_hf.py +++ b/gpu4pyscf/pbc/scf/tests/test_pbc_scf_hf.py @@ -132,6 +132,28 @@ def test_krhf_bands(self): e_ref = kmf_cpu.get_bands(kpts_bands)[0] self.assertAlmostEqual(abs(e.get()-e_ref).max(), 0, 7) + def test_density_fit(self): + from gpu4pyscf.pbc.df.df import GDF + L = 4. + cell = pbcgto.Cell() + cell.a = np.eye(3)*L + cell.atom =[['H' , ( L/2+0., L/2+0. , L/2+1.)], + ['H' , ( L/2+1., L/2+0. 
, L/2+1.)]] + cell.basis = [[0, (4.0, 1.0)], [0, (1.0, 1.0)]] + cell.build() + + ref = cell.RHF().density_fit().run() + mf = ref.to_gpu().run(conv_tol=1e-8) + self.assertTrue(isinstance(mf.with_df, GDF)) + self.assertAlmostEqual(ref.e_tot, -0.3740002917376214, 8) + self.assertAlmostEqual(mf.e_tot, ref.e_tot, 8) + + ref = cell.KRHF().density_fit().run() + mf = ref.to_gpu().run(conv_tol=1e-8) + self.assertTrue(isinstance(mf.with_df, GDF)) + self.assertAlmostEqual(ref.e_tot, -0.3740002917376214, 8) + self.assertAlmostEqual(mf.e_tot, ref.e_tot, 8) + if __name__ == '__main__': print("Full Tests for pbc.scf.hf") unittest.main() diff --git a/gpu4pyscf/pbc/scf/tests/test_pbc_scf_uhf.py b/gpu4pyscf/pbc/scf/tests/test_pbc_scf_uhf.py index 2f888bdb..b9665f06 100644 --- a/gpu4pyscf/pbc/scf/tests/test_pbc_scf_uhf.py +++ b/gpu4pyscf/pbc/scf/tests/test_pbc_scf_uhf.py @@ -90,6 +90,28 @@ def test_small_system(self): mf = pscf.KUHF(mol,kpts=[[0., 0., 0.]]).run() self.assertAlmostEqual(mf.e_tot, -2.2719576422665635, 8) + def test_density_fit(self): + from gpu4pyscf.pbc.df.df import GDF + L = 4. + cell = pbcgto.Cell() + cell.a = np.eye(3)*L + cell.atom =[['H' , ( L/2+0., L/2+0. , L/2+1.)], + ['H' , ( L/2+1., L/2+0. , L/2+1.)]] + cell.basis = [[0, (4.0, 1.0)], [0, (1.0, 1.0)]] + cell.spin = 2 + cell.build() + + ref = cell.UHF().density_fit().run() + mf = ref.to_gpu().run(conv_tol=1e-8) + self.assertTrue(isinstance(mf.with_df, GDF)) + self.assertAlmostEqual(ref.e_tot, -0.11995733902879813, 8) + self.assertAlmostEqual(mf.e_tot, ref.e_tot, 8) + + ref = cell.UHF().density_fit().run() + mf = ref.to_gpu().run(conv_tol=1e-8) + self.assertTrue(isinstance(mf.with_df, GDF)) + self.assertAlmostEqual(ref.e_tot, -0.11995733902879813, 8) + self.assertAlmostEqual(mf.e_tot, ref.e_tot, 8) if __name__ == '__main__': print("Tests for PBC UHF and PBC KUHF") diff --git a/gpu4pyscf/pbc/scf/uhf.py b/gpu4pyscf/pbc/scf/uhf.py index 5abe6398..65e02ef2 100644 --- a/gpu4pyscf/pbc/scf/uhf.py +++ b/gpu4pyscf/pbc/scf/uhf.py @@ -124,6 +124,8 @@ def get_init_guess(self, cell=None, key='minao', s1e=None): to_ks = NotImplemented convert_from_ = NotImplemented + density_fit = pbchf.RHF.density_fit + to_gpu = utils.to_gpu device = utils.device diff --git a/gpu4pyscf/pbc/tools/k2gamma.py b/gpu4pyscf/pbc/tools/k2gamma.py index 5e0041cf..2de30399 100644 --- a/gpu4pyscf/pbc/tools/k2gamma.py +++ b/gpu4pyscf/pbc/tools/k2gamma.py @@ -18,20 +18,23 @@ import numpy as np from pyscf.lib import logger -# This version of kpts_to_kmesh will be available in PySCF-2.8 -def kpts_to_kmesh(cell, kpts, precision=None, max_images=10000): - '''Find the minimal k-points mesh to include all input kpts''' +# This version of kpts_to_kmesh may become available in PySCF-2.9 +def kpts_to_kmesh(cell, kpts, precision=None, rcut=None): + '''Search the minimal BvK mesh or Monkhorst-Pack k-point mesh''' + assert kpts.ndim == 2 scaled_kpts = cell.get_scaled_kpts(kpts) logger.debug3(cell, ' scaled_kpts kpts %s', scaled_kpts) - # cell.nimgs are the upper limits for kmesh - kmesh = np.asarray(cell.nimgs) * 2 + 1 + if rcut is None: + kmesh = np.asarray(cell.nimgs) * 2 + 1 + else: + nimgs = cell.get_bounding_sphere(rcut) + kmesh = nimgs * 2 + 1 if precision is None: precision = cell.precision * 1e2 for i in range(3): floats = scaled_kpts[:,i] uniq_floats_idx = np.unique(floats.round(6), return_index=True)[1] uniq_floats = floats[uniq_floats_idx] - # Limit the number of images to 30 in each direction fracs = [Fraction(x).limit_denominator(int(kmesh[i])) for x in uniq_floats] 
denominators = np.unique([x.denominator for x in fracs]) common_denominator = reduce(np.lcm, denominators) @@ -43,14 +46,4 @@ def kpts_to_kmesh(cell, kpts, precision=None, max_images=10000): i, common_denominator, abs(fs - np.rint(fs)).max()) logger.debug3(cell, ' unique kpts %s', uniq_floats) logger.debug3(cell, ' frac kpts %s', fracs) - - assert max_images > 0 - if np.prod(kmesh) > max_images: - kmesh_raw = kmesh.copy() - for i in itertools.cycle(np.argsort(kmesh)[::-1]): - kmesh[i] = int(kmesh[i] * .8) - if np.prod(kmesh) < max_images: - break - logger.warn(cell, 'kmesh (%s) exceeds max_images (%d); reduced to %s', - kmesh_raw, max_images, kmesh) return kmesh diff --git a/gpu4pyscf/pop/esp.py b/gpu4pyscf/pop/esp.py index 8406ac06..e6d41e5f 100644 --- a/gpu4pyscf/pop/esp.py +++ b/gpu4pyscf/pop/esp.py @@ -88,7 +88,7 @@ def vdw_surface(mol, scales=[1.0], density=1.0*radii.BOHR**2, rad=R_VDW): Generate vdw surface of molecules, in Bohr ''' coords = mol.atom_coords(unit='B') - charges = mol.atom_charges() + charges = [gto.charge(sym) for sym in mol.elements] atom_radii = rad[charges] surface_points = [] @@ -196,7 +196,7 @@ def resp_solve(mol, dm, grid_density=1.0*radii.BOHR**2, q[u] = q[v] = q[w] ''' - charges = mol.atom_charges() + charges = np.asarray([gto.charge(sym) for sym in mol.elements]) natm = mol.natm is_restraint = charges > 1 is_restraint[charges == 1] = not hfree diff --git a/gpu4pyscf/properties/polarizability.py b/gpu4pyscf/properties/polarizability.py index 8face371..7949b4f5 100644 --- a/gpu4pyscf/properties/polarizability.py +++ b/gpu4pyscf/properties/polarizability.py @@ -13,11 +13,10 @@ # limitations under the License. import numpy as np -from gpu4pyscf.scf import cphf import cupy +from gpu4pyscf.scf import hf, cphf, _response_functions from gpu4pyscf.lib.cupy_helper import contract - def gen_vind(mf, mo_coeff, mo_occ): """get the induced potential. This is the same as contract the mo1 with the kernel. @@ -59,6 +58,7 @@ def eval_polarizability(mf): Returns: polarizability (numpy.array): polarizability in au """ + assert isinstance(mf, hf.RHF), "Unrestricted mf object is not supported." 
polarizability = np.empty((3, 3)) diff --git a/gpu4pyscf/properties/tests/test_polarizability.py b/gpu4pyscf/properties/tests/test_polarizability.py index e9aebe48..7c02c718 100644 --- a/gpu4pyscf/properties/tests/test_polarizability.py +++ b/gpu4pyscf/properties/tests/test_polarizability.py @@ -17,6 +17,7 @@ import pyscf from pyscf import lib from pyscf.dft import rks as rks_cpu +from pyscf.dft import uks as uks_cpu from gpu4pyscf.dft import rks, uks from gpu4pyscf.properties import polarizability @@ -62,7 +63,7 @@ def run_dft_df_polarizability(xc): polar = polarizability.eval_polarizability(mf) return e_dft, polar -def _vs_cpu(xc): +def _vs_cpu_rks(xc): mf = rks.RKS(mol, xc=xc) mf.grids.level = grids_level e_gpu = mf.kernel() @@ -76,6 +77,20 @@ def _vs_cpu(xc): assert np.abs(e_gpu - e_cpu) < 1e-5 assert np.linalg.norm(polar_cpu - polar_gpu) < 1e-3 +def _vs_cpu_uks(xc): + mf = uks.UKS(mol, xc=xc) + mf.grids.level = grids_level + e_gpu = mf.kernel() + polar_gpu = polarizability.eval_polarizability(mf) + + mf_cpu = uks_cpu.UKS(mol, xc=xc) + mf_cpu.conv_tol = 1e-12 + e_cpu = mf_cpu.kernel() + polar_cpu = polar.rhf.Polarizability(mf_cpu).polarizability() + + assert np.abs(e_gpu - e_cpu) < 1e-5 + assert np.linalg.norm(polar_cpu - polar_gpu) < 1e-3 + class KnownValues(unittest.TestCase): ''' known values are obtained by Q-Chem @@ -140,9 +155,16 @@ def test_rks_b3lyp_df(self): assert np.allclose(polar, qchem_polar) @unittest.skipIf(polar is None, "Skipping test if pyscf.properties is not installed") - def test_cpu(self): - _vs_cpu('b3lyp') + def test_cpu_rks(self): + _vs_cpu_rks('b3lyp') + """ + # UKS is not supported yet + @unittest.skipIf(polar is None, "Skipping test if pyscf.properties is not installed") + def test_cpu_uks(self): + _vs_cpu_uks('b3lyp') + """ + if __name__ == "__main__": print("Full Tests for polarizabillity") unittest.main() diff --git a/gpu4pyscf/scf/hf.py b/gpu4pyscf/scf/hf.py index 3a0497ff..09523d4a 100644 --- a/gpu4pyscf/scf/hf.py +++ b/gpu4pyscf/scf/hf.py @@ -51,15 +51,13 @@ def _get_jk(mf, mol=None, dm=None, hermi=1, with_j=True, with_k=True, vj, vk = get_jk(mol, dm, hermi, vhfopt, with_j, with_k, omega) return vj, vk -def make_rdm1(mf, mo_coeff=None, mo_occ=None, **kwargs): - if mo_occ is None: mo_occ = mf.mo_occ - if mo_coeff is None: mo_coeff = mf.mo_coeff +def make_rdm1(mo_coeff, mo_occ): mo_coeff = cupy.asarray(mo_coeff) mo_occ = cupy.asarray(mo_occ) is_occ = mo_occ > 0 mocc = mo_coeff[:, is_occ] dm = cupy.dot(mocc*mo_occ[is_occ], mocc.conj().T) - occ_coeff = mo_coeff[:, mo_occ>1.0] + occ_coeff = mo_coeff[:, is_occ] return tag_array(dm, occ_coeff=occ_coeff, mo_occ=mo_occ, mo_coeff=mo_coeff) def get_occ(mf, mo_energy=None, mo_coeff=None): @@ -422,7 +420,6 @@ def check_sanity(self): init_guess_by_chkfile = hf_cpu.SCF.init_guess_by_chkfile from_chk = hf_cpu.SCF.from_chk get_init_guess = return_cupy_array(hf_cpu.SCF.get_init_guess) - make_rdm1 = make_rdm1 make_rdm2 = NotImplemented energy_elec = energy_elec energy_tot = energy_tot @@ -461,6 +458,11 @@ def check_sanity(self): mulliken_pop = NotImplemented mulliken_meta = NotImplemented + def make_rdm1(self, mo_coeff=None, mo_occ=None, **kwargs): + if mo_occ is None: mo_occ = self.mo_occ + if mo_coeff is None: mo_coeff = self.mo_coeff + return make_rdm1(mo_coeff, mo_occ) + def dip_moment(self, mol=None, dm=None, unit='Debye', origin=None, verbose=logger.NOTE): if mol is None: mol = self.mol diff --git a/gpu4pyscf/scf/j_engine.py b/gpu4pyscf/scf/j_engine.py index 3d98ae5f..715eef45 100644 --- a/gpu4pyscf/scf/j_engine.py 
+++ b/gpu4pyscf/scf/j_engine.py @@ -26,7 +26,7 @@ from pyscf import __config__ from gpu4pyscf.lib.cupy_helper import load_library, condense, sandwich_dot, transpose_sum from gpu4pyscf.__config__ import props as gpu_specs -from gpu4pyscf.__config__ import _num_devices +from gpu4pyscf.__config__ import num_devices from gpu4pyscf.lib import logger from gpu4pyscf.scf import jk from gpu4pyscf.scf.jk import _make_j_engine_pair_locs, RysIntEnvVars, _scale_sp_ctr_coeff @@ -53,7 +53,7 @@ def get_j(mol, dm, hermi=1, vhfopt=None, omega=None, verbose=None): if vhfopt is None: with mol.with_range_coulomb(omega): groupsize = None - if _num_devices > 1: + if num_devices > 1: groupsize = jk.GROUP_SIZE vhfopt = _VHFOpt(mol).build(group_size=groupsize) if omega is None: diff --git a/gpu4pyscf/scf/jk.py b/gpu4pyscf/scf/jk.py index 0e328204..a0048bf5 100644 --- a/gpu4pyscf/scf/jk.py +++ b/gpu4pyscf/scf/jk.py @@ -26,12 +26,12 @@ from pyscf.gto import ANG_OF, ATOM_OF, NPRIM_OF, NCTR_OF, PTR_COORD, PTR_COEFF from pyscf import lib from pyscf.scf import _vhf -from pyscf import __config__ from gpu4pyscf.lib.cupy_helper import (load_library, condense, sandwich_dot, transpose_sum, reduce_to_device) +from gpu4pyscf.__config__ import _streams, num_devices, shm_size from gpu4pyscf.__config__ import props as gpu_specs -from gpu4pyscf.__config__ import _streams, _num_devices from gpu4pyscf.lib import logger +from gpu4pyscf.lib import multi_gpu from gpu4pyscf.gto.mole import group_basis __all__ = [ @@ -54,34 +54,68 @@ UNROLL_NFMAX = ctypes.c_int.in_dll(libvhf_rys, 'rys_jk_unrolled_max_nf').value UNROLL_J_LMAX = ctypes.c_int.in_dll(libvhf_rys, 'rys_j_unrolled_lmax').value UNROLL_J_MAX_ORDER = ctypes.c_int.in_dll(libvhf_rys, 'rys_j_unrolled_max_order').value +SHM_SIZE = shm_size - 1024 +del shm_size GOUT_WIDTH = 42 -SHM_SIZE = getattr(__config__, 'GPU_SHM_SIZE', - int(gpu_specs['sharedMemPerBlockOptin']//9)*8) THREADS = 256 GROUP_SIZE = 256 -def _jk_task(mol, dms, vhfopt, task_list, hermi=0, - device_id=0, with_j=True, with_k=True, verbose=None): - n_dm = dms.shape[0] - nao, _ = vhfopt.coeff.shape +def get_jk(mol, dm, hermi=0, vhfopt=None, with_j=True, with_k=True, verbose=None): + '''Compute J, K matrices + ''' + assert with_j or with_k + log = logger.new_logger(mol, verbose) + cput0 = log.init_timer() + + if vhfopt is None: + vhfopt = _VHFOpt(mol).build() + + mol = vhfopt.sorted_mol + nao, nao_orig = vhfopt.coeff.shape + + dm = cp.asarray(dm, order='C') + dms = dm.reshape(-1,nao_orig,nao_orig) + #:dms = cp.einsum('pi,nij,qj->npq', vhfopt.coeff, dms, vhfopt.coeff) + dms = sandwich_dot(dms, vhfopt.coeff.T) + dms = cp.asarray(dms, order='C') + + ao_loc = mol.ao_loc + nao = ao_loc[-1] uniq_l_ctr = vhfopt.uniq_l_ctr uniq_l = uniq_l_ctr[:,0] l_ctr_bas_loc = vhfopt.l_ctr_offsets l_symb = [lib.param.ANGULAR[i] for i in uniq_l] - kern = libvhf_rys.RYS_build_jk + n_groups = np.count_nonzero(uniq_l <= LMAX) - timing_counter = Counter() - kern_counts = 0 - with cp.cuda.Device(device_id), _streams[device_id]: + dm_cond = condense('absmax', dms, ao_loc) + if hermi == 0: + # Wrap the triu contribution to tril + dm_cond = dm_cond + dm_cond.T + dm_cond = cp.log(dm_cond + 1e-300).astype(np.float32) + log_max_dm = float(dm_cond.max()) + log_cutoff = math.log(vhfopt.direct_scf_tol) + + tasks = [(i,j,k,l) + for i in range(n_groups) + for j in range(i+1) + for k in range(i+1) + for l in range(k+1)] + schemes = {t: quartets_scheme(mol, uniq_l_ctr[list(t)]) for t in tasks} + + def proc(dms, dm_cond): + device_id = cp.cuda.device.get_device_id() + 
stream = cp.cuda.stream.get_current_stream() log = logger.new_logger(mol, verbose) - cput0 = log.init_timer() - dms = cp.asarray(dms) + t0 = log.init_timer() + dms = cp.asarray(dms) # transfer to current device + dm_cond = cp.asarray(dm_cond) if hermi == 0: # Contract the tril and triu parts separately dms = cp.vstack([dms, dms.transpose(0,2,1)]) n_dm = dms.shape[0] - tile_q_ptr = ctypes.cast(vhfopt.tile_q_cond.data.ptr, ctypes.c_void_p) + tile_q_cond = vhfopt.tile_q_cond + tile_q_ptr = ctypes.cast(tile_q_cond.data.ptr, ctypes.c_void_p) q_ptr = ctypes.cast(vhfopt.q_cond.data.ptr, ctypes.c_void_p) s_ptr = lib.c_null_ptr() if mol.omega < 0: @@ -97,31 +131,34 @@ def _jk_task(mol, dms, vhfopt, task_list, hermi=0, vj = cp.zeros(dms.shape) vj_ptr = ctypes.cast(vj.data.ptr, ctypes.c_void_p) - ao_loc = mol.ao_loc - dm_cond = cp.log(condense('absmax', dms, ao_loc) + 1e-300).astype(np.float32) - log_max_dm = dm_cond.max() - log_cutoff = math.log(vhfopt.direct_scf_tol) - tile_mappings = _make_tril_tile_mappings(l_ctr_bas_loc, vhfopt.tile_q_cond, + tile_mappings = _make_tril_tile_mappings(l_ctr_bas_loc, tile_q_cond, log_cutoff-log_max_dm) workers = gpu_specs['multiProcessorCount'] pool = cp.empty((workers, QUEUE_DEPTH*4), dtype=np.uint16) info = cp.empty(2, dtype=np.uint32) - t1 = log.timer_debug1(f'q_cond and dm_cond on Device {device_id}', *cput0) + t1 = log.timer_debug1(f'q_cond and dm_cond on Device {device_id}', *t0) - for i, j, k, l in task_list: - ij_shls = (l_ctr_bas_loc[i], l_ctr_bas_loc[i+1], - l_ctr_bas_loc[j], l_ctr_bas_loc[j+1]) + init_constant(mol) + timing_counter = Counter() + kern_counts = 0 + kern = libvhf_rys.RYS_build_jk + + while tasks: + try: + task = tasks.pop() + except IndexError: + break + + i, j, k, l = task + shls_slice = l_ctr_bas_loc[[i, i+1, j, j+1, k, k+1, l, l+1]] tile_ij_mapping = tile_mappings[i,j] - llll = f'({l_symb[i]}{l_symb[j]}|{l_symb[k]}{l_symb[l]})' - kl_shls = (l_ctr_bas_loc[k], l_ctr_bas_loc[k+1], - l_ctr_bas_loc[l], l_ctr_bas_loc[l+1]) tile_kl_mapping = tile_mappings[k,l] - scheme = quartets_scheme(mol, uniq_l_ctr[[i, j, k, l]]) + scheme = schemes[task] err = kern( vj_ptr, vk_ptr, ctypes.cast(dms.data.ptr, ctypes.c_void_p), ctypes.c_int(n_dm), ctypes.c_int(nao), vhfopt.rys_envs, (ctypes.c_int*2)(*scheme), - (ctypes.c_int*8)(*ij_shls, *kl_shls), + (ctypes.c_int*8)(*shls_slice), ctypes.c_int(tile_ij_mapping.size), ctypes.c_int(tile_kl_mapping.size), ctypes.cast(tile_ij_mapping.data.ptr, ctypes.c_void_p), @@ -135,12 +172,17 @@ def _jk_task(mol, dms, vhfopt, task_list, hermi=0, mol._atm.ctypes, ctypes.c_int(mol.natm), mol._bas.ctypes, ctypes.c_int(mol.nbas), mol._env.ctypes) if err != 0: + llll = f'({l_symb[i]}{l_symb[j]}|{l_symb[k]}{l_symb[l]})' raise RuntimeError(f'RYS_build_jk kernel for {llll} failed') if log.verbose >= logger.DEBUG1: + llll = f'({l_symb[i]}{l_symb[j]}|{l_symb[k]}{l_symb[l]})' msg = f'processing {llll}, tasks = {info[1].get()} on Device {device_id}' t1, t1p = log.timer_debug1(msg, *t1), t1 timing_counter[llll] += t1[1] - t1p[1] kern_counts += 1 + if num_devices > 1: + stream.synchronize() + if with_j: if hermi == 1: vj *= 2. 
@@ -153,67 +195,16 @@ def _jk_task(mol, dms, vhfopt, task_list, hermi=0, else: vk, vkT = vk[:n_dm//2], vk[n_dm//2:] vk += vkT.transpose(0,2,1) - return vj, vk, kern_counts, timing_counter - -def get_jk(mol, dm, hermi=0, vhfopt=None, with_j=True, with_k=True, verbose=None): - '''Compute J, K matrices - ''' - log = logger.new_logger(mol, verbose) - cput0 = log.init_timer() - - if vhfopt is None: - vhfopt = _VHFOpt(mol).build() - - mol = vhfopt.sorted_mol - nao, nao_orig = vhfopt.coeff.shape - - dm = cp.asarray(dm, order='C') - dms = dm.reshape(-1,nao_orig,nao_orig) - #:dms = cp.einsum('pi,nij,qj->npq', vhfopt.coeff, dms, vhfopt.coeff) - dms = sandwich_dot(dms, vhfopt.coeff.T) - dms = cp.asarray(dms, order='C') - - n_dm = dms.shape[0] - - assert with_j or with_k - - init_constant(mol) - - uniq_l_ctr = vhfopt.uniq_l_ctr - uniq_l = uniq_l_ctr[:,0] - l_symb = [lib.param.ANGULAR[i] for i in uniq_l] - n_groups = np.count_nonzero(uniq_l <= LMAX) - - tasks = [] - for i in range(n_groups): - for j in range(i+1): - for k in range(i+1): - for l in range(k+1): - tasks.append((i,j,k,l)) - tasks = np.array(tasks) - task_list = [] - for device_id in range(_num_devices): - task_list.append(tasks[device_id::_num_devices]) - - cp.cuda.get_current_stream().synchronize() - futures = [] - with ThreadPoolExecutor(max_workers=_num_devices) as executor: - for device_id in range(_num_devices): - future = executor.submit( - _jk_task, - mol, dms, vhfopt, task_list[device_id], hermi=hermi, - with_j=with_j, with_k=with_k, verbose=verbose, - device_id=device_id) - futures.append(future) + return vj, vk, kern_counts, timing_counter + results = multi_gpu.run(proc, args=(dms, dm_cond), non_blocking=True) kern_counts = 0 timing_collection = Counter() vj_dist = [] vk_dist = [] - for future in futures: - vj, vk, counts, counter = future.result() + for vj, vk, counts, t_counter in results: kern_counts += counts - timing_collection += counter + timing_collection += t_counter vj_dist.append(vj) vk_dist.append(vk) @@ -222,17 +213,14 @@ def get_jk(mol, dm, hermi=0, vhfopt=None, with_j=True, with_k=True, verbose=None for llll, t in timing_collection.items(): log.debug1('%s wall time %.2f', llll, t) - for s in _streams: - s.synchronize() - cp.cuda.get_current_stream().synchronize() vj = vk = None if with_k: - vk = reduce_to_device(vk_dist, inplace=True) + vk = multi_gpu.array_reduce(vk_dist, inplace=True) #:vk = cp.einsum('pi,npq,qj->nij', vhfopt.coeff, vk, vhfopt.coeff) vk = sandwich_dot(vk, vhfopt.coeff) - + if with_j: - vj = reduce_to_device(vj_dist, inplace=True) + vj = multi_gpu.array_reduce(vj_dist, inplace=True) vj = transpose_sum(vj) #:vj = cp.einsum('pi,npq,qj->nij', vhfopt.coeff, vj, vhfopt.coeff) vj = sandwich_dot(vj, vhfopt.coeff) @@ -251,10 +239,7 @@ def get_jk(mol, dm, hermi=0, vhfopt=None, with_j=True, with_k=True, verbose=None else: scripts.append('jk->s1il') shls_excludes = [0, h_shls[0]] * 4 - if hermi == 1: - dms = dms.get() - else: - dms = dms[:n_dm//2].get() + dms = dms.get() vs_h = _vhf.direct_mapdm('int2e_cart', 's8', scripts, dms, 1, mol._atm, mol._bas, mol._env, shls_excludes=shls_excludes) @@ -310,121 +295,148 @@ def get_j(mol, dm, hermi=0, vhfopt=None, verbose=None): ao_loc = mol.ao_loc dm_cond = cp.log(condense('absmax', dms, ao_loc) + 1e-300).astype(np.float32) - log_max_dm = dm_cond.max() + log_max_dm = float(dm_cond.max()) log_cutoff = math.log(vhfopt.direct_scf_tol) + uniq_l_ctr = vhfopt.uniq_l_ctr + uniq_l = uniq_l_ctr[:,0] + l_ctr_bas_loc = vhfopt.l_ctr_offsets + l_symb = [lib.param.ANGULAR[i] for i 
in uniq_l] + n_groups = np.count_nonzero(uniq_l <= LMAX) + ntiles = mol.nbas // TILE + dms = dms.get() pair_loc = _make_j_engine_pair_locs(mol) dm_xyz = np.empty(pair_loc[-1]) libvhf_rys.transform_cart_to_xyz( dm_xyz.ctypes, dms.ctypes, ao_loc.ctypes, pair_loc.ctypes, mol._bas.ctypes, ctypes.c_int(mol.nbas), mol._env.ctypes) - dm_xyz = cp.asarray(dm_xyz) - vj_xyz = cp.zeros_like(dm_xyz) - - pair_loc_on_gpu = cp.asarray(pair_loc) - rys_envs = RysIntEnvVars( - mol.natm, mol.nbas, - vhfopt.rys_envs.atm, vhfopt.rys_envs.bas, vhfopt.rys_envs.env, - pair_loc_on_gpu.data.ptr, - ) - err = libvhf_rys.RYS_init_rysj_constant(ctypes.c_int(SHM_SIZE)) - if err != 0: - raise RuntimeError('CUDA kernel initialization') + tasks = [(i,j,k,l) + for i in range(n_groups) + for j in range(i+1) + for k in range(i+1) + for l in range(k+1)] + schemes = {t: _j_engine_quartets_scheme(mol, uniq_l_ctr[list(t)]) for t in tasks} - uniq_l_ctr = vhfopt.uniq_l_ctr - uniq_l = uniq_l_ctr[:,0] - l_ctr_bas_loc = vhfopt.l_ctr_offsets - l_symb = [lib.param.ANGULAR[i] for i in uniq_l] - n_groups = np.count_nonzero(uniq_l <= LMAX) - ntiles = mol.nbas // TILE - tile_mappings = {} - workers = gpu_specs['multiProcessorCount'] - pool = cp.empty((workers, QUEUE_DEPTH*4), dtype=np.uint16) - info = cp.empty(2, dtype=np.uint32) + def proc(dm_xyz, dm_cond): + device_id = cp.cuda.device.get_device_id() + stream = cp.cuda.stream.get_current_stream() + log = logger.new_logger(mol, verbose) + t0 = log.init_timer() + dm_xyz = cp.asarray(dm_xyz) # transfer to current device + dm_cond = cp.asarray(dm_cond) + vj_xyz = cp.zeros_like(dm_xyz) + pair_loc_on_gpu = cp.asarray(pair_loc) + _atm, _bas, _env, _ = vhfopt.rys_envs._env_ref_holder + rys_envs = RysIntEnvVars( + mol.natm, mol.nbas, + _atm.data.ptr, _bas.data.ptr, _env.data.ptr, + pair_loc_on_gpu.data.ptr, + ) + tile_q_cond = vhfopt.tile_q_cond + q_cond = vhfopt.q_cond + + err = libvhf_rys.RYS_init_rysj_constant(ctypes.c_int(SHM_SIZE)) + if err != 0: + raise RuntimeError('CUDA kernel initialization') + + tile_mappings = {} + workers = gpu_specs['multiProcessorCount'] + pool = cp.empty((workers, QUEUE_DEPTH*4), dtype=np.uint16) + info = cp.empty(2, dtype=np.uint32) - for i in range(n_groups): - for j in range(i+1): - ish0, ish1 = l_ctr_bas_loc[i], l_ctr_bas_loc[i+1] - jsh0, jsh1 = l_ctr_bas_loc[j], l_ctr_bas_loc[j+1] - ij_shls = (ish0, ish1, jsh0, jsh1) - i0 = ish0 // TILE - i1 = ish1 // TILE - j0 = jsh0 // TILE - j1 = jsh1 // TILE - sub_tile_q = vhfopt.tile_q_cond[i0:i1,j0:j1] - mask = sub_tile_q > log_cutoff - log_max_dm - if i == j: - mask = cp.tril(mask) - t_ij = (cp.arange(i0, i1, dtype=np.int32)[:,None] * ntiles + - cp.arange(j0, j1, dtype=np.int32)) - idx = cp.argsort(sub_tile_q[mask])[::-1] - tile_mappings[i,j] = t_ij[mask][idx] - t1 = t2 = log.timer_debug1('q_cond and dm_cond', *cput0) + for i in range(n_groups): + for j in range(i+1): + ish0, ish1 = l_ctr_bas_loc[i], l_ctr_bas_loc[i+1] + jsh0, jsh1 = l_ctr_bas_loc[j], l_ctr_bas_loc[j+1] + i0 = ish0 // TILE + i1 = ish1 // TILE + j0 = jsh0 // TILE + j1 = jsh1 // TILE + sub_tile_q = tile_q_cond[i0:i1,j0:j1] + mask = sub_tile_q > log_cutoff - log_max_dm + if i == j: + mask = cp.tril(mask) + t_ij = (cp.arange(i0, i1, dtype=np.int32)[:,None] * ntiles + + cp.arange(j0, j1, dtype=np.int32)) + idx = cp.argsort(sub_tile_q[mask])[::-1] + tile_mappings[i,j] = t_ij[mask][idx] + t1 = log.timer_debug1(f'q_cond and dm_cond on Device {device_id}', *t0) + + timing_collection = {} + kern_counts = 0 + kern = libvhf_rys.RYS_build_j + + while tasks: + try: + 
task = tasks.pop() + except IndexError: + break + + i, j, k, l = task + shls_slice = l_ctr_bas_loc[[i, i+1, j, j+1, k, k+1, l, l+1]] + tile_ij_mapping = tile_mappings[i,j] + tile_kl_mapping = tile_mappings[k,l] + scheme = schemes[task] + err = kern( + ctypes.cast(vj_xyz.data.ptr, ctypes.c_void_p), + ctypes.cast(dm_xyz.data.ptr, ctypes.c_void_p), + ctypes.c_int(n_dm), ctypes.c_int(nao), + rys_envs, (ctypes.c_int*3)(*scheme), + (ctypes.c_int*8)(*shls_slice), + ctypes.c_int(tile_ij_mapping.size), + ctypes.c_int(tile_kl_mapping.size), + ctypes.cast(tile_ij_mapping.data.ptr, ctypes.c_void_p), + ctypes.cast(tile_kl_mapping.data.ptr, ctypes.c_void_p), + ctypes.cast(tile_q_cond.data.ptr, ctypes.c_void_p), + ctypes.cast(q_cond.data.ptr, ctypes.c_void_p), + lib.c_null_ptr(), + ctypes.cast(dm_cond.data.ptr, ctypes.c_void_p), + ctypes.c_float(log_cutoff), + ctypes.cast(pool.data.ptr, ctypes.c_void_p), + ctypes.cast(info.data.ptr, ctypes.c_void_p), + ctypes.c_int(workers), + mol._atm.ctypes, ctypes.c_int(mol.natm), + mol._bas.ctypes, ctypes.c_int(mol.nbas), mol._env.ctypes) + if err != 0: + llll = f'({l_symb[i]}{l_symb[j]}|{l_symb[k]}{l_symb[l]})' + raise RuntimeError(f'RYS_build_jk kernel for {llll} failed') + if log.verbose >= logger.DEBUG1: + llll = f'({l_symb[i]}{l_symb[j]}|{l_symb[k]}{l_symb[l]})' + t1, t1p = log.timer_debug1(f'processing {llll}, tasks = {info[1]}', *t1), t1 + if llll not in timing_collection: + timing_collection[llll] = 0 + timing_collection[llll] += t1[1] - t1p[1] + kern_counts += 1 + if num_devices > 1: + stream.synchronize() + return vj_xyz, kern_counts, timing_collection - timing_collection = {} + results = multi_gpu.run(proc, args=(dm_xyz, dm_cond), non_blocking=True) kern_counts = 0 - kern = libvhf_rys.RYS_build_j - - for i in range(n_groups): - for j in range(i+1): - ij_shls = (l_ctr_bas_loc[i], l_ctr_bas_loc[i+1], - l_ctr_bas_loc[j], l_ctr_bas_loc[j+1]) - tile_ij_mapping = tile_mappings[i,j] - for k in range(i+1): - for l in range(k+1): - llll = f'({l_symb[i]}{l_symb[j]}|{l_symb[k]}{l_symb[l]})' - kl_shls = (l_ctr_bas_loc[k], l_ctr_bas_loc[k+1], - l_ctr_bas_loc[l], l_ctr_bas_loc[l+1]) - tile_kl_mapping = tile_mappings[k,l] - scheme = _j_engine_quartets_scheme(mol, uniq_l_ctr[[i, j, k, l]]) - err = kern( - ctypes.cast(vj_xyz.data.ptr, ctypes.c_void_p), - ctypes.cast(dm_xyz.data.ptr, ctypes.c_void_p), - ctypes.c_int(n_dm), ctypes.c_int(nao), - rys_envs, (ctypes.c_int*3)(*scheme), - (ctypes.c_int*8)(*ij_shls, *kl_shls), - ctypes.c_int(tile_ij_mapping.size), - ctypes.c_int(tile_kl_mapping.size), - ctypes.cast(tile_ij_mapping.data.ptr, ctypes.c_void_p), - ctypes.cast(tile_kl_mapping.data.ptr, ctypes.c_void_p), - ctypes.cast(vhfopt.tile_q_cond.data.ptr, ctypes.c_void_p), - ctypes.cast(vhfopt.q_cond.data.ptr, ctypes.c_void_p), - lib.c_null_ptr(), - ctypes.cast(dm_cond.data.ptr, ctypes.c_void_p), - ctypes.c_float(log_cutoff), - ctypes.cast(pool.data.ptr, ctypes.c_void_p), - ctypes.cast(info.data.ptr, ctypes.c_void_p), - ctypes.c_int(workers), - mol._atm.ctypes, ctypes.c_int(mol.natm), - mol._bas.ctypes, ctypes.c_int(mol.nbas), mol._env.ctypes) - if err != 0: - raise RuntimeError(f'RYS_build_jk kernel for {llll} failed') - if log.verbose >= logger.DEBUG1: - t1, t1p = log.timer_debug1(f'processing {llll}, tasks = {info[1]}', *t1), t1 - if llll not in timing_collection: - timing_collection[llll] = 0 - timing_collection[llll] += t1[1] - t1p[1] - kern_counts += 1 + timing_collection = Counter() + vj_dist = [] + for vj, counts, t_counter in results: + kern_counts += counts + 
timing_collection += t_counter + vj_dist.append(vj) if log.verbose >= logger.DEBUG1: log.debug1('kernel launches %d', kern_counts) for llll, t in timing_collection.items(): log.debug1('%s wall time %.2f', llll, t) - cp.cuda.Stream.null.synchronize() - log.timer_debug1('cuda kernel', *t2) + vj_xyz = multi_gpu.array_reduce(vj_dist, inplace=True) vj_xyz = vj_xyz.get() vj = np.empty_like(dms) libvhf_rys.transform_xyz_to_cart( vj.ctypes, vj_xyz.ctypes, ao_loc.ctypes, pair_loc.ctypes, mol._bas.ctypes, ctypes.c_int(mol.nbas), mol._env.ctypes) #:vj = cp.einsum('pi,npq,qj->nij', vhfopt.coeff, cp.asarray(vj), vhfopt.coeff) - vj = sandwich_dot(vj, vhfopt.coeff) + vj = sandwich_dot(cp.asarray(vj), vhfopt.coeff) vj = transpose_sum(vj) vj *= 2. - vj = vj.reshape(dm.shape) h_shls = vhfopt.h_shls if h_shls: @@ -433,7 +445,7 @@ def get_j(mol, dm, hermi=0, vhfopt=None, verbose=None): scripts = ['ji->s2kl'] shls_excludes = [0, h_shls[0]] * 4 vs_h = _vhf.direct_mapdm('int2e_cart', 's8', scripts, - dms.get(), 1, mol._atm, mol._bas, mol._env, + dms, 1, mol._atm, mol._bas, mol._env, shls_excludes=shls_excludes) vj1 = vs_h[0].reshape(n_dm,nao,nao) coeff = vhfopt.coeff @@ -443,6 +455,7 @@ def get_j(mol, dm, hermi=0, vhfopt=None, verbose=None): vj[i] += coeff.T.dot(cp.asarray(v)).dot(coeff) log.timer_debug1('get_j pass 2 for h functions on cpu', *cput1) + vj = vj.reshape(dm.shape) log.timer('vj', *cput0) return vj @@ -457,7 +470,6 @@ def __init__(self, mol, cutoff=1e-13): # Hold cache on GPU devices self._rys_envs = {} - self._mol_gpu = {} self._q_cond = {} self._tile_q_cond = {} self._s_estimator = {} @@ -550,11 +562,11 @@ def rys_envs(self): _bas = cp.array(mol._bas) _env = cp.array(_scale_sp_ctr_coeff(mol)) ao_loc = cp.array(mol.ao_loc) - self._mol_gpu[device_id] = (_atm, _bas, _env, ao_loc) - self._rys_envs[device_id] = RysIntEnvVars( + self._rys_envs[device_id] = rys_envs = RysIntEnvVars( mol.natm, mol.nbas, _atm.data.ptr, _bas.data.ptr, _env.data.ptr, ao_loc.data.ptr) + rys_envs._env_ref_holder = (_atm, _bas, _env, ao_loc) return self._rys_envs[device_id] class RysIntEnvVars(ctypes.Structure): @@ -600,13 +612,12 @@ def g_pair_idx(ij_inc=None): def init_constant(mol): g_idx, offsets = g_pair_idx() - for device_id in range(_num_devices): - with cp.cuda.Device(device_id), _streams[device_id]: - err = libvhf_rys.RYS_init_constant( - g_idx.ctypes, offsets.ctypes, mol._env.ctypes, - ctypes.c_int(mol._env.size), ctypes.c_int(SHM_SIZE)) - if err != 0: - raise RuntimeError(f'CUDA kernel initialization on device {device_id}') + err = libvhf_rys.RYS_init_constant( + g_idx.ctypes, offsets.ctypes, mol._env.ctypes, + ctypes.c_int(mol._env.size), ctypes.c_int(SHM_SIZE)) + if err != 0: + device_id = cp.cuda.device.get_device_id() + raise RuntimeError(f'CUDA kernel initialization on device {device_id}') def _make_tril_tile_mappings(l_ctr_bas_loc, tile_q_cond, cutoff, tile=TILE): n_groups = len(l_ctr_bas_loc) - 1 diff --git a/gpu4pyscf/scf/tests/test_scf_jk.py b/gpu4pyscf/scf/tests/test_scf_jk.py index 78ae68eb..e311482f 100644 --- a/gpu4pyscf/scf/tests/test_scf_jk.py +++ b/gpu4pyscf/scf/tests/test_scf_jk.py @@ -125,3 +125,32 @@ def test_jk_hermi0(): assert abs(vj2+vj3 - vj1).max() < 1e-9 assert abs(vk2+vk3 - vk1).max() < 1e-9 + +def test_jk_hermi0_l5(): + mol = pyscf.M( + atom = ''' + O 0.000 -0. 0.1174 + H -0.757 4. -0.4696 + H 0.757 4. -0.4696 + C 1. 1. 0. + H 4. 0. 3. + H 0. 1. 
.6 + ''', + basis={'default': 'def2-tzvp', 'O': [[5, [1., 1.]]]}, + unit='B',) + + np.random.seed(9) + nao = mol.nao + dm = np.random.rand(nao, nao) + vj, vk = jk.get_jk(mol, dm, hermi=0) + vj = vj.get() + vk = vk.get() + ref = get_jk(mol, dm, hermi=0) + assert abs(vj - ref[0]).max() < 1e-9 + assert abs(vk - ref[1]).max() < 1e-9 + assert abs(lib.fp(vj) - -61.28856847097108) < 1e-9 + assert abs(lib.fp(vk) - -76.38373664249241) < 1e-9 + + vj = jk.get_j(mol, dm, hermi=0).get() + assert abs(vj - ref[0]).max() < 1e-9 + assert abs(lib.fp(vj) - -61.28856847097108) < 1e-9 diff --git a/gpu4pyscf/scf/uhf.py b/gpu4pyscf/scf/uhf.py index 12a01d57..1107cbd2 100644 --- a/gpu4pyscf/scf/uhf.py +++ b/gpu4pyscf/scf/uhf.py @@ -38,10 +38,6 @@ def make_rdm1(mo_coeff, mo_occ, **kwargs): mo_b = mo_coeff[1] dm_a = cupy.dot(mo_a*mo_occ[0], mo_a.conj().T) dm_b = cupy.dot(mo_b*mo_occ[1], mo_b.conj().T) -# DO NOT make tag_array for DM here because the DM arrays may be modified and -# passed to functions like get_jk, get_vxc. These functions may take the tags -# (mo_coeff, mo_occ) to compute the potential if tags were found in the DM -# arrays and modifications to DM arrays may be ignored. return tag_array((dm_a, dm_b), mo_coeff=mo_coeff, mo_occ=mo_occ) diff --git a/gpu4pyscf/solvent/grad/pcm.py b/gpu4pyscf/solvent/grad/pcm.py index 3fe7cb6c..28711f77 100644 --- a/gpu4pyscf/solvent/grad/pcm.py +++ b/gpu4pyscf/solvent/grad/pcm.py @@ -40,13 +40,6 @@ def grad_switch_h(x): dy[x>1] = 0.0 return dy -def gradgrad_switch_h(x): - ''' 2nd derivative of h(x) ''' - ddy = 60.0*x - 180.0*x**2 + 120*x**3 - ddy[x<0] = 0.0 - ddy[x>1] = 0.0 - return ddy - def get_dF_dA(surface): ''' J. Chem. Phys. 133, 244111 (2010), Appendix C @@ -63,10 +56,9 @@ def get_dF_dA(surface): dF = cupy.zeros([ngrids, natom, 3]) dA = cupy.zeros([ngrids, natom, 3]) - for ia in range(atom_coords.shape[0]): + for ia in range(natom): p0,p1 = surface['gslice_by_atom'][ia] coords = grid_coords[p0:p1] - p1 = p0 + coords.shape[0] ri_rJ = cupy.expand_dims(coords, axis=1) - atom_coords riJ = cupy.linalg.norm(ri_rJ, axis=-1) diJ = (riJ - R_in_J) / R_sw_J @@ -145,9 +137,7 @@ def get_dD_dS(surface, with_S=True, with_D=False, stream=None): ''' charge_exp = surface['charge_exp'] grid_coords = surface['grid_coords'] - switch_fun = surface['switch_fun'] norm_vec = surface['norm_vec'] - R_vdw = surface['R_vdw'] n = charge_exp.shape[0] dS = cupy.empty([3,n,n]) dD = None @@ -163,9 +153,7 @@ def get_dD_dS(surface, with_S=True, with_D=False, stream=None): dD_ptr, dS_ptr, ctypes.cast(grid_coords.data.ptr, ctypes.c_void_p), ctypes.cast(norm_vec.data.ptr, ctypes.c_void_p), - ctypes.cast(R_vdw.data.ptr, ctypes.c_void_p), ctypes.cast(charge_exp.data.ptr, ctypes.c_void_p), - ctypes.cast(switch_fun.data.ptr, ctypes.c_void_p), ctypes.c_int(n) ) if err != 0: @@ -181,7 +169,7 @@ def get_dSii(surface, dF): dSii = dSii_dF[:,None] * dF return dSii -def grad_nuc(pcmobj, dm): +def grad_nuc(pcmobj, dm, q_sym = None): mol = pcmobj.mol log = logger.new_logger(mol, mol.verbose) t1 = log.init_timer() @@ -194,7 +182,8 @@ def grad_nuc(pcmobj, dm): pcmobj._get_vind(dm) mol = pcmobj.mol - q_sym = pcmobj._intermediates['q_sym'].get() + if q_sym is None: + q_sym = pcmobj._intermediates['q_sym'].get() gridslice = pcmobj.surface['gslice_by_atom'] grid_coords = pcmobj.surface['grid_coords'].get() exponents = pcmobj.surface['charge_exp'].get() @@ -220,7 +209,7 @@ def grad_nuc(pcmobj, dm): t1 = log.timer_debug1('grad nuc', *t1) return de -def grad_qv(pcmobj, dm): +def grad_qv(pcmobj, dm, q_sym = None): ''' 
contributions due to integrals ''' @@ -237,7 +226,8 @@ def grad_qv(pcmobj, dm): gridslice = pcmobj.surface['gslice_by_atom'] charge_exp = pcmobj.surface['charge_exp'] grid_coords = pcmobj.surface['grid_coords'] - q_sym = pcmobj._intermediates['q_sym'] + if q_sym is None: + q_sym = pcmobj._intermediates['q_sym'] intopt = int3c1e.VHFOpt(mol) intopt.build(1e-14, aosym=False) @@ -282,12 +272,23 @@ def grad_solver(pcmobj, dm): vK_1 = cupy.linalg.solve(K.T, v_grids) epsilon = pcmobj.eps + def contract_bra(a, B, c): + ''' i,xij,j->jx ''' + tmp = a.dot(B) + return (tmp*c).T + + def contract_ket(a, B, c): + ''' i,xij,j->ix ''' + tmp = B.dot(c) + return (a*tmp).T + de = cupy.zeros([pcmobj.mol.natm,3]) if pcmobj.method.upper() in ['C-PCM', 'CPCM', 'COSMO']: dD, dS = get_dD_dS(pcmobj.surface, with_D=False, with_S=True) # dR = 0, dK = dS - de_dS = (vK_1 * dS.dot(q)).T # cupy.einsum('i,xij,j->ix', vK_1, dS, q) + de_dS = 0.5 * contract_ket(vK_1, dS, q) + de_dS -= 0.5 * contract_bra(vK_1, dS, q) de -= cupy.asarray([cupy.sum(de_dS[p0:p1], axis=0) for p0,p1 in gridslice]) dD = dS = None @@ -295,24 +296,13 @@ def grad_solver(pcmobj, dm): dSii = get_dSii(pcmobj.surface, dF) de -= 0.5*contract('i,xij->jx', vK_1*q, dSii) # 0.5*cupy.einsum('i,xij,i->jx', vK_1, dSii, q) - elif pcmobj.method.upper() in ['IEF-PCM', 'IEFPCM', 'SS(V)PE', 'SMD']: + elif pcmobj.method.upper() in ['IEF-PCM', 'IEFPCM', 'SMD']: dF, dA = get_dF_dA(pcmobj.surface) dSii = get_dSii(pcmobj.surface, dF) dF = None dD, dS = get_dD_dS(pcmobj.surface, with_D=True, with_S=True) - def contract_bra(a, B, c): - ''' i,xij,j->jx ''' - tmp = a.dot(B) - return (tmp*c).T - - def contract_ket(a, B, c): - ''' i,xij,j->ix ''' - tmp = B.dot(c) - return (a*tmp).T - - # IEF-PCM and SS(V)PE formally are the same in gradient calculation # dR = f_eps/(2*pi) * (dD*A + D*dA), # dK = dS - f_eps/(2*pi) * (dD*A*S + D*dA*S + D*A*dS) f_epsilon = (epsilon - 1.0)/(epsilon + 1.0) @@ -352,6 +342,67 @@ def contract_ket(a, B, c): de_dK = de_dS0 - fac * (de_dD + de_dA + de_dS1) de += de_dR - de_dK + elif pcmobj.method.upper() in [ 'SS(V)PE' ]: + dF, dA = get_dF_dA(pcmobj.surface) + dSii = get_dSii(pcmobj.surface, dF) + dF = None + + dD, dS = get_dD_dS(pcmobj.surface, with_D=True, with_S=True) + + # dR = f_eps/(2*pi) * (dD*A + D*dA), + # dK = dS - f_eps/(2*pi) * (dD*A*S + D*dA*S + D*A*dS) + f_epsilon = (epsilon - 1.0)/(epsilon + 1.0) + fac = f_epsilon/(2.0*PI) + + Av = A*v_grids + de_dR = 0.5*fac * contract_ket(vK_1, dD, Av) + de_dR -= 0.5*fac * contract_bra(vK_1, dD, Av) + de_dR = cupy.asarray([cupy.sum(de_dR[p0:p1], axis=0) for p0,p1 in gridslice]) + + vK_1_D = vK_1.dot(D) + vK_1_Dv = vK_1_D * v_grids + de_dR += 0.5*fac * contract('j,xjn->nx', vK_1_Dv, dA) + + de_dS0 = 0.5*contract_ket(vK_1, dS, q) + de_dS0 -= 0.5*contract_bra(vK_1, dS, q) + de_dS0 = cupy.asarray([cupy.sum(de_dS0[p0:p1], axis=0) for p0,p1 in gridslice]) + + vK_1_q = vK_1 * q + de_dS0 += 0.5*contract('i,xin->nx', vK_1_q, dSii) + + vK_1_DA = vK_1_D*A + de_dS1 = 0.5*contract_ket(vK_1_DA, dS, q) + de_dS1 -= 0.5*contract_bra(vK_1_DA, dS, q) + de_dS1 = cupy.asarray([cupy.sum(de_dS1[p0:p1], axis=0) for p0,p1 in gridslice]) + vK_1_DAq = vK_1_DA*q + de_dS1 += 0.5*contract('j,xjn->nx', vK_1_DAq, dSii) + + DT_q = cupy.dot(D.T, q) + ADT_q = A * DT_q + de_dS1_T = 0.5*contract_ket(vK_1, dS, ADT_q) + de_dS1_T -= 0.5*contract_bra(vK_1, dS, ADT_q) + de_dS1_T = cupy.asarray([cupy.sum(de_dS1_T[p0:p1], axis=0) for p0,p1 in gridslice]) + vK_1_ADT_q = vK_1 * ADT_q + de_dS1_T += 0.5*contract('j,xjn->nx', vK_1_ADT_q, dSii) + + Sq = 
cupy.dot(S,q) + ASq = A*Sq + de_dD = 0.5*contract_ket(vK_1, dD, ASq) + de_dD -= 0.5*contract_bra(vK_1, dD, ASq) + de_dD = cupy.asarray([cupy.sum(de_dD[p0:p1], axis=0) for p0,p1 in gridslice]) + + vK_1_S = cupy.dot(vK_1, S) + vK_1_SA = vK_1_S * A + de_dD_T = 0.5*contract_ket(vK_1_SA, -dD.transpose(0,2,1), q) + de_dD_T -= 0.5*contract_bra(vK_1_SA, -dD.transpose(0,2,1), q) + de_dD_T = cupy.asarray([cupy.sum(de_dD_T[p0:p1], axis=0) for p0,p1 in gridslice]) + + de_dA = 0.5*contract('j,xjn->nx', vK_1_D*Sq, dA) # 0.5*cupy.einsum('j,xjn,j->nx', vK_1_D, dA, Sq) + + de_dA_T = 0.5*contract('j,xjn->nx', vK_1_S*DT_q, dA) + + de_dK = de_dS0 - 0.5 * fac * (de_dD + de_dA + de_dS1 + de_dD_T + de_dA_T + de_dS1_T) + de += de_dR - de_dK else: raise RuntimeError(f"Unknown implicit solvent model: {pcmobj.method}") t1 = log.timer_debug1('grad solver', *t1) diff --git a/gpu4pyscf/solvent/hessian/pcm.py b/gpu4pyscf/solvent/hessian/pcm.py index 538cb859..11c3e1df 100644 --- a/gpu4pyscf/solvent/hessian/pcm.py +++ b/gpu4pyscf/solvent/hessian/pcm.py @@ -19,141 +19,685 @@ import numpy import cupy +import ctypes from pyscf import lib, gto from gpu4pyscf import scf -from gpu4pyscf.solvent.pcm import PI -from gpu4pyscf.solvent.grad.pcm import grad_qv, grad_solver, grad_nuc, get_dD_dS, get_dF_dA, get_dSii +from gpu4pyscf.solvent.pcm import PI, switch_h, libsolvent +from gpu4pyscf.solvent.grad.pcm import grad_qv, grad_solver, grad_nuc, get_dD_dS, get_dF_dA, get_dSii, grad_switch_h from gpu4pyscf.df import int3c2e from gpu4pyscf.lib import logger from gpu4pyscf.hessian.jk import _ao2mo from gpu4pyscf.gto.int3c1e_ip import int1e_grids_ip1, int1e_grids_ip2 +from gpu4pyscf.gto.int3c1e_ipip import int1e_grids_ipip1, int1e_grids_ipvip1, int1e_grids_ipip2, int1e_grids_ip1ip2 from gpu4pyscf.gto import int3c1e from gpu4pyscf.gto.int3c1e import int1e_grids +from pyscf import lib as pyscf_lib -def hess_nuc(pcmobj): - raise NotImplementedError("Not tested") +def gradgrad_switch_h(x): + ''' 2nd derivative of h(x) ''' + ddy = 60.0*x - 180.0*x**2 + 120.0*x**3 + ddy[x<0] = 0.0 + ddy[x>1] = 0.0 + return ddy + +def get_d2F_d2A(surface): + ''' + Notations adopted from + J. Chem. Phys. 
133, 244111 (2010), Appendix C + ''' + atom_coords = surface['atom_coords'] + grid_coords = surface['grid_coords'] + switch_fun = surface['switch_fun'] + area = surface['area'] + R_in_J = surface['R_in_J'] + R_sw_J = surface['R_sw_J'] + + ngrids = grid_coords.shape[0] + natom = atom_coords.shape[0] + d2F = cupy.zeros([ngrids, natom, natom, 3, 3]) + d2A = cupy.zeros([ngrids, natom, natom, 3, 3]) + + for i_grid_atom in range(natom): + p0,p1 = surface['gslice_by_atom'][i_grid_atom] + coords = grid_coords[p0:p1] + si_rJ = cupy.expand_dims(coords, axis=1) - atom_coords + norm_si_rJ = cupy.linalg.norm(si_rJ, axis=-1) + diJ = (norm_si_rJ - R_in_J) / R_sw_J + diJ[:,i_grid_atom] = 1.0 + diJ[diJ < 1e-8] = 0.0 + si_rJ[:,i_grid_atom,:] = 0.0 + si_rJ[diJ < 1e-8] = 0.0 + + fiJ = switch_h(diJ) + dfiJ = grad_switch_h(diJ) + + fiJK = fiJ[:, :, cupy.newaxis] * fiJ[:, cupy.newaxis, :] + dfiJK = dfiJ[:, :, cupy.newaxis] * dfiJ[:, cupy.newaxis, :] + R_sw_JK = R_sw_J[:, cupy.newaxis] * R_sw_J[cupy.newaxis, :] + norm_si_rJK = norm_si_rJ[:, :, cupy.newaxis] * norm_si_rJ[:, cupy.newaxis, :] + terms_size_ngrids_natm_natm = dfiJK / (fiJK * norm_si_rJK * R_sw_JK) + si_rJK = si_rJ[:, :, cupy.newaxis, :, cupy.newaxis] * si_rJ[:, cupy.newaxis, :, cupy.newaxis, :] + d2fiJK_offdiagonal = terms_size_ngrids_natm_natm[:, :, :, cupy.newaxis, cupy.newaxis] * si_rJK + + d2fiJ = gradgrad_switch_h(diJ) + terms_size_ngrids_natm = d2fiJ / (norm_si_rJ**2 * R_sw_J) - dfiJ / (norm_si_rJ**3) + si_rJJ = si_rJ[:, :, :, cupy.newaxis] * si_rJ[:, :, cupy.newaxis, :] + d2fiJK_diagonal = cupy.einsum('qA,qAdD->qAdD', terms_size_ngrids_natm, si_rJJ) + d2fiJK_diagonal += cupy.einsum('qA,dD->qAdD', dfiJ / norm_si_rJ, cupy.eye(3)) + d2fiJK_diagonal /= (fiJ * R_sw_J)[:, :, cupy.newaxis, cupy.newaxis] + + d2fiJK = d2fiJK_offdiagonal + for i_atom in range(natom): + d2fiJK[:, i_atom, i_atom, :, :] = d2fiJK_diagonal[:, i_atom, :, :] + + Fi = switch_fun[p0:p1] + Ai = area[p0:p1] + + d2F[p0:p1, :, :, :, :] += cupy.einsum('q,qABdD->qABdD', Fi, d2fiJK) + d2A[p0:p1, :, :, :, :] += cupy.einsum('q,qABdD->qABdD', Ai, d2fiJK) + + d2fiJK_grid_atom_offdiagonal = -cupy.einsum('qABdD->qAdD', d2fiJK) + d2F[p0:p1, i_grid_atom, :, :, :] = cupy.einsum('q,qAdD->qAdD', Fi, d2fiJK_grid_atom_offdiagonal.transpose(0,1,3,2)) + d2F[p0:p1, :, i_grid_atom, :, :] = cupy.einsum('q,qAdD->qAdD', Fi, d2fiJK_grid_atom_offdiagonal) + d2A[p0:p1, i_grid_atom, :, :, :] = cupy.einsum('q,qAdD->qAdD', Ai, d2fiJK_grid_atom_offdiagonal.transpose(0,1,3,2)) + d2A[p0:p1, :, i_grid_atom, :, :] = cupy.einsum('q,qAdD->qAdD', Ai, d2fiJK_grid_atom_offdiagonal) + + d2fiJK_grid_atom_diagonal = -cupy.einsum('qAdD->qdD', d2fiJK_grid_atom_offdiagonal) + d2F[p0:p1, i_grid_atom, i_grid_atom, :, :] = cupy.einsum('q,qdD->qdD', Fi, d2fiJK_grid_atom_diagonal) + d2A[p0:p1, i_grid_atom, i_grid_atom, :, :] = cupy.einsum('q,qdD->qdD', Ai, d2fiJK_grid_atom_diagonal) + + d2F = d2F.transpose(1,2,3,4,0) + d2A = d2A.transpose(1,2,3,4,0) + return d2F, d2A + +def get_d2Sii(surface, dF, d2F, stream=None): + ''' Second derivative of S matrix (diagonal only) + ''' + charge_exp = surface['charge_exp'] + switch_fun = surface['switch_fun'] + ngrids = switch_fun.shape[0] + dF = dF.transpose(2,0,1) + natm = dF.shape[0] + assert dF.shape == (natm, 3, ngrids) + + # dF_dF = dF[:, cupy.newaxis, :, cupy.newaxis, :] * dF[cupy.newaxis, :, cupy.newaxis, :, :] + # dF_dF_over_F3 = dF_dF * (1.0/(switch_fun**3)) + # d2F_over_F2 = d2F * (1.0/(switch_fun**2)) + # d2Sii = 2 * dF_dF_over_F3 - d2F_over_F2 + # d2Sii = (2.0/PI)**0.5 * (d2Sii * 
charge_exp) + + dF = dF.flatten() # Make sure the underlying data order is the same as shape shows + d2F = d2F.flatten() # Make sure the underlying data order is the same as shape shows + d2Sii = cupy.empty((natm, natm, 3, 3, ngrids), dtype=cupy.float64) + if stream is None: + stream = cupy.cuda.get_current_stream() + err = libsolvent.pcm_d2f_to_d2sii( + ctypes.cast(stream.ptr, ctypes.c_void_p), + ctypes.cast(switch_fun.data.ptr, ctypes.c_void_p), + ctypes.cast(dF.data.ptr, ctypes.c_void_p), + ctypes.cast(d2F.data.ptr, ctypes.c_void_p), + ctypes.cast(charge_exp.data.ptr, ctypes.c_void_p), + ctypes.cast(d2Sii.data.ptr, ctypes.c_void_p), + ctypes.c_int(natm), + ctypes.c_int(ngrids), + ) + if err != 0: + raise RuntimeError('Failed in converting PCM d2F to d2Sii.') + return d2Sii + +def get_d2D_d2S(surface, with_S=True, with_D=False, stream=None): + ''' Second derivatives of D matrix and S matrix (offdiagonals only) + ''' + charge_exp = surface['charge_exp'] + grid_coords = surface['grid_coords'] + norm_vec = surface['norm_vec'] + n = charge_exp.shape[0] + d2S = cupy.empty([3,3,n,n]) + d2D = None + d2S_ptr = ctypes.cast(d2S.data.ptr, ctypes.c_void_p) + d2D_ptr = pyscf_lib.c_null_ptr() + if with_D: + d2D = cupy.empty([3,3,n,n]) + d2D_ptr = ctypes.cast(d2D.data.ptr, ctypes.c_void_p) + if stream is None: + stream = cupy.cuda.get_current_stream() + err = libsolvent.pcm_d2d_d2s( + ctypes.cast(stream.ptr, ctypes.c_void_p), + d2D_ptr, d2S_ptr, + ctypes.cast(grid_coords.data.ptr, ctypes.c_void_p), + ctypes.cast(norm_vec.data.ptr, ctypes.c_void_p), + ctypes.cast(charge_exp.data.ptr, ctypes.c_void_p), + ctypes.c_int(n) + ) + if err != 0: + raise RuntimeError('Failed in generating PCM d2D and d2S matrices.') + return d2D, d2S + +def analytical_hess_nuc(pcmobj, dm, verbose=None): if not pcmobj._intermediates: pcmobj.build() + dm_cache = pcmobj._intermediates.get('dm', None) + if dm_cache is not None and cupy.linalg.norm(dm_cache - dm) < 1e-10: + pass + else: + pcmobj._get_vind(dm) mol = pcmobj.mol + log = logger.new_logger(pcmobj, verbose) + t1 = log.init_timer() + q_sym = pcmobj._intermediates['q_sym'].get() gridslice = pcmobj.surface['gslice_by_atom'] grid_coords = pcmobj.surface['grid_coords'].get() exponents = pcmobj.surface['charge_exp'].get() + ngrids = q_sym.shape[0] + atom_coords = mol.atom_coords(unit='B') atom_charges = numpy.asarray(mol.atom_charges(), dtype=numpy.float64) fakemol_nuc = gto.fakemol_for_charges(atom_coords) fakemol = gto.fakemol_for_charges(grid_coords, expnt=exponents**2) - # nuclei potential response + d2e_from_d2I = numpy.zeros([mol.natm, mol.natm, 3, 3]) + int2c2e_ip1ip2 = mol._add_suffix('int2c2e_ip1ip2') - v_ng_ip1ip2 = gto.mole.intor_cross(int2c2e_ip1ip2, fakemol_nuc, fakemol).reshape([3,3,mol.natm,-1]) - dv_g = numpy.einsum('n,xyng->ngxy', atom_charges, v_ng_ip1ip2) - dv_g = numpy.einsum('ngxy,g->ngxy', dv_g, q_sym) + d2I_dAdC = gto.mole.intor_cross(int2c2e_ip1ip2, fakemol_nuc, fakemol) + d2I_dAdC = d2I_dAdC.reshape(3, 3, mol.natm, ngrids) + for i_atom in range(mol.natm): + g0,g1 = gridslice[i_atom] + d2e_from_d2I[:, i_atom, :, :] += numpy.einsum('A,dDAq,q->AdD', atom_charges, d2I_dAdC[:, :, :, g0:g1], q_sym[g0:g1]) + d2e_from_d2I[i_atom, :, :, :] += numpy.einsum('A,dDAq,q->AdD', atom_charges, d2I_dAdC[:, :, :, g0:g1], q_sym[g0:g1]) - de = numpy.zeros([mol.natm, mol.natm, 3, 3]) - for ia in range(mol.natm): - p0, p1 = gridslice[ia] - de_tmp = numpy.sum(dv_g[:,p0:p1], axis=1) - de[:,ia] -= de_tmp - #de[ia,:] -= de_tmp.transpose([0,2,1]) + int2c2e_ipip1 = 
+    int2c2e_ipip1 = mol._add_suffix('int2c2e_ipip1')
+    # # Some explanations here:
+    # # Why can we use ip1ip2 here? Because of the translational invariance:
+    # # $\frac{\partial^2 I_{AC}}{\partial A^2} + \frac{\partial^2 I_{AC}}{\partial A \partial C} = 0$
+    # # Why not use ipip1 here? Because each nucleus, a point charge, is handled as a
+    # # Gaussian charge with exponent = 1e16. This causes severe numerical problems in
+    # # int2c2e_ipip1 and makes the main diagonal of the Hessian garbage.
+    # d2I_dA2 = gto.mole.intor_cross(int2c2e_ipip1, fakemol_nuc, fakemol)
+    d2I_dA2 = -gto.mole.intor_cross(int2c2e_ip1ip2, fakemol_nuc, fakemol)
+    d2I_dA2 = d2I_dA2 @ q_sym
+    d2I_dA2 = d2I_dA2.reshape(3, 3, mol.natm)
+    for i_atom in range(mol.natm):
+        d2e_from_d2I[i_atom, i_atom, :, :] += atom_charges[i_atom] * d2I_dA2[:, :, i_atom]
+
+    d2I_dC2 = gto.mole.intor_cross(int2c2e_ipip1, fakemol, fakemol_nuc)
+    d2I_dC2 = d2I_dC2 @ atom_charges
+    d2I_dC2 = d2I_dC2.reshape(3, 3, ngrids)
+    for i_atom in range(mol.natm):
+        g0,g1 = gridslice[i_atom]
+        d2e_from_d2I[i_atom, i_atom, :, :] += d2I_dC2[:, :, g0:g1] @ q_sym[g0:g1]
+    intopt_derivative = int3c1e.VHFOpt(mol)
+    intopt_derivative.build(cutoff = 1e-14, aosym = False)
-    int2c2e_ip1ip2 = mol._add_suffix('int2c2e_ip1ip2')
-    v_ng_ip1ip2 = gto.mole.intor_cross(int2c2e_ip1ip2, fakemol, fakemol_nuc).reshape([3,3,-1,mol.natm])
-    dv_g = numpy.einsum('n,xygn->gnxy', atom_charges, v_ng_ip1ip2)
-    dv_g = numpy.einsum('gnxy,g->gnxy', dv_g, q_sym)
+    dqdx = get_dqsym_dx(pcmobj, dm, range(mol.natm), intopt_derivative)
+    dqdx = dqdx.get()
-    for ia in range(mol.natm):
-        p0, p1 = gridslice[ia]
-        de_tmp = numpy.sum(dv_g[p0:p1], axis=0)
-        de[ia,:] -= de_tmp
-        #de[ia,:] -= de_tmp.transpose([0,2,1])
+    d2e_from_dIdq = numpy.zeros([mol.natm, mol.natm, 3, 3])
+    for i_atom in range(mol.natm):
+        for i_xyz in range(3):
+            d2e_from_dIdq[i_atom, :, i_xyz, :] = grad_nuc(pcmobj, dm, q_sym = dqdx[i_atom, i_xyz, :])
-    int2c2e_ipip1 = mol._add_suffix('int2c2e_ipip1')
-    v_ng_ipip1 = gto.mole.intor_cross(int2c2e_ipip1, fakemol_nuc, fakemol).reshape([3,3,mol.natm,-1])
-    dv_g = numpy.einsum('g,xyng->nxy', q_sym, v_ng_ipip1)
-    for ia in range(mol.natm):
-        de[ia,ia] -= dv_g[ia] * atom_charges[ia]
-
-    v_ng_ipip1 = gto.mole.intor_cross(int2c2e_ipip1, fakemol, fakemol_nuc).reshape([3,3,-1,mol.natm])
-    dv_g = numpy.einsum('n,xygn->gxy', atom_charges, v_ng_ipip1)
-    dv_g = numpy.einsum('g,gxy->gxy', q_sym, dv_g)
-    for ia in range(mol.natm):
-        p0, p1 = gridslice[ia]
-        de[ia,ia] -= numpy.sum(dv_g[p0:p1], axis=0)
-
-    return de
-
-def hess_qv(pcmobj, dm, verbose=None):
-    raise NotImplementedError("PCM analytical hessian is not tested")
-    if not pcmobj._intermediates or 'q_sym' not in pcmobj._intermediates:
-        pcmobj._get_vind(dm)
-    gridslice = pcmobj.surface['gslice_by_atom']
-    q_sym = pcmobj._intermediates['q_sym']
+    d2e = d2e_from_d2I - d2e_from_dIdq
-    intopt = pcmobj.intopt
-    intopt.clear()
-    # rebuild with aosym
-    intopt.build(1e-14, diag_block_with_triu=True, aosym=False)
-    coeff = intopt.coeff
-    dm_cart = coeff @ dm @ coeff.T
-    #dm_cart = cupy.einsum('pi,ij,qj->pq', coeff, dm, coeff)
-
-    dvj, _ = int3c2e.get_int3c2e_ipip1_hjk(intopt, q_sym, None, dm_cart, with_k=False)
-    dq, _ = int3c2e.get_int3c2e_ipvip1_hjk(intopt, q_sym, None, dm_cart, with_k=False)
-    dvj, _ = int3c2e.get_int3c2e_ip1ip2_hjk(intopt, q_sym, None, dm_cart, with_k=False)
-    dq, _ = int3c2e.get_int3c2e_ipip2_hjk(intopt, q_sym, None, dm_cart, with_k=False)
-
-    cart_ao_idx = intopt.cart_ao_idx
-    rev_cart_ao_idx = numpy.argsort(cart_ao_idx)
-    dvj = 
dvj[:,rev_cart_ao_idx] - - aoslice = intopt.mol.aoslice_by_atom() - dq = cupy.asarray([cupy.sum(dq[:,p0:p1], axis=1) for p0,p1 in gridslice]) - dvj= 2.0 * cupy.asarray([cupy.sum(dvj[:,p0:p1], axis=1) for p0,p1 in aoslice[:,2:]]) - de = dq + dvj - return de.get() - -def hess_elec(pcmobj, dm, verbose=None): - ''' - slow version with finite difference - TODO: use analytical hess_nuc - ''' + t1 = log.timer_debug1('solvent hessian d(dVnuc/dx * q)/dx contribution', *t1) + return d2e + +def analytical_hess_qv(pcmobj, dm, verbose=None): + if not pcmobj._intermediates: + pcmobj.build() + dm_cache = pcmobj._intermediates.get('dm', None) + if dm_cache is not None and cupy.linalg.norm(dm_cache - dm) < 1e-10: + pass + else: + pcmobj._get_vind(dm) + mol = pcmobj.mol log = logger.new_logger(pcmobj, verbose) t1 = log.init_timer() - pmol = pcmobj.mol.copy() - mol = pmol.copy() - coords = mol.atom_coords(unit='Bohr') - - def pcm_grad_scanner(mol): - # TODO: use more analytical forms - pcmobj.reset(mol) - e, v = pcmobj._get_vind(dm) - #return grad_elec(pcmobj, dm) - pcm_grad = grad_nuc(pcmobj, dm) - pcm_grad+= grad_solver(pcmobj, dm) - pcm_grad+= grad_qv(pcmobj, dm) - return pcm_grad - - mol.verbose = 0 - de = numpy.zeros([mol.natm, mol.natm, 3, 3]) - eps = 1e-3 - for ia in range(mol.natm): - for ix in range(3): - dv = numpy.zeros_like(coords) - dv[ia,ix] = eps - mol.set_geom_(coords + dv, unit='Bohr') - g0 = pcm_grad_scanner(mol) - - mol.set_geom_(coords - dv, unit='Bohr') - g1 = pcm_grad_scanner(mol) - de[ia,:,ix] = (g0 - g1)/2.0/eps - t1 = log.timer_debug1('solvent energy', *t1) - pcmobj.reset(pmol) - return de - -def get_dqsym_dx_fix_vgrids(pcmobj, atmlst, inverse_K): + + gridslice = pcmobj.surface['gslice_by_atom'] + charge_exp = pcmobj.surface['charge_exp'] + grid_coords = pcmobj.surface['grid_coords'] + q_sym = pcmobj._intermediates['q_sym'] + + aoslice = mol.aoslice_by_atom() + aoslice = numpy.array(aoslice) + + intopt_derivative = int3c1e.VHFOpt(mol) + intopt_derivative.build(cutoff = 1e-14, aosym = False) + + # fakemol = gto.fakemol_for_charges(grid_coords.get(), expnt=charge_exp.get()**2) + # intopt = int3c2e.VHFOpt(mol, fakemol, 'int2e') + # intopt.build(1e-14, diag_block_with_triu=True, aosym=False) + + d2e_from_d2I = cupy.zeros([mol.natm, mol.natm, 3, 3]) + + # d2I_dA2 = int3c2e.get_int3c2e_general(mol, fakemol, ip_type='ipip1', direct_scf_tol=1e-14) + # d2I_dA2 = cupy.einsum('dijq,q->dij', d2I_dA2, q_sym) + # d2I_dA2 = d2I_dA2.reshape([3, 3, nao, nao]) + d2I_dA2 = int1e_grids_ipip1(mol, grid_coords, charges = q_sym, intopt = intopt_derivative, charge_exponents = charge_exp**2) + for i_atom in range(mol.natm): + p0,p1 = aoslice[i_atom, 2:] + d2e_from_d2I[i_atom, i_atom, :, :] += cupy.einsum('ij,dDij->dD', dm[p0:p1, :], d2I_dA2[:, :, p0:p1, :]) + d2e_from_d2I[i_atom, i_atom, :, :] += cupy.einsum('ij,dDij->dD', dm[:, p0:p1], d2I_dA2[:, :, p0:p1, :].transpose(0,1,3,2)) + d2I_dA2 = None + + # d2I_dAdB = int3c2e.get_int3c2e_general(mol, fakemol, ip_type='ipvip1', direct_scf_tol=1e-14) + # d2I_dAdB = cupy.einsum('dijq,q->dij', d2I_dAdB, q_sym) + # d2I_dAdB = d2I_dAdB.reshape([3, 3, nao, nao]) + d2I_dAdB = int1e_grids_ipvip1(mol, grid_coords, charges = q_sym, intopt = intopt_derivative, charge_exponents = charge_exp**2) + for i_atom in range(mol.natm): + pi0,pi1 = aoslice[i_atom, 2:] + for j_atom in range(mol.natm): + pj0,pj1 = aoslice[j_atom, 2:] + d2e_from_d2I[i_atom, j_atom, :, :] += cupy.einsum('ij,dDij->dD', dm[pi0:pi1, pj0:pj1], d2I_dAdB[:, :, pi0:pi1, pj0:pj1]) + d2e_from_d2I[i_atom, j_atom, :, 
:] += cupy.einsum('ij,dDij->dD', dm[pj0:pj1, pi0:pi1], d2I_dAdB[:, :, pi0:pi1, pj0:pj1].transpose(0,1,3,2)) + d2I_dAdB = None + + for j_atom in range(mol.natm): + g0,g1 = gridslice[j_atom] + # d2I_dAdC = int3c2e.get_int3c2e_general(mol, fakemol, ip_type='ip1ip2', direct_scf_tol=1e-14) + # d2I_dAdC = cupy.einsum('dijq,q->dij', d2I_dAdC[:, :, :, g0:g1], q_sym[g0:g1]) + # d2I_dAdC = d2I_dAdC.reshape([3, 3, nao, nao]) + d2I_dAdC = int1e_grids_ip1ip2(mol, grid_coords[g0:g1, :], charges = q_sym[g0:g1], intopt = intopt_derivative, charge_exponents = charge_exp[g0:g1]**2) + + for i_atom in range(mol.natm): + p0,p1 = aoslice[i_atom, 2:] + d2e_from_d2I[i_atom, j_atom, :, :] += cupy.einsum('ij,dDij->dD', dm[p0:p1, :], d2I_dAdC[:, :, p0:p1, :]) + d2e_from_d2I[i_atom, j_atom, :, :] += cupy.einsum('ij,dDij->dD', dm[:, p0:p1], d2I_dAdC[:, :, p0:p1, :].transpose(0,1,3,2)) + + d2e_from_d2I[j_atom, i_atom, :, :] += cupy.einsum('ij,dDij->dD', dm[p0:p1, :], d2I_dAdC[:, :, p0:p1, :].transpose(1,0,2,3)) + d2e_from_d2I[j_atom, i_atom, :, :] += cupy.einsum('ij,dDij->dD', dm[:, p0:p1], d2I_dAdC[:, :, p0:p1, :].transpose(1,0,3,2)) + d2I_dAdC = None + + # d2I_dC2 = int3c2e.get_int3c2e_general(mol, fakemol, ip_type='ipip2', direct_scf_tol=1e-14) + # d2I_dC2 = cupy.einsum('dijq,ij->dq', d2I_dC2, dm) + # d2I_dC2 = d2I_dC2.reshape([3, 3, ngrids]) + d2I_dC2 = int1e_grids_ipip2(mol, grid_coords, dm = dm, intopt = intopt_derivative, charge_exponents = charge_exp**2) + for i_atom in range(mol.natm): + g0,g1 = gridslice[i_atom] + d2e_from_d2I[i_atom, i_atom, :, :] += d2I_dC2[:, :, g0:g1] @ q_sym[g0:g1] + d2I_dC2 = None + + dqdx = get_dqsym_dx(pcmobj, dm, range(mol.natm), intopt_derivative) + + d2e_from_dIdq = numpy.zeros([mol.natm, mol.natm, 3, 3]) + for i_atom in range(mol.natm): + for i_xyz in range(3): + d2e_from_dIdq[i_atom, :, i_xyz, :] = grad_qv(pcmobj, dm, q_sym = dqdx[i_atom, i_xyz, :]) + + d2e_from_d2I = d2e_from_d2I.get() + d2e = d2e_from_d2I + d2e_from_dIdq + d2e *= -1 + + t1 = log.timer_debug1('solvent hessian d(dI/dx * q)/dx contribution', *t1) + return d2e + +def einsum_ij_Adj_Adi_inverseK(K, Adj_term): + nA, nd, nj = Adj_term.shape + # return cupy.einsum('ij,Adj->Adi', cupy.linalg.inv(K), Adj_term) + return cupy.linalg.solve(K, Adj_term.reshape(nA * nd, nj).T).T.reshape(nA, nd, nj) +def einsum_Adi_ij_Adj_inverseK(Adi_term, K): + nA, nd, nj = Adi_term.shape + # return cupy.einsum('Adi,ij->Adj', Adi_term, cupy.linalg.inv(K)) + return cupy.linalg.solve(K.T, Adi_term.reshape(nA * nd, nj).T).T.reshape(nA, nd, nj) + +def get_dS_dot_q(dS, dSii, q, atmlst, gridslice): + output = cupy.einsum('diA,i->Adi', dSii[:,:,atmlst], q) + for i_atom in atmlst: + g0,g1 = gridslice[i_atom] + output[i_atom, :, g0:g1] += dS[:,g0:g1,:] @ q + output[i_atom, :, :] -= dS[:,:,g0:g1] @ q[g0:g1] + return output +def get_dST_dot_q(dS, dSii, q, atmlst, gridslice): + # S is symmetric + return get_dS_dot_q(dS, dSii, q, atmlst, gridslice) + +def get_dA_dot_q(dA, q, atmlst): + return cupy.einsum('diA,i->Adi', dA[:,:,atmlst], q) + +def get_dD_dot_q(dD, q, atmlst, gridslice, ngrids): + output = cupy.zeros([len(atmlst), 3, ngrids]) + for i_atom in atmlst: + g0,g1 = gridslice[i_atom] + output[i_atom, :, g0:g1] += dD[:,g0:g1,:] @ q + output[i_atom, :, :] -= dD[:,:,g0:g1] @ q[g0:g1] + return output +def get_dDT_dot_q(dD, q, atmlst, gridslice, ngrids): + return get_dD_dot_q(-dD.transpose(0,2,1), q, atmlst, gridslice, ngrids) + +def get_v_dot_d2S_dot_q(d2S, d2Sii, v_left, q_right, natom, gridslice): + output = d2Sii @ (v_left * q_right) + for i_atom in 
range(natom): + gi0,gi1 = gridslice[i_atom] + for j_atom in range(natom): + gj0,gj1 = gridslice[j_atom] + d2S_atom_ij = cupy.einsum('q,dDq->dD', v_left[gi0:gi1], d2S[:,:,gi0:gi1,gj0:gj1] @ q_right[gj0:gj1]) + output[i_atom, i_atom, :, :] += d2S_atom_ij + output[j_atom, j_atom, :, :] += d2S_atom_ij + output[i_atom, j_atom, :, :] -= d2S_atom_ij + output[j_atom, i_atom, :, :] -= d2S_atom_ij + return output +def get_v_dot_d2ST_dot_q(d2S, d2Sii, v_left, q_right, natom, gridslice): + # S is symmetric + return get_v_dot_d2S_dot_q(d2S, d2Sii, v_left, q_right, natom, gridslice) + +def get_v_dot_d2A_dot_q(d2A, v_left, q_right): + return d2A @ (v_left * q_right) + +def get_v_dot_d2D_dot_q(d2D, v_left, q_right, natom, gridslice): + output = cupy.zeros([natom, natom, 3, 3]) + for i_atom in range(natom): + gi0,gi1 = gridslice[i_atom] + for j_atom in range(natom): + gj0,gj1 = gridslice[j_atom] + d2D_atom_ij = cupy.einsum('q,dDq->dD', v_left[gi0:gi1], d2D[:,:,gi0:gi1,gj0:gj1] @ q_right[gj0:gj1]) + output[i_atom, i_atom, :, :] += d2D_atom_ij + output[j_atom, j_atom, :, :] += d2D_atom_ij + output[i_atom, j_atom, :, :] -= d2D_atom_ij + output[j_atom, i_atom, :, :] -= d2D_atom_ij + return output +def get_v_dot_d2DT_dot_q(d2D, v_left, q_right, natom, gridslice): + return get_v_dot_d2D_dot_q(d2D.transpose(0,1,3,2), v_left, q_right, natom, gridslice) + +def analytical_hess_solver(pcmobj, dm, verbose=None): + if not pcmobj._intermediates: + pcmobj.build() + dm_cache = pcmobj._intermediates.get('dm', None) + if dm_cache is not None and cupy.linalg.norm(dm_cache - dm) < 1e-10: + pass + else: + pcmobj._get_vind(dm) + mol = pcmobj.mol + log = logger.new_logger(mol, verbose) + t1 = log.init_timer() + + natom = mol.natm + atmlst = range(natom) # Attention: This cannot be split + + gridslice = pcmobj.surface['gslice_by_atom'] + v_grids = pcmobj._intermediates['v_grids'] + A = pcmobj._intermediates['A'] + D = pcmobj._intermediates['D'] + S = pcmobj._intermediates['S'] + K = pcmobj._intermediates['K'] + R = pcmobj._intermediates['R'] + q = pcmobj._intermediates['q'] + f_epsilon = pcmobj._intermediates['f_epsilon'] + + ngrids = q.shape[0] + + vK_1 = cupy.linalg.solve(K.T, v_grids) + + if pcmobj.method.upper() in ['C-PCM', 'CPCM', 'COSMO']: + _, dS = get_dD_dS(pcmobj.surface, with_D=False, with_S=True) + dF, _ = get_dF_dA(pcmobj.surface) + dSii = get_dSii(pcmobj.surface, dF) + + # dR = 0, dK = dS + # d(S-1 R) = - S-1 dS S-1 R + # d2(S-1 R) = (S-1 dS S-1 dS S-1 R) + (S-1 dS S-1 dS S-1 R) - (S-1 d2S S-1 R) + dSdx_dot_q = get_dS_dot_q(dS, dSii, q, atmlst, gridslice) + S_1_dSdx_dot_q = einsum_ij_Adj_Adi_inverseK(K, dSdx_dot_q) + dSdx_dot_q = None + VS_1_dot_dSdx = get_dST_dot_q(dS, dSii, vK_1, atmlst, gridslice) + dS = None + dSii = None + d2e_from_d2KR = cupy.einsum('Adi,BDi->ABdD', VS_1_dot_dSdx, S_1_dSdx_dot_q) * 2 + + _, d2S = get_d2D_d2S(pcmobj.surface, with_D=False, with_S=True) + d2F, _ = get_d2F_d2A(pcmobj.surface) + d2Sii = get_d2Sii(pcmobj.surface, dF, d2F) + dF = None + d2F = None + d2e_from_d2KR -= get_v_dot_d2S_dot_q(d2S, d2Sii, vK_1, q, natom, gridslice) + d2S = None + d2Sii = None + + dK_1Rv = -S_1_dSdx_dot_q + dvK_1R = -einsum_Adi_ij_Adj_inverseK(VS_1_dot_dSdx, K) @ R + + elif pcmobj.method.upper() in ['IEF-PCM', 'IEFPCM', 'SMD']: + dD, dS = get_dD_dS(pcmobj.surface, with_D=True, with_S=True) + dF, dA = get_dF_dA(pcmobj.surface) + dSii = get_dSii(pcmobj.surface, dF) + + # dR = f_eps/(2*pi) * (dD*A + D*dA) + # dK = dS - f_eps/(2*pi) * (dD*A*S + D*dA*S + D*A*dS) + + # d2R = f_eps/(2*pi) * (d2D*A + dD*dA + dD*dA + 
D*d2A)
+        # d2K = d2S - f_eps/(2*pi) * (d2D*A*S + D*d2A*S + D*A*d2S + dD*dA*S + dD*dA*S + dD*A*dS + dD*A*dS + D*dA*dS + D*dA*dS)
+        # The terms that show up twice in the equation above (dD*dA + dD*dA, for example)
+        # refer to dD/dx * dA/dy + dD/dy * dA/dx; since D is not symmetric, the two are not the same.
+
+        # d(K-1 R) = - K-1 dK K-1 R + K-1 dR
+        # d2(K-1 R) = (K-1 dK K-1 dK K-1 R) + (K-1 dK K-1 dK K-1 R) - (K-1 d2K K-1 R) - (K-1 dK K-1 dR)
+        #             - (K-1 dK K-1 dR) + (K-1 d2R)
+        f_eps_over_2pi = f_epsilon/(2.0*PI)
+
+        dSdx_dot_q = get_dS_dot_q(dS, dSii, q, atmlst, gridslice)
+        DA = D*A
+        dKdx_dot_q = dSdx_dot_q - f_eps_over_2pi * cupy.einsum('ij,Adj->Adi', DA, dSdx_dot_q)
+        dAdx_dot_Sq = get_dA_dot_q(dA, S @ q, atmlst)
+        dKdx_dot_q -= f_eps_over_2pi * cupy.einsum('ij,Adj->Adi', D, dAdx_dot_Sq)
+        AS = (A * S.T).T # It's just diag(A) @ S
+        ASq = AS @ q
+        dDdx_dot_ASq = get_dD_dot_q(dD, ASq, atmlst, gridslice, ngrids)
+        dKdx_dot_q -= f_eps_over_2pi * dDdx_dot_ASq
+        dDdx_dot_ASq = None
+
+        K_1_dot_dKdx_dot_q = einsum_ij_Adj_Adi_inverseK(K, dKdx_dot_q)
+        dKdx_dot_q = None
+
+        vK_1_dot_dSdx = get_dST_dot_q(dS, dSii, vK_1, atmlst, gridslice)
+        vK_1_dot_dKdx = vK_1_dot_dSdx
+        vK_1_dot_dSdx = None
+        vK_1_dot_dDdx = get_dDT_dot_q(dD, vK_1, atmlst, gridslice, ngrids)
+        vK_1_dot_dKdx -= f_eps_over_2pi * cupy.einsum('ij,Adj->Adi', AS.T, vK_1_dot_dDdx)
+        AS = None
+        vK_1D = D.T @ vK_1
+        vK_1D_dot_dAdx = get_dA_dot_q(dA, vK_1D, atmlst)
+        vK_1_dot_dKdx -= f_eps_over_2pi * cupy.einsum('ij,Adj->Adi', S.T, vK_1D_dot_dAdx)
+        vK_1DA = DA.T @ vK_1
+        DA = None
+        vK_1DA_dot_dSdx = get_dST_dot_q(dS, dSii, vK_1DA, atmlst, gridslice)
+        dS = None
+        dSii = None
+        vK_1_dot_dKdx -= f_eps_over_2pi * vK_1DA_dot_dSdx
+        vK_1DA_dot_dSdx = None
+
+        d2e_from_d2KR = cupy.einsum('Adi,BDi->ABdD', vK_1_dot_dKdx, K_1_dot_dKdx_dot_q)
+        d2e_from_d2KR += cupy.einsum('Adi,BDi->BADd', vK_1_dot_dKdx, K_1_dot_dKdx_dot_q)
+
+        d2F, d2A = get_d2F_d2A(pcmobj.surface)
+        vK_1_d2K_q = get_v_dot_d2A_dot_q(d2A, vK_1D, S @ q)
+        vK_1_d2R_V = get_v_dot_d2A_dot_q(d2A, vK_1D, v_grids)
+        d2A = None
+        d2Sii = get_d2Sii(pcmobj.surface, dF, d2F)
+        dF = None
+        d2F = None
+        d2D, d2S = get_d2D_d2S(pcmobj.surface, with_D=True, with_S=True)
+        vK_1_d2K_q += get_v_dot_d2D_dot_q(d2D, vK_1, ASq, natom, gridslice)
+        vK_1_d2R_V += get_v_dot_d2D_dot_q(d2D, vK_1, A * v_grids, natom, gridslice)
+        d2D = None
+        vK_1_d2K_q += get_v_dot_d2S_dot_q(d2S, d2Sii, vK_1DA, q, natom, gridslice)
+        vK_1_d2K_q += cupy.einsum('Adi,BDi->ABdD', vK_1_dot_dDdx, dAdx_dot_Sq)
+        vK_1_d2K_q += cupy.einsum('Adi,BDi->ABdD', vK_1_dot_dDdx * A, dSdx_dot_q)
+        vK_1_d2K_q += cupy.einsum('Adi,BDi->ABdD', vK_1D_dot_dAdx, dSdx_dot_q)
+        vK_1_d2K_q += cupy.einsum('Adi,BDi->BADd', vK_1_dot_dDdx, dAdx_dot_Sq)
+        vK_1_d2K_q += cupy.einsum('Adi,BDi->BADd', vK_1_dot_dDdx * A, dSdx_dot_q)
+        vK_1_d2K_q += cupy.einsum('Adi,BDi->BADd', vK_1D_dot_dAdx, dSdx_dot_q)
+        vK_1_d2K_q *= -f_eps_over_2pi
+        vK_1_d2K_q += get_v_dot_d2S_dot_q(d2S, d2Sii, vK_1, q, natom, gridslice)
+        d2S = None
+        d2Sii = None
+
+        d2e_from_d2KR -= vK_1_d2K_q
+
+        dAdx_dot_V = get_dA_dot_q(dA, v_grids, atmlst)
+        dDdx_dot_AV = get_dD_dot_q(dD, A * v_grids, atmlst, gridslice, ngrids)
+        dRdx_dot_V = f_eps_over_2pi * (dDdx_dot_AV + cupy.einsum('ij,Adj->Adi', D, dAdx_dot_V))
+        dDdx_dot_AV = None
+
+        K_1_dot_dRdx_dot_V = einsum_ij_Adj_Adi_inverseK(K, dRdx_dot_V)
+        dRdx_dot_V = None
+
+        d2e_from_d2KR -= cupy.einsum('Adi,BDi->ABdD', vK_1_dot_dKdx, K_1_dot_dRdx_dot_V)
+        d2e_from_d2KR -= cupy.einsum('Adi,BDi->BADd', vK_1_dot_dKdx, K_1_dot_dRdx_dot_V)
+
+        vK_1_d2R_V += 
cupy.einsum('Adi,BDi->ABdD', vK_1_dot_dDdx, dAdx_dot_V) + vK_1_d2R_V += cupy.einsum('Adi,BDi->BADd', vK_1_dot_dDdx, dAdx_dot_V) + vK_1_d2R_V *= f_eps_over_2pi + + d2e_from_d2KR += vK_1_d2R_V + + dK_1Rv = -K_1_dot_dKdx_dot_q + K_1_dot_dRdx_dot_V + + VK_1D_dot_dAdx = get_dA_dot_q(dA, (D.T @ vK_1).T, atmlst) + VK_1_dot_dDdx = get_dDT_dot_q(dD, vK_1, atmlst, gridslice, ngrids) + VK_1_dot_dRdx = f_eps_over_2pi * (VK_1D_dot_dAdx + VK_1_dot_dDdx * A) + + dvK_1R = -einsum_Adi_ij_Adj_inverseK(vK_1_dot_dKdx, K) @ R + VK_1_dot_dRdx + + elif pcmobj.method.upper() in ['SS(V)PE']: + dD, dS = get_dD_dS(pcmobj.surface, with_D=True, with_S=True) + dF, dA = get_dF_dA(pcmobj.surface) + dSii = get_dSii(pcmobj.surface, dF) + + # dR = f_eps/(2*pi) * (dD*A + D*dA) + # dK = dS - f_eps/(4*pi) * (dD*A*S + D*dA*S + D*A*dS + dST*AT*DT + ST*dAT*DT + ST*AT*dDT) + + # d2R = f_eps/(2*pi) * (d2D*A + dD*dA + dD*dA + D*d2A) + # d2K = d2S - f_eps/(4*pi) * (d2D*A*S + D*d2A*S + D*A*d2S + dD*dA*S + dD*dA*S + dD*A*dS + dD*A*dS + D*dA*dS + D*dA*dS + # + d2ST*AT*DT + ST*d2AT*DT + ST*AT*d2DT + dST*dAT*DT + dST*dAT*DT + dST*AT*dDT + dST*AT*dDT + ST*dAT*dDT + ST*dAT*dDT) + f_eps_over_2pi = f_epsilon/(2.0*PI) + f_eps_over_4pi = f_epsilon/(4.0*PI) + + dSdx_dot_q = get_dS_dot_q(dS, dSii, q, atmlst, gridslice) + DA = D*A + dKdx_dot_q = dSdx_dot_q - f_eps_over_4pi * cupy.einsum('ij,Adj->Adi', DA, dSdx_dot_q) + dAdx_dot_Sq = get_dA_dot_q(dA, S @ q, atmlst) + dKdx_dot_q -= f_eps_over_4pi * cupy.einsum('ij,Adj->Adi', D, dAdx_dot_Sq) + AS = (A * S.T).T # It's just diag(A) @ S + ASq = AS @ q + dDdx_dot_ASq = get_dD_dot_q(dD, ASq, atmlst, gridslice, ngrids) + dKdx_dot_q -= f_eps_over_4pi * dDdx_dot_ASq + dDdx_dot_ASq = None + dDdxT_dot_q = get_dDT_dot_q(dD, q, atmlst, gridslice, ngrids) + dKdx_dot_q -= f_eps_over_4pi * cupy.einsum('ij,Adj->Adi', AS.T, dDdxT_dot_q) + dAdxT_dot_DT_q = get_dA_dot_q(dA, D.T @ q, atmlst) + dKdx_dot_q -= f_eps_over_4pi * cupy.einsum('ij,Adj->Adi', S.T, dAdxT_dot_DT_q) + AT_DT_q = DA.T @ q + dSdxT_dot_AT_DT_q = get_dS_dot_q(dS, dSii, AT_DT_q, atmlst, gridslice) + dKdx_dot_q -= f_eps_over_4pi * dSdxT_dot_AT_DT_q + dSdxT_dot_AT_DT_q = None + + K_1_dot_dKdx_dot_q = einsum_ij_Adj_Adi_inverseK(K, dKdx_dot_q) + dKdx_dot_q = None + + vK_1_dot_dSdx = get_dST_dot_q(dS, dSii, vK_1, atmlst, gridslice) + vK_1_dot_dKdx = vK_1_dot_dSdx + vK_1_dot_dSdx = None + vK_1_dot_dDdx = get_dDT_dot_q(dD, vK_1, atmlst, gridslice, ngrids) + vK_1_dot_dKdx -= f_eps_over_4pi * cupy.einsum('ij,Adj->Adi', AS.T, vK_1_dot_dDdx) + vK_1D_dot_dAdx = get_dA_dot_q(dA, D.T @ vK_1, atmlst) + vK_1_dot_dKdx -= f_eps_over_4pi * cupy.einsum('ij,Adj->Adi', S.T, vK_1D_dot_dAdx) + vK_1DA = DA.T @ vK_1 + vK_1DA_dot_dSdx = get_dST_dot_q(dS, dSii, vK_1DA, atmlst, gridslice) + vK_1_dot_dKdx -= f_eps_over_4pi * vK_1DA_dot_dSdx + vK_1DA_dot_dSdx = None + vK_1_dot_dSdxT = get_dS_dot_q(dS, dSii, vK_1, atmlst, gridslice) + dS = None + dSii = None + vK_1_dot_dKdx -= f_eps_over_4pi * cupy.einsum('ij,Adj->Adi', DA, vK_1_dot_dSdxT) + DA = None + vK_1_ST_dot_dAdxT = get_dA_dot_q(dA, (S @ vK_1).T, atmlst) + vK_1_dot_dKdx -= f_eps_over_4pi * cupy.einsum('ij,Adj->Adi', D, vK_1_ST_dot_dAdxT) + vK_1_ST_AT = AS @ vK_1 + AS = None + vK_1_ST_AT_dot_dDdxT = get_dD_dot_q(dD, vK_1_ST_AT, atmlst, gridslice, ngrids) + vK_1_dot_dKdx -= f_eps_over_4pi * vK_1_ST_AT_dot_dDdxT + vK_1_ST_AT_dot_dDdxT = None + + d2e_from_d2KR = cupy.einsum('Adi,BDi->ABdD', vK_1_dot_dKdx, K_1_dot_dKdx_dot_q) + d2e_from_d2KR += cupy.einsum('Adi,BDi->BADd', vK_1_dot_dKdx, K_1_dot_dKdx_dot_q) + + d2F, d2A = 
get_d2F_d2A(pcmobj.surface) + vK_1_d2K_q = get_v_dot_d2A_dot_q(d2A, (D.T @ vK_1).T, S @ q) + vK_1_d2K_q += get_v_dot_d2A_dot_q(d2A, (S @ vK_1).T, D.T @ q) + vK_1_d2R_V = get_v_dot_d2A_dot_q(d2A, (D.T @ vK_1).T, v_grids) + d2A = None + d2Sii = get_d2Sii(pcmobj.surface, dF, d2F) + dF = None + d2F = None + d2D, d2S = get_d2D_d2S(pcmobj.surface, with_D=True, with_S=True) + vK_1_d2K_q += get_v_dot_d2D_dot_q(d2D, vK_1, ASq, natom, gridslice) + vK_1_d2K_q += get_v_dot_d2DT_dot_q(d2D, vK_1_ST_AT, q, natom, gridslice) + vK_1_d2R_V += get_v_dot_d2D_dot_q(d2D, vK_1, A * v_grids, natom, gridslice) + d2D = None + vK_1_d2K_q += get_v_dot_d2S_dot_q(d2S, d2Sii, vK_1DA, q, natom, gridslice) + vK_1_d2K_q += get_v_dot_d2ST_dot_q(d2S, d2Sii, vK_1, AT_DT_q, natom, gridslice) + vK_1_d2K_q += cupy.einsum('Adi,BDi->ABdD', vK_1_dot_dDdx, dAdx_dot_Sq) + vK_1_d2K_q += cupy.einsum('Adi,BDi->ABdD', vK_1_dot_dDdx * A, dSdx_dot_q) + vK_1_d2K_q += cupy.einsum('Adi,BDi->ABdD', vK_1D_dot_dAdx, dSdx_dot_q) + vK_1_d2K_q += cupy.einsum('Adi,BDi->ABdD', vK_1_dot_dSdxT, dAdxT_dot_DT_q) + vK_1_d2K_q += cupy.einsum('Adi,BDi->ABdD', vK_1_dot_dSdxT * A, dDdxT_dot_q) + vK_1_d2K_q += cupy.einsum('Adi,BDi->ABdD', vK_1_ST_dot_dAdxT, dDdxT_dot_q) + vK_1_d2K_q += cupy.einsum('Adi,BDi->BADd', vK_1_dot_dDdx, dAdx_dot_Sq) + vK_1_d2K_q += cupy.einsum('Adi,BDi->BADd', vK_1_dot_dDdx * A, dSdx_dot_q) + vK_1_d2K_q += cupy.einsum('Adi,BDi->BADd', vK_1D_dot_dAdx, dSdx_dot_q) + vK_1_d2K_q += cupy.einsum('Adi,BDi->BADd', vK_1_dot_dSdxT, dAdxT_dot_DT_q) + vK_1_d2K_q += cupy.einsum('Adi,BDi->BADd', vK_1_dot_dSdxT * A, dDdxT_dot_q) + vK_1_d2K_q += cupy.einsum('Adi,BDi->BADd', vK_1_ST_dot_dAdxT, dDdxT_dot_q) + vK_1_d2K_q *= -f_eps_over_4pi + vK_1_d2K_q += get_v_dot_d2S_dot_q(d2S, d2Sii, vK_1, q, natom, gridslice) + d2S = None + d2Sii = None + + d2e_from_d2KR -= vK_1_d2K_q + + dAdx_dot_V = get_dA_dot_q(dA, v_grids, atmlst) + dDdx_dot_AV = get_dD_dot_q(dD, A * v_grids, atmlst, gridslice, ngrids) + dRdx_dot_V = f_eps_over_2pi * (dDdx_dot_AV + cupy.einsum('ij,Adj->Adi', D, dAdx_dot_V)) + dDdx_dot_AV = None + + K_1_dot_dRdx_dot_V = einsum_ij_Adj_Adi_inverseK(K, dRdx_dot_V) + + d2e_from_d2KR -= cupy.einsum('Adi,BDi->ABdD', vK_1_dot_dKdx, K_1_dot_dRdx_dot_V) + d2e_from_d2KR -= cupy.einsum('Adi,BDi->BADd', vK_1_dot_dKdx, K_1_dot_dRdx_dot_V) + + vK_1_d2R_V += cupy.einsum('Adi,BDi->ABdD', vK_1_dot_dDdx, dAdx_dot_V) + vK_1_d2R_V += cupy.einsum('Adi,BDi->BADd', vK_1_dot_dDdx, dAdx_dot_V) + vK_1_d2R_V *= f_eps_over_2pi + + d2e_from_d2KR += vK_1_d2R_V + + dK_1Rv = -K_1_dot_dKdx_dot_q + K_1_dot_dRdx_dot_V + + VK_1D_dot_dAdx = get_dA_dot_q(dA, (D.T @ vK_1).T, atmlst) + VK_1_dot_dDdx = get_dDT_dot_q(dD, vK_1, atmlst, gridslice, ngrids) + VK_1_dot_dRdx = f_eps_over_2pi * (VK_1D_dot_dAdx + VK_1_dot_dDdx * A) + + dvK_1R = -einsum_Adi_ij_Adj_inverseK(vK_1_dot_dKdx, K) @ R + VK_1_dot_dRdx + + else: + raise RuntimeError(f"Unknown implicit solvent model: {pcmobj.method}") + + d2e = d2e_from_d2KR + + intopt_derivative = int3c1e.VHFOpt(mol) + intopt_derivative.build(cutoff = 1e-14, aosym = False) + + dVdx = get_dvgrids(pcmobj, dm, range(mol.natm), intopt_derivative) + d2e -= cupy.einsum('Adi,BDi->BADd', dvK_1R, dVdx) + d2e -= cupy.einsum('Adi,BDi->ABdD', dVdx, dK_1Rv) + + d2e *= 0.5 + d2e = d2e.get() + t1 = log.timer_debug1('solvent hessian d(V * dK-1R/dx * V)/dx contribution', *t1) + return d2e + +def get_dqsym_dx_fix_vgrids(pcmobj, atmlst): assert pcmobj._intermediates is not None gridslice = pcmobj.surface['gslice_by_atom'] @@ -161,35 +705,14 @@ def 
get_dqsym_dx_fix_vgrids(pcmobj, atmlst, inverse_K): A = pcmobj._intermediates['A'] D = pcmobj._intermediates['D'] S = pcmobj._intermediates['S'] + K = pcmobj._intermediates['K'] R = pcmobj._intermediates['R'] + q = pcmobj._intermediates['q'] q_sym = pcmobj._intermediates['q_sym'] f_epsilon = pcmobj._intermediates['f_epsilon'] ngrids = q_sym.shape[0] - def get_dS_dot_q(dS, dSii, q, atmlst, gridslice): - output = cupy.einsum('diA,i->Adi', dSii[:,:,atmlst], q) - for i_atom in atmlst: - g0,g1 = gridslice[i_atom] - output[i_atom, :, g0:g1] += cupy.einsum('dij,j->di', dS[:,g0:g1,:], q) - output[i_atom, :, :] -= cupy.einsum('dij,j->di', dS[:,:,g0:g1], q[g0:g1]) - return output - def get_dST_dot_q(dS, dSii, q, atmlst, gridslice): - return get_dS_dot_q(-dS.transpose(0,2,1), dSii, q, atmlst, gridslice) - - def get_dA_dot_q(dA, q, atmlst, gridslice): - return cupy.einsum('diA,i->Adi', dA[:,:,atmlst], q) - - def get_dD_dot_q(dD, q, atmlst, gridslice): - output = cupy.zeros([len(atmlst), 3, ngrids]) - for i_atom in atmlst: - g0,g1 = gridslice[i_atom] - output[i_atom, :, g0:g1] += cupy.einsum('dij,j->di', dD[:,g0:g1,:], q) - output[i_atom, :, :] -= cupy.einsum('dij,j->di', dD[:,:,g0:g1], q[g0:g1]) - return output - def get_dDT_dot_q(dD, q, atmlst, gridslice): - return get_dD_dot_q(-dD.transpose(0,2,1), q, atmlst, gridslice) - if pcmobj.method.upper() in ['C-PCM', 'CPCM', 'COSMO']: _, dS = get_dD_dS(pcmobj.surface, with_D=False, with_S=True) dF, _ = get_dF_dA(pcmobj.surface) @@ -199,7 +722,7 @@ def get_dDT_dot_q(dD, q, atmlst, gridslice): # dR = 0, dK = dS dSdx_dot_q = get_dS_dot_q(dS, dSii, q_sym, atmlst, gridslice) - dqdx_fix_Vq = cupy.einsum('ij,Adj->Adi', inverse_K, dSdx_dot_q) + dqdx_fix_Vq = einsum_ij_Adj_Adi_inverseK(K, dSdx_dot_q) elif pcmobj.method.upper() in ['IEF-PCM', 'IEFPCM', 'SMD']: dF, dA = get_dF_dA(pcmobj.surface) @@ -212,33 +735,32 @@ def get_dDT_dot_q(dD, q, atmlst, gridslice): # dK = dS - f_eps/(2*pi) * (dD*A*S + D*dA*S + D*A*dS) f_eps_over_2pi = f_epsilon/(2.0*PI) - q = inverse_K @ R @ v_grids dSdx_dot_q = get_dS_dot_q(dS, dSii, q, atmlst, gridslice) DA = D*A dKdx_dot_q = dSdx_dot_q - f_eps_over_2pi * cupy.einsum('ij,Adj->Adi', DA, dSdx_dot_q) - dAdx_dot_Sq = get_dA_dot_q(dA, S @ q, atmlst, gridslice) + dAdx_dot_Sq = get_dA_dot_q(dA, S @ q, atmlst) dKdx_dot_q -= f_eps_over_2pi * cupy.einsum('ij,Adj->Adi', D, dAdx_dot_Sq) AS = (A * S.T).T # It's just diag(A) @ S - dDdx_dot_ASq = get_dD_dot_q(dD, AS @ q, atmlst, gridslice) + dDdx_dot_ASq = get_dD_dot_q(dD, AS @ q, atmlst, gridslice, ngrids) dKdx_dot_q -= f_eps_over_2pi * dDdx_dot_ASq - dqdx_fix_Vq = -cupy.einsum('ij,Adj->Adi', inverse_K, dKdx_dot_q) + dqdx_fix_Vq = -einsum_ij_Adj_Adi_inverseK(K, dKdx_dot_q) - dAdx_dot_V = get_dA_dot_q(dA, v_grids, atmlst, gridslice) + dAdx_dot_V = get_dA_dot_q(dA, v_grids, atmlst) - dDdx_dot_AV = get_dD_dot_q(dD, A * v_grids, atmlst, gridslice) + dDdx_dot_AV = get_dD_dot_q(dD, A * v_grids, atmlst, gridslice, ngrids) dRdx_dot_V = f_eps_over_2pi * (dDdx_dot_AV + cupy.einsum('ij,Adj->Adi', D, dAdx_dot_V)) - dqdx_fix_Vq += cupy.einsum('ij,Adj->Adi', inverse_K, dRdx_dot_V) + dqdx_fix_Vq += einsum_ij_Adj_Adi_inverseK(K, dRdx_dot_V) - invKT_V = inverse_K.T @ v_grids - dDdxT_dot_invKT_V = get_dDT_dot_q(dD, invKT_V, atmlst, gridslice) + invKT_V = cupy.linalg.solve(K.T, v_grids) + dDdxT_dot_invKT_V = get_dDT_dot_q(dD, invKT_V, atmlst, gridslice, ngrids) DT_invKT_V = D.T @ invKT_V - dAdxT_dot_DT_invKT_V = get_dA_dot_q(dA, DT_invKT_V, atmlst, gridslice) + dAdxT_dot_DT_invKT_V = get_dA_dot_q(dA, DT_invKT_V, 
atmlst) dqdx_fix_Vq += f_eps_over_2pi * (cupy.einsum('i,Adi->Adi', A, dDdxT_dot_invKT_V) + dAdxT_dot_DT_invKT_V) dSdxT_dot_invKT_V = get_dST_dot_q(dS, dSii, invKT_V, atmlst, gridslice) @@ -249,8 +771,9 @@ def get_dDT_dot_q(dD, q, atmlst, gridslice): dSdxT_dot_AT_DT_invKT_V = get_dST_dot_q(dS, dSii, DA.T @ invKT_V, atmlst, gridslice) dKdxT_dot_invKT_V -= f_eps_over_2pi * dSdxT_dot_AT_DT_invKT_V + invKT_dKdxT_dot_invKT_V = einsum_ij_Adj_Adi_inverseK(K.T, dKdxT_dot_invKT_V) - dqdx_fix_Vq += -cupy.einsum('ij,Adj->Adi', R.T @ inverse_K.T, dKdxT_dot_invKT_V) + dqdx_fix_Vq += -cupy.einsum('ij,Adj->Adi', R.T, invKT_dKdxT_dot_invKT_V) dqdx_fix_Vq *= -0.5 @@ -269,17 +792,17 @@ def dK_dot_q(q): DA = D*A dKdx_dot_q = dSdx_dot_q - f_eps_over_4pi * cupy.einsum('ij,Adj->Adi', DA, dSdx_dot_q) - dAdx_dot_Sq = get_dA_dot_q(dA, S @ q, atmlst, gridslice) + dAdx_dot_Sq = get_dA_dot_q(dA, S @ q, atmlst) dKdx_dot_q -= f_eps_over_4pi * cupy.einsum('ij,Adj->Adi', D, dAdx_dot_Sq) AS = (A * S.T).T # It's just diag(A) @ S - dDdx_dot_ASq = get_dD_dot_q(dD, AS @ q, atmlst, gridslice) + dDdx_dot_ASq = get_dD_dot_q(dD, AS @ q, atmlst, gridslice, ngrids) dKdx_dot_q -= f_eps_over_4pi * dDdx_dot_ASq - dDdxT_dot_q = get_dDT_dot_q(dD, q, atmlst, gridslice) + dDdxT_dot_q = get_dDT_dot_q(dD, q, atmlst, gridslice, ngrids) dKdx_dot_q -= f_eps_over_4pi * cupy.einsum('ij,Adj->Adi', AS.T, dDdxT_dot_q) - dAdxT_dot_DT_q = get_dA_dot_q(dA, D.T @ q, atmlst, gridslice) + dAdxT_dot_DT_q = get_dA_dot_q(dA, D.T @ q, atmlst) dKdx_dot_q -= f_eps_over_4pi * cupy.einsum('ij,Adj->Adi', S.T, dAdxT_dot_DT_q) dSdxT_dot_AT_DT_q = get_dST_dot_q(dS, dSii, DA.T @ q, atmlst, gridslice) @@ -289,26 +812,27 @@ def dK_dot_q(q): f_eps_over_2pi = f_epsilon/(2.0*PI) - q = inverse_K @ R @ v_grids dKdx_dot_q = dK_dot_q(q) - dqdx_fix_Vq = -cupy.einsum('ij,Adj->Adi', inverse_K, dKdx_dot_q) + dqdx_fix_Vq = -einsum_ij_Adj_Adi_inverseK(K, dKdx_dot_q) - dAdx_dot_V = get_dA_dot_q(dA, v_grids, atmlst, gridslice) + dAdx_dot_V = get_dA_dot_q(dA, v_grids, atmlst) - dDdx_dot_AV = get_dD_dot_q(dD, A * v_grids, atmlst, gridslice) + dDdx_dot_AV = get_dD_dot_q(dD, A * v_grids, atmlst, gridslice, ngrids) dRdx_dot_V = f_eps_over_2pi * (dDdx_dot_AV + cupy.einsum('ij,Adj->Adi', D, dAdx_dot_V)) - dqdx_fix_Vq += cupy.einsum('ij,Adj->Adi', inverse_K, dRdx_dot_V) + dqdx_fix_Vq += einsum_ij_Adj_Adi_inverseK(K, dRdx_dot_V) - invKT_V = inverse_K.T @ v_grids - dDdxT_dot_invKT_V = get_dDT_dot_q(dD, invKT_V, atmlst, gridslice) + invKT_V = cupy.linalg.solve(K.T, v_grids) + dDdxT_dot_invKT_V = get_dDT_dot_q(dD, invKT_V, atmlst, gridslice, ngrids) DT_invKT_V = D.T @ invKT_V - dAdxT_dot_DT_invKT_V = get_dA_dot_q(dA, DT_invKT_V, atmlst, gridslice) + dAdxT_dot_DT_invKT_V = get_dA_dot_q(dA, DT_invKT_V, atmlst) dqdx_fix_Vq += f_eps_over_2pi * (cupy.einsum('i,Adi->Adi', A, dDdxT_dot_invKT_V) + dAdxT_dot_DT_invKT_V) dKdx_dot_invKT_V = dK_dot_q(invKT_V) - dqdx_fix_Vq += -cupy.einsum('ij,Adj->Adi', R.T @ inverse_K.T, dKdx_dot_invKT_V) + invKT_dKdx_dot_invKT_V = einsum_ij_Adj_Adi_inverseK(K.T, dKdx_dot_invKT_V) + + dqdx_fix_Vq += -cupy.einsum('ij,Adj->Adi', R.T, invKT_dKdx_dot_invKT_V) dqdx_fix_Vq *= -0.5 @@ -317,14 +841,13 @@ def dK_dot_q(q): return dqdx_fix_Vq -def get_dqsym_dx_fix_K_R(pcmobj, dm, atmlst, inverse_K, intopt_derivative): +def get_dvgrids(pcmobj, dm, atmlst, intopt_derivative): assert pcmobj._intermediates is not None mol = pcmobj.mol gridslice = pcmobj.surface['gslice_by_atom'] charge_exp = pcmobj.surface['charge_exp'] grid_coords = pcmobj.surface['grid_coords'] - R = 
pcmobj._intermediates['R'] atom_coords = mol.atom_coords(unit='B') atom_charges = numpy.asarray(mol.atom_charges(), dtype=numpy.float64) @@ -351,17 +874,24 @@ def get_dqsym_dx_fix_K_R(pcmobj, dm, atmlst, inverse_K, intopt_derivative): g0,g1 = gridslice[i_atom] dV_on_charge_dx[i_atom,:,g0:g1] -= dIdC[:,g0:g1] - KR_symmetrized = 0.5 * (inverse_K @ R + R.T @ inverse_K.T) - dqdx_fix_K_R = cupy.einsum('ij,Adj->Adi', KR_symmetrized, dV_on_charge_dx) + return dV_on_charge_dx + +def get_dqsym_dx_fix_K_R(pcmobj, dm, atmlst, intopt_derivative): + dV_on_charge_dx = get_dvgrids(pcmobj, dm, atmlst, intopt_derivative) + K = pcmobj._intermediates['K'] + R = pcmobj._intermediates['R'] + R_dVdx = cupy.einsum('ij,Adj->Adi', R, dV_on_charge_dx) + K_1_R_dVdx = einsum_ij_Adj_Adi_inverseK(K, R_dVdx) + K_1T_dVdx = einsum_ij_Adj_Adi_inverseK(K.T, dV_on_charge_dx) + RT_K_1T_dVdx = cupy.einsum('ij,Adj->Adi', R.T, K_1T_dVdx) + dqdx_fix_K_R = 0.5 * (K_1_R_dVdx + RT_K_1T_dVdx) return dqdx_fix_K_R def get_dqsym_dx(pcmobj, dm, atmlst, intopt_derivative): - K = pcmobj._intermediates['K'] - inverse_K = cupy.linalg.inv(K) - return get_dqsym_dx_fix_vgrids(pcmobj, atmlst, inverse_K) + get_dqsym_dx_fix_K_R(pcmobj, dm, atmlst, inverse_K, intopt_derivative) + return get_dqsym_dx_fix_vgrids(pcmobj, atmlst) + get_dqsym_dx_fix_K_R(pcmobj, dm, atmlst, intopt_derivative) -def analytic_grad_vmat(pcmobj, dm, mo_coeff, mo_occ, atmlst=None, verbose=None): +def analytical_grad_vmat(pcmobj, dm, mo_coeff, mo_occ, atmlst=None, verbose=None): ''' dv_solv / da ''' @@ -470,8 +1000,9 @@ def kernel(self, *args, dm=None, atmlst=None, **kwargs): dm = dm[0] + dm[1] is_equilibrium = self.base.with_solvent.equilibrium_solvation self.base.with_solvent.equilibrium_solvation = True - self.de_solvent = hess_elec(self.base.with_solvent, dm, verbose=self.verbose) - #self.de_solvent+= hess_nuc(self.base.with_solvent) + self.de_solvent = analytical_hess_nuc(self.base.with_solvent, dm, verbose=self.verbose) + self.de_solvent += analytical_hess_qv(self.base.with_solvent, dm, verbose=self.verbose) + self.de_solvent += analytical_hess_solver(self.base.with_solvent, dm, verbose=self.verbose) self.de_solute = super().kernel(*args, **kwargs) self.de = self.de_solute + self.de_solvent self.base.with_solvent.equilibrium_solvation = is_equilibrium @@ -483,7 +1014,7 @@ def make_h1(self, mo_coeff, mo_occ, chkfile=None, atmlst=None, verbose=None): h1ao = super().make_h1(mo_coeff, mo_occ, atmlst=atmlst, verbose=verbose) if isinstance(self.base, scf.hf.RHF): dm = self.base.make_rdm1(ao_repr=True) - dv = analytic_grad_vmat(self.base.with_solvent, dm, mo_coeff, mo_occ, atmlst=atmlst, verbose=verbose) + dv = analytical_grad_vmat(self.base.with_solvent, dm, mo_coeff, mo_occ, atmlst=atmlst, verbose=verbose) for i0, ia in enumerate(atmlst): h1ao[i0] += dv[i0] return h1ao @@ -492,15 +1023,15 @@ def make_h1(self, mo_coeff, mo_occ, chkfile=None, atmlst=None, verbose=None): solvent = self.base.with_solvent dm = self.base.make_rdm1(ao_repr=True) dm = dm[0] + dm[1] - dva = analytic_grad_vmat(solvent, dm, mo_coeff[0], mo_occ[0], atmlst=atmlst, verbose=verbose) - dvb = analytic_grad_vmat(solvent, dm, mo_coeff[1], mo_occ[1], atmlst=atmlst, verbose=verbose) + dva = analytical_grad_vmat(solvent, dm, mo_coeff[0], mo_occ[0], atmlst=atmlst, verbose=verbose) + dvb = analytical_grad_vmat(solvent, dm, mo_coeff[1], mo_occ[1], atmlst=atmlst, verbose=verbose) for i0, ia in enumerate(atmlst): h1aoa[i0] += dva[i0] h1aob[i0] += dvb[i0] return h1aoa, h1aob else: raise NotImplementedError('Base object 
is not supported') - + def get_veff_resp_mo(self, mol, dms, mo_coeff, mo_occ, hermi=1): v1vo = super().get_veff_resp_mo(mol, dms, mo_coeff, mo_occ, hermi=hermi) if not self.base.with_solvent.equilibrium_solvation: @@ -523,7 +1054,7 @@ def get_veff_resp_mo(self, mol, dms, mo_coeff, mo_occ, hermi=1): else: raise NotImplementedError('Base object is not supported') return v1vo - + def _finalize(self): # disable _finalize. It is called in grad_method.kernel method # where self.de was not yet initialized. diff --git a/gpu4pyscf/solvent/hessian/smd.py b/gpu4pyscf/solvent/hessian/smd.py index 49897d74..dafaa573 100644 --- a/gpu4pyscf/solvent/hessian/smd.py +++ b/gpu4pyscf/solvent/hessian/smd.py @@ -22,8 +22,6 @@ from gpu4pyscf import scf from gpu4pyscf.lib import logger from gpu4pyscf.solvent import smd -from gpu4pyscf.solvent.grad import smd as smd_grad -from gpu4pyscf.solvent.grad import pcm as pcm_grad from gpu4pyscf.solvent.hessian import pcm as pcm_hess from gpu4pyscf.hessian.jk import _ao2mo @@ -60,45 +58,6 @@ def smd_grad_scanner(mol): t1 = log.timer_debug1('solvent energy', *t1) return hess_cds # hartree - -def hess_elec(smdobj, dm, verbose=None): - ''' - slow version with finite difference - TODO: use analytical hess_nuc - ''' - log = logger.new_logger(smdobj, verbose) - t1 = log.init_timer() - pmol = smdobj.mol.copy() - mol = pmol.copy() - coords = mol.atom_coords(unit='Bohr') - - def pcm_grad_scanner(mol): - # TODO: use more analytical forms - smdobj.reset(mol) - e, v = smdobj._get_vind(dm) - #return grad_elec(smdobj, dm) - grad = pcm_grad.grad_nuc(smdobj, dm) - grad+= smd_grad.grad_solver(smdobj, dm) - grad+= pcm_grad.grad_qv(smdobj, dm) - return grad - - mol.verbose = 0 - de = np.zeros([mol.natm, mol.natm, 3, 3]) - eps = 1e-3 - for ia in range(mol.natm): - for ix in range(3): - dv = np.zeros_like(coords) - dv[ia,ix] = eps - mol.set_geom_(coords + dv, unit='Bohr') - g0 = pcm_grad_scanner(mol) - - mol.set_geom_(coords - dv, unit='Bohr') - g1 = pcm_grad_scanner(mol) - de[ia,:,ix] = (g0 - g1)/2.0/eps - t1 = log.timer_debug1('solvent energy', *t1) - smdobj.reset(pmol) - return de - def make_hess_object(hess_method): '''For hess_method in vacuum, add nuclear Hessian of solvent smdobj''' if hess_method.base.with_solvent.frozen: @@ -140,8 +99,9 @@ def kernel(self, *args, dm=None, atmlst=None, **kwargs): dm = dm[0] + dm[1] is_equilibrium = self.base.with_solvent.equilibrium_solvation self.base.with_solvent.equilibrium_solvation = True - self.de_solvent = pcm_hess.hess_elec(self.base.with_solvent, dm, verbose=self.verbose) - #self.de_solvent+= hess_nuc(self.base.with_solvent) + self.de_solvent = pcm_hess.analytical_hess_nuc(self.base.with_solvent, dm, verbose=self.verbose) + self.de_solvent += pcm_hess.analytical_hess_qv(self.base.with_solvent, dm, verbose=self.verbose) + self.de_solvent += pcm_hess.analytical_hess_solver(self.base.with_solvent, dm, verbose=self.verbose) self.de_solute = super().kernel(*args, **kwargs) self.de_cds = get_cds(self.base.with_solvent) self.de = self.de_solute + self.de_solvent + self.de_cds @@ -154,7 +114,7 @@ def make_h1(self, mo_coeff, mo_occ, chkfile=None, atmlst=None, verbose=None): h1ao = super().make_h1(mo_coeff, mo_occ, atmlst=atmlst, verbose=verbose) if isinstance(self.base, scf.hf.RHF): dm = self.base.make_rdm1(ao_repr=True) - dv = pcm_hess.analytic_grad_vmat(self.base.with_solvent, dm, mo_coeff, mo_occ, atmlst=atmlst, verbose=verbose) + dv = pcm_hess.analytical_grad_vmat(self.base.with_solvent, dm, mo_coeff, mo_occ, atmlst=atmlst, verbose=verbose) for i0, 
ia in enumerate(atmlst):
             h1ao[i0] += dv[i0]
         return h1ao
@@ -163,8 +123,8 @@ def make_h1(self, mo_coeff, mo_occ, chkfile=None, atmlst=None, verbose=None):
         solvent = self.base.with_solvent
         dm = self.base.make_rdm1(ao_repr=True)
         dm = dm[0] + dm[1]
-        dva = pcm_hess.analytic_grad_vmat(solvent, dm, mo_coeff[0], mo_occ[0], atmlst=atmlst, verbose=verbose)
-        dvb = pcm_hess.analytic_grad_vmat(solvent, dm, mo_coeff[1], mo_occ[1], atmlst=atmlst, verbose=verbose)
+        dva = pcm_hess.analytical_grad_vmat(solvent, dm, mo_coeff[0], mo_occ[0], atmlst=atmlst, verbose=verbose)
+        dvb = pcm_hess.analytical_grad_vmat(solvent, dm, mo_coeff[1], mo_occ[1], atmlst=atmlst, verbose=verbose)
         for i0, ia in enumerate(atmlst):
             h1aoa[i0] += dva[i0]
             h1aob[i0] += dvb[i0]
diff --git a/gpu4pyscf/solvent/tests/test_pcm_grad.py b/gpu4pyscf/solvent/tests/test_pcm_grad.py
index f141ae56..c17e05f3 100644
--- a/gpu4pyscf/solvent/tests/test_pcm_grad.py
+++ b/gpu4pyscf/solvent/tests/test_pcm_grad.py
@@ -36,6 +36,7 @@ def setUpModule():
     mol.basis = 'sto3g'
     mol.output = '/dev/null'
     mol.build(verbose=0)
+    # Warning: This system has all orbitals filled, which is FAR from physical
     mol.nelectron = mol.nao * 2
     epsilon = 35.9
     lebedev_order = 3
@@ -169,11 +170,14 @@ def test_grad_IEFPCM(self):

     def test_grad_SSVPE(self):
         grad = _grad_with_solvent('SS(V)PE')
-        g0 = numpy.asarray(
-            [[ 3.42479745e-15, -1.00280742e-16, -1.61117735e+00],
-            [ 1.07135985e+00, -6.97375148e-16,  8.05588676e-01],
-            [-1.07135985e+00,  7.91425487e-16,  8.05588676e-01]]
-        )
+        # Note: This reference value was obtained via finite difference with dx = 1e-5.
+        # QChem 6.1 has a bug in its SS(V)PE gradient: it uses the IEF-PCM gradient
+        # algorithm to compute the SS(V)PE gradient, which is wrong.
+        g0 = numpy.asarray([
+            [ 0.00000000e+00, -7.10542736e-10, -1.63195623e+00],
+            [ 1.07705138e+00,  2.13162821e-09,  8.15978117e-01],
+            [-1.07705138e+00, -2.13162821e-09,  8.15978116e-01],
+        ])
         print(f"Gradient error in RHF with SS(V)PE: {numpy.linalg.norm(g0 - grad)}")
         assert numpy.linalg.norm(g0 - grad) < 1e-6
diff --git a/gpu4pyscf/solvent/tests/test_pcm_hessian.py b/gpu4pyscf/solvent/tests/test_pcm_hessian.py
index c7076f29..6e19ec96 100644
--- a/gpu4pyscf/solvent/tests/test_pcm_hessian.py
+++ b/gpu4pyscf/solvent/tests/test_pcm_hessian.py
@@ -21,7 +21,7 @@
 from gpu4pyscf.solvent import pcm
 from gpu4pyscf import scf, dft
 from packaging import version
-from gpu4pyscf.solvent.hessian.pcm import analytic_grad_vmat
+from gpu4pyscf.solvent.hessian.pcm import analytical_grad_vmat, analytical_hess_nuc, analytical_hess_solver, analytical_hess_qv
 from gpu4pyscf.lib.cupy_helper import contract

 pyscf_25 = version.parse(pyscf.__version__) <= version.parse('2.5.0')
@@ -130,6 +130,37 @@ def pcm_vmat_scanner(mol):
     pcmobj.reset(pmol)
     return vmat

+def _fd_hess_contribution(pcmobj, dm, gradient_function):
+    pmol = pcmobj.mol.copy()
+    mol = pmol.copy()
+    coords = mol.atom_coords(unit='Bohr')
+
+    def pcm_grad_scanner(mol):
+        pcmobj.reset(mol)
+        e, v = pcmobj._get_vind(dm)
+        pcm_grad = gradient_function(pcmobj, dm)
+        # pcm_grad = grad_nuc(pcmobj, dm)
+        # pcm_grad+= grad_solver(pcmobj, dm)
+        # pcm_grad+= grad_qv(pcmobj, dm)
+        return pcm_grad
+
+    mol.verbose = 0
+    de = np.zeros([mol.natm, mol.natm, 3, 3])
+    eps = 1e-5
+    for ia in range(mol.natm):
+        for ix in range(3):
+            dv = np.zeros_like(coords)
+            dv[ia,ix] = eps
+            mol.set_geom_(coords + dv, unit='Bohr')
+            g0 = pcm_grad_scanner(mol)
+
+            mol.set_geom_(coords - dv, unit='Bohr')
+            g1 = pcm_grad_scanner(mol)
+
+            de[ia,:,ix,:] = (g0 - g1)/2.0/eps
+    pcmobj.reset(pmol)
+    return de
+
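+# _fd_hess_contribution above builds a reference Hessian block from an analytical
+# gradient by central differences: d2E/(dX_B dX_A) ~= [g_B(X_A+eps) - g_B(X_A-eps)] / (2*eps),
+# with eps = 1e-5 Bohr. The same stencil on a toy 1D function (a hypothetical
+# illustration, not used by the tests):
+#     g = lambda x: 3.0*x**2      # analytical gradient of f(x) = x**3
+#     assert abs((g(1.0 + 1e-5) - g(1.0 - 1e-5))/(2*1e-5) - 6.0) < 1e-6   # f''(1) = 6
+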
@unittest.skipIf(pcm.libsolvent is None, "solvent extension not compiled") class KnownValues(unittest.TestCase): def test_df_hess_cpcm(self): @@ -192,7 +223,7 @@ def test_grad_vmat_cpcm(self): mo_coeff = mf.mo_coeff mo_occ = mf.mo_occ - test_grad_vmat = analytic_grad_vmat(hobj.base.with_solvent, dm, mo_coeff, mo_occ) + test_grad_vmat = analytical_grad_vmat(hobj.base.with_solvent, dm, mo_coeff, mo_occ) ref_grad_vmat = _fd_grad_vmat(hobj.base.with_solvent, dm, mo_coeff, mo_occ) cp.testing.assert_allclose(ref_grad_vmat, test_grad_vmat, atol = 1e-10) @@ -206,7 +237,7 @@ def test_grad_vmat_iefpcm(self): mo_coeff = mf.mo_coeff mo_occ = mf.mo_occ - test_grad_vmat = analytic_grad_vmat(hobj.base.with_solvent, dm, mo_coeff, mo_occ) + test_grad_vmat = analytical_grad_vmat(hobj.base.with_solvent, dm, mo_coeff, mo_occ) ref_grad_vmat = _fd_grad_vmat(hobj.base.with_solvent, dm, mo_coeff, mo_occ) cp.testing.assert_allclose(ref_grad_vmat, test_grad_vmat, atol = 1e-10) @@ -220,11 +251,71 @@ def test_grad_vmat_ssvpe(self): mo_coeff = mf.mo_coeff mo_occ = mf.mo_occ - test_grad_vmat = analytic_grad_vmat(hobj.base.with_solvent, dm, mo_coeff, mo_occ) + test_grad_vmat = analytical_grad_vmat(hobj.base.with_solvent, dm, mo_coeff, mo_occ) ref_grad_vmat = _fd_grad_vmat(hobj.base.with_solvent, dm, mo_coeff, mo_occ) cp.testing.assert_allclose(ref_grad_vmat, test_grad_vmat, atol = 1e-10) + def test_hess_nuc_iefpcm(self): + print("testing IEF-PCM d2E_nuc/dx2") + mf = _make_mf(method='IEF-PCM') + hobj = mf.Hessian() + dm = mf.make_rdm1() + + test_grad_vmat = analytical_hess_nuc(hobj.base.with_solvent, dm) + from gpu4pyscf.solvent.grad.pcm import grad_nuc + ref_grad_vmat = _fd_hess_contribution(hobj.base.with_solvent, dm, grad_nuc) + + cp.testing.assert_allclose(ref_grad_vmat, test_grad_vmat, atol = 1e-10) + + def test_hess_qv_iefpcm(self): + print("testing IEF-PCM d2E_elec/dx2") + mf = _make_mf(method='IEF-PCM') + hobj = mf.Hessian() + dm = mf.make_rdm1() + + test_grad_vmat = analytical_hess_qv(hobj.base.with_solvent, dm) + from gpu4pyscf.solvent.grad.pcm import grad_qv + ref_grad_vmat = _fd_hess_contribution(hobj.base.with_solvent, dm, grad_qv) + + cp.testing.assert_allclose(ref_grad_vmat, test_grad_vmat, atol = 1e-10) + + def test_hess_solver_cpcm(self): + print("testing C-PCM d2E_KR/dx2") + mf = _make_mf(method='C-PCM') + hobj = mf.Hessian() + dm = mf.make_rdm1() + + test_grad_vmat = analytical_hess_solver(hobj.base.with_solvent, dm) + from gpu4pyscf.solvent.grad.pcm import grad_solver + ref_grad_vmat = _fd_hess_contribution(hobj.base.with_solvent, dm, grad_solver) + + cp.testing.assert_allclose(ref_grad_vmat, test_grad_vmat, atol = 1e-10) + + def test_hess_solver_iefpcm(self): + print("testing IEF-PCM d2E_KR/dx2") + mf = _make_mf(method='IEF-PCM') + hobj = mf.Hessian() + dm = mf.make_rdm1() + + test_grad_vmat = analytical_hess_solver(hobj.base.with_solvent, dm) + from gpu4pyscf.solvent.grad.pcm import grad_solver + ref_grad_vmat = _fd_hess_contribution(hobj.base.with_solvent, dm, grad_solver) + + cp.testing.assert_allclose(ref_grad_vmat, test_grad_vmat, atol = 1e-10) + + def test_hess_solver_ssvpe(self): + print("testing SS(V)PE d2E_KR/dx2") + mf = _make_mf(method='SS(V)PE') + hobj = mf.Hessian() + dm = mf.make_rdm1() + + test_grad_vmat = analytical_hess_solver(hobj.base.with_solvent, dm) + from gpu4pyscf.solvent.grad.pcm import grad_solver + ref_grad_vmat = _fd_hess_contribution(hobj.base.with_solvent, dm, grad_solver) + + cp.testing.assert_allclose(ref_grad_vmat, test_grad_vmat, atol = 1e-10) + 
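+    # The three contributions verified above (nuc, qv, solver) are exactly what
+    # kernel() sums into de_solvent when a full PCM Hessian is requested; a rough
+    # usage sketch (reusing this file's _make_mf helper):
+    #     mf = _make_mf(method='IEF-PCM')
+    #     hess = mf.Hessian().kernel()
+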
@pytest.mark.skipif(pyscf_25, reason='requires pyscf 2.6 or higher') def test_to_gpu(self): import pyscf