From 9b694fa0419525799194df14092526d3986e8375 Mon Sep 17 00:00:00 2001 From: henryw7 <60555480+henryw7@users.noreply.github.com> Date: Wed, 8 Jan 2025 23:09:54 -0800 Subject: [PATCH] Analytical PCM righthand side of CPHF, analytic derivative of V_ia(q) (#298) * C-PCM righthand side of CPHF, analytic derivative of V_munu(q), trash implementation that works * dVia/dx works for IEFPCM * SSVPE is different from ICEPCM without contraction * Small mistake * Remove ngrids * nao * nao memory bottleneck of pcm V_munu matrix derivative * Fix linter error * Remove natm * 3 * nao * nao memory bottleneck, now the bottleneck is natm * 3 * nmo * nocc. Also bug fix here and there * Remove the numerical implementation of dVia/dx * Change finite difference implementation to test * Remove abuse use of cp.einsum and cp.zeros * Reorganize code for 2nd derivative term --- gpu4pyscf/gto/int3c1e_ip.py | 191 +++++++++++- gpu4pyscf/gto/moleintor.py | 58 ---- gpu4pyscf/gto/tests/test_int1e_grids_ip.py | 71 ++++- gpu4pyscf/lib/gint/g1e_ip_root_1.cu | 176 +++++++++++ gpu4pyscf/lib/gint/g3c1e_ip.cu | 215 +++++++++++++ gpu4pyscf/lib/gint/nr_fill_ao_int3c1e_ip.cu | 209 ++++++++++++- gpu4pyscf/qmmm/pbc/itrf.py | 2 +- gpu4pyscf/solvent/hessian/pcm.py | 327 ++++++++++++++++---- gpu4pyscf/solvent/hessian/smd.py | 6 +- gpu4pyscf/solvent/tests/test_pcm_hessian.py | 88 +++++- 10 files changed, 1197 insertions(+), 146 deletions(-) delete mode 100644 gpu4pyscf/gto/moleintor.py diff --git a/gpu4pyscf/gto/int3c1e_ip.py b/gpu4pyscf/gto/int3c1e_ip.py index c2aeb1be..8b47adce 100644 --- a/gpu4pyscf/gto/int3c1e_ip.py +++ b/gpu4pyscf/gto/int3c1e_ip.py @@ -105,7 +105,7 @@ def get_int3c1e_ip(mol, grids, charge_exponents, intopt): int3c_angular_slice = cart2sph(int3c_angular_slice, axis=2, ang=lj) int3c_angular_slice = cart2sph(int3c_angular_slice, axis=3, ang=li) - int3c_grid_slice[:, :, j0:j1, i0:i1] = int3c_angular_slice + int3c_grid_slice[:, :, i0:i1, j0:j1] = int3c_angular_slice.transpose(0,1,3,2) ao_idx = np.argsort(intopt._ao_idx) grid_idx = np.arange(p1-p0) @@ -193,10 +193,69 @@ def get_int3c1e_ip1_charge_contracted(mol, grids, charge_exponents, charges, int int1e_angular_slice = cart2sph(int1e_angular_slice, axis=1, ang=lj) int1e_angular_slice = cart2sph(int1e_angular_slice, axis=2, ang=li) - int1e_charge_contracted[:, j0:j1, i0:i1] = int1e_angular_slice + int1e_charge_contracted[:, i0:i1, j0:j1] = int1e_angular_slice.transpose(0,2,1) return intopt.unsort_orbitals(int1e_charge_contracted, axis=[1,2]) +def get_int3c1e_ip1_density_contracted(mol, grids, charge_exponents, dm, intopt): + omega = mol.omega + assert omega >= 0.0, "Short-range one electron integrals with GPU acceleration is not implemented." 
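+    # Contract the bra-derivative integrals (d/dA mu | 1/|r-C| | nu) with the density
+    # matrix inside the CUDA kernel, accumulating per atom of the bra shell, so the
+    # result has shape (natm, 3, ngrids) and no natm * 3 * nao * nao intermediate is formed.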
+ + ngrids = grids.shape[0] + grids = cp.asarray(grids, order='C') + if charge_exponents is not None: + charge_exponents = cp.asarray(charge_exponents, order='C') + + dm = cp.asarray(dm) + assert dm.ndim == 2 + assert dm.shape[0] == dm.shape[1] and dm.shape[0] == mol.nao + + dm = intopt.sort_orbitals(dm, [0,1]) + if not mol.cart: + cart2sph_transformation_matrix = intopt.cart2sph + # TODO: This part is inefficient (O(N^3)), should be changed to the O(N^2) algorithm + dm = cart2sph_transformation_matrix @ dm @ cart2sph_transformation_matrix.T + dm = dm.flatten(order='F') # Column major order matches (i + j * n_ao) access pattern in the C function + + nao = intopt._sorted_mol.nao + + i_atom_of_each_shell = intopt._sorted_mol._bas[:, ATOM_OF] + i_atom_of_each_shell = cp.array(i_atom_of_each_shell, dtype=np.int32) + + ip1_per_atom = cp.zeros([mol.natm, 3, ngrids]) + + for cp_ij_id, _ in enumerate(intopt.log_qs): + stream = cp.cuda.get_current_stream() + + log_q_ij = intopt.log_qs[cp_ij_id] + + nbins = 1 + bins_locs_ij = np.array([0, len(log_q_ij)], dtype=np.int32) + + charge_exponents_pointer = c_null_ptr() + if charge_exponents is not None: + charge_exponents_pointer = charge_exponents.data.ptr + + err = libgint.GINTfill_int3c1e_ip1_density_contracted( + ctypes.cast(stream.ptr, ctypes.c_void_p), + intopt.bpcache, + ctypes.cast(grids.data.ptr, ctypes.c_void_p), + ctypes.cast(charge_exponents_pointer, ctypes.c_void_p), + ctypes.c_int(ngrids), + ctypes.cast(ip1_per_atom.data.ptr, ctypes.c_void_p), + bins_locs_ij.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(nbins), + ctypes.c_int(cp_ij_id), + ctypes.cast(dm.data.ptr, ctypes.c_void_p), + ctypes.cast(i_atom_of_each_shell.data.ptr, ctypes.c_void_p), + ctypes.c_int(nao), + ctypes.c_double(omega)) + + if err != 0: + raise RuntimeError('GINTfill_int3c1e_charge_contracted failed') + + return ip1_per_atom + def get_int3c1e_ip2_density_contracted(mol, grids, charge_exponents, dm, intopt): omega = mol.omega assert omega >= 0.0, "Short-range one electron integrals with GPU acceleration is not implemented." @@ -290,6 +349,82 @@ def get_int3c1e_ip2_density_contracted(mol, grids, charge_exponents, dm, intopt) return int3c_density_contracted +def get_int3c1e_ip2_charge_contracted(mol, grids, charge_exponents, charges, gridslice, output, intopt): + omega = mol.omega + assert omega >= 0.0, "Short-range one electron integrals with GPU acceleration is not implemented." 
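+    # Contract the operator-derivative integrals (mu | d/dC 1/|r-C| | nu) with the grid
+    # charges q_C, accumulating each grid point into the atom it belongs to (given by
+    # gridslice), so `output` of shape (natm, 3, nao, nao) is filled in place.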
+ + ngrids = grids.shape[0] + grids = cp.asarray(grids, order='C') + if charge_exponents is not None: + charge_exponents = cp.asarray(charge_exponents, order='C') + + assert charges.ndim == 1 and charges.shape[0] == grids.shape[0] + charges = cp.asarray(charges).astype(np.float64) + + charges = charges.reshape([-1, 1], order='C') + grids = cp.concatenate([grids, charges], axis=1) + + n_atom = len(gridslice) + i_atom_of_each_charge = [[i_atom] * (gridslice[i_atom][1] - gridslice[i_atom][0]) for i_atom in range(n_atom)] + i_atom_of_each_charge = sum(i_atom_of_each_charge, []) + i_atom_of_each_charge = cp.array(i_atom_of_each_charge, dtype=np.int32) + + assert isinstance(output, cp.ndarray) + assert output.shape == (n_atom, 3, mol.nao, mol.nao) + + for cp_ij_id, _ in enumerate(intopt.log_qs): + cpi = intopt.cp_idx[cp_ij_id] + cpj = intopt.cp_jdx[cp_ij_id] + li = intopt.angular[cpi] + lj = intopt.angular[cpj] + + stream = cp.cuda.get_current_stream() + + log_q_ij = intopt.log_qs[cp_ij_id] + + nbins = 1 + bins_locs_ij = np.array([0, len(log_q_ij)], dtype=np.int32) + + i0, i1 = intopt.cart_ao_loc[cpi], intopt.cart_ao_loc[cpi+1] + j0, j1 = intopt.cart_ao_loc[cpj], intopt.cart_ao_loc[cpj+1] + ni = i1 - i0 + nj = j1 - j0 + + ao_offsets = np.array([i0, j0], dtype=np.int32) + strides = np.array([ni, ni*nj], dtype=np.int32) + + charge_exponents_pointer = c_null_ptr() + if charge_exponents is not None: + charge_exponents_pointer = charge_exponents.data.ptr + + int1e_angular_slice = cp.zeros([n_atom, 3, j1-j0, i1-i0], order='C') + + err = libgint.GINTfill_int3c1e_ip2_charge_contracted( + ctypes.cast(stream.ptr, ctypes.c_void_p), + intopt.bpcache, + ctypes.cast(grids.data.ptr, ctypes.c_void_p), + ctypes.cast(charge_exponents_pointer, ctypes.c_void_p), + ctypes.c_int(ngrids), + ctypes.cast(int1e_angular_slice.data.ptr, ctypes.c_void_p), + strides.ctypes.data_as(ctypes.c_void_p), + ao_offsets.ctypes.data_as(ctypes.c_void_p), + bins_locs_ij.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(nbins), + ctypes.c_int(cp_ij_id), + ctypes.cast(i_atom_of_each_charge.data.ptr, ctypes.c_void_p), + ctypes.c_double(omega)) + + if err != 0: + raise RuntimeError('GINTfill_int3c1e_charge_contracted failed') + + i0, i1 = intopt.ao_loc[cpi], intopt.ao_loc[cpi+1] + j0, j1 = intopt.ao_loc[cpj], intopt.ao_loc[cpj+1] + if not mol.cart: + int1e_angular_slice = cart2sph(int1e_angular_slice, axis=2, ang=lj) + int1e_angular_slice = cart2sph(int1e_angular_slice, axis=3, ang=li) + + output[np.ix_(range(n_atom), range(3), intopt._ao_idx[i0:i1], intopt._ao_idx[j0:j1])] += int1e_angular_slice.transpose(0,1,3,2) + def get_int3c1e_ip1_charge_and_density_contracted(mol, grids, charge_exponents, dm, charges, intopt): dm = cp.asarray(dm) if dm.ndim == 3: @@ -302,7 +437,7 @@ def get_int3c1e_ip1_charge_and_density_contracted(mol, grids, charge_exponents, assert dm.shape[0] == dm.shape[1] and dm.shape[0] == mol.nao int3c_ip1 = get_int3c1e_ip1_charge_contracted(mol, grids, charge_exponents, charges, intopt) - int3c_ip1 = cp.einsum('xji,ij->xi', int3c_ip1, dm) + int3c_ip1 = cp.einsum('xij,ij->xi', int3c_ip1, dm) return int3c_ip1 def get_int3c1e_ip2_charge_and_density_contracted(mol, grids, charge_exponents, dm, charges, intopt): @@ -319,13 +454,18 @@ def int1e_grids_ip1(mol, grids, charge_exponents=None, dm=None, charges=None, di $$\left(\frac{\partial}{\partial \vec{A}} \mu \middle| \frac{1}{|\vec{r} - \vec{C}|} \middle| \nu\right)$$ where $\mu(\vec{r})$ centers at $\vec{A}$ and $\nu(\vec{r})$ centers at $\vec{B}$. 
- If charges is not None, the function computes the following contraction: + If charges is not None and density is None, the function computes the following contraction: $$\sum_{C}^{n_{charge}} q_C \left(\frac{\partial}{\partial \vec{A}} \mu \middle| \frac{1}{|\vec{r} - \vec{C}|} \middle| \nu\right)$$ where $q_C$ is the charge centered at $\vec{C}$. If charges is not None and dm is not None, the function computes the following contraction: $$\sum_\nu^{n_{ao}} D_{\mu\nu} \sum_{C}^{n_{charge}} q_C \left(\frac{\partial}{\partial \vec{A}} \mu \middle| \frac{1}{|\vec{r} - \vec{C}|} \middle| \nu\right)$$ + + If dm is not None and charges is None, the function computes the following contraction: + $$\sum_{\mu \in \{\text{AO of atom A}\}} \sum_\nu^{n_{ao}} D_{\mu\nu} + \left(\frac{\partial}{\partial \vec{A}} \mu \middle| \frac{1}{|\vec{r} - \vec{C}|} \middle| \nu\right)$$ + The output dimension is $(n_{atom}, 3, n_{charge})$. ''' assert grids is not None @@ -340,12 +480,14 @@ def int1e_grids_ip1(mol, grids, charge_exponents=None, dm=None, charges=None, di if dm is None and charges is None: return get_int3c1e_ip(mol, grids, charge_exponents, intopt)[0] - else: - assert charges is not None + elif charges is not None: if dm is not None: return get_int3c1e_ip1_charge_and_density_contracted(mol, grids, charge_exponents, dm, charges, intopt) else: return get_int3c1e_ip1_charge_contracted(mol, grids, charge_exponents, charges, intopt) + else: + assert dm is not None + return get_int3c1e_ip1_density_contracted(mol, grids, charge_exponents, dm, intopt) def int1e_grids_ip2(mol, grids, charge_exponents=None, dm=None, charges=None, direct_scf_tol=1e-13, intopt=None): r''' @@ -353,12 +495,16 @@ def int1e_grids_ip2(mol, grids, charge_exponents=None, dm=None, charges=None, di $$\left(\mu \middle| \frac{\partial}{\partial \vec{C}} \frac{1}{|\vec{r} - \vec{C}|} \middle| \nu\right)$$ where $\mu(\vec{r})$ centers at $\vec{A}$ and $\nu(\vec{r})$ centers at $\vec{B}$. - If dm is not None, the function computes the following contraction: + If dm is not None and charges is None, the function computes the following contraction: $$\sum_{\mu, \nu}^{n_{ao}} D_{\mu\nu} \left(\mu \middle| \frac{\partial}{\partial \vec{C}} \frac{1}{|\vec{r} - \vec{C}|} \middle| \nu\right)$$ If dm is not None and charges is not None, the function computes the following contraction: $$q_C \sum_{\mu, \nu}^{n_{ao}} D_{\mu\nu} \left(\mu \middle| \frac{\partial}{\partial \vec{C}} \frac{1}{|\vec{r} - \vec{C}|} \middle| \nu\right)$$ where $q_C$ is the charge centered at $\vec{C}$. + + If charges is not None and dm is None, the function computes the following contraction: + $$\sum_{C}^{n_{charge}} q_C \left(\mu \middle| \frac{\partial}{\partial \vec{C}} \frac{1}{|\vec{r} - \vec{C}|} \middle| \nu\right)$$ + Notice that this summation should not be performed if the charges originates from different atomic centers. 
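+
+    A minimal usage sketch for the charges-only case (assuming `mol`, `grids` of shape
+    (ngrids, 3) and `charges` of shape (ngrids,) are already defined):
+
+    >>> dV_dC = int1e_grids_ip2(mol, grids, charges = charges)
+    >>> dV_dC.shape   # (3, nao, nao), summed over all grid charges
+
+    If the charges belong to different atomic centers, use
+    int1e_grids_ip2_charge_contracted with a per-atom gridslice instead.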
''' assert grids is not None @@ -373,9 +519,36 @@ def int1e_grids_ip2(mol, grids, charge_exponents=None, dm=None, charges=None, di if dm is None and charges is None: return get_int3c1e_ip(mol, grids, charge_exponents, intopt)[1] - else: - assert dm is not None + elif dm is not None: if charges is not None: return get_int3c1e_ip2_charge_and_density_contracted(mol, grids, charge_exponents, dm, charges, intopt) else: return get_int3c1e_ip2_density_contracted(mol, grids, charge_exponents, dm, intopt) + else: + assert charges is not None + output = cp.zeros([1, 3, mol.nao, mol.nao]) + get_int3c1e_ip2_charge_contracted(mol, grids, charge_exponents, charges, [[0, grids.shape[0]]], output, intopt) + return output.reshape([3, mol.nao, mol.nao]) + +def int1e_grids_ip2_charge_contracted(mol, grids, charges, gridslice, output, charge_exponents=None, direct_scf_tol=1e-13, intopt=None): + r''' + This function computes the following contraction: + $$\sum_{C \in \{\text{grid attached to atom A}\}} q_C + \left(\mu \middle| \frac{\partial}{\partial \vec{C}} \frac{1}{|\vec{r} - \vec{C}|} \middle| \nu\right)$$ + where $q_C$ is the charge centered at $\vec{C}$. The output dimension is $(n_{atom}, 3, n_{ao}, n_{ao})$. + ''' + assert grids is not None + assert charges is not None + assert gridslice is not None + assert output is not None + + if intopt is None: + intopt = VHFOpt(mol) + intopt.build(direct_scf_tol, aosym=False) + else: + assert isinstance(intopt, VHFOpt), \ + f"Please make sure intopt is a {VHFOpt.__module__}.{VHFOpt.__name__} object." + assert hasattr(intopt, "density_offset"), "Please call build() function for VHFOpt object first." + assert not intopt.aosym + + return get_int3c1e_ip2_charge_contracted(mol, grids, charge_exponents, charges, gridslice, output, intopt) diff --git a/gpu4pyscf/gto/moleintor.py b/gpu4pyscf/gto/moleintor.py deleted file mode 100644 index f386aed2..00000000 --- a/gpu4pyscf/gto/moleintor.py +++ /dev/null @@ -1,58 +0,0 @@ -# Copyright 2021-2024 The PySCF Developers. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import ctypes -import cupy as cp -import numpy as np - -from gpu4pyscf.gto.int3c1e import VHFOpt, get_int3c1e, get_int3c1e_density_contracted, get_int3c1e_charge_contracted -from gpu4pyscf.gto.int3c1e_ip import get_int3c1e_ip, get_int3c1e_ip_contracted - -def intor(mol, intor, grids, charge_exponents=None, dm=None, charges=None, direct_scf_tol=1e-13, intopt=None): - assert grids is not None - - if intopt is None: - intopt = VHFOpt(mol) - aosym = False if 'ip' in intor else True - intopt.build(direct_scf_tol, aosym=aosym) - else: - assert isinstance(intopt, VHFOpt), \ - f"Please make sure intopt is a {VHFOpt.__module__}.{VHFOpt.__name__} object." - assert hasattr(intopt, "density_offset"), "Please call build() function for VHFOpt object first." - - if intor == 'int1e_grids': - assert dm is None or charges is None, \ - "Are you sure you want to contract the one electron integrals with both charge and density? 
" + \ - "If so, pass in density, obtain the result with n_charge and contract with the charges yourself." - assert intopt.aosym - - if dm is None and charges is None: - return get_int3c1e(mol, grids, charge_exponents, intopt) - elif dm is not None: - return get_int3c1e_density_contracted(mol, grids, charge_exponents, dm, intopt) - elif charges is not None: - return get_int3c1e_charge_contracted(mol, grids, charge_exponents, charges, intopt) - else: - raise ValueError(f"Logic error in {__file__} {__name__}") - elif intor == 'int1e_grids_ip': - assert not intopt.aosym - - if dm is None and charges is None: - return get_int3c1e_ip(mol, grids, charge_exponents, intopt) - else: - assert dm is not None - assert charges is not None - return get_int3c1e_ip_contracted(mol, grids, charge_exponents, dm, charges, intopt) - else: - raise NotImplementedError(f"GPU intor {intor} is not implemented.") diff --git a/gpu4pyscf/gto/tests/test_int1e_grids_ip.py b/gpu4pyscf/gto/tests/test_int1e_grids_ip.py index e77f30ec..56f87e4b 100644 --- a/gpu4pyscf/gto/tests/test_int1e_grids_ip.py +++ b/gpu4pyscf/gto/tests/test_int1e_grids_ip.py @@ -18,7 +18,7 @@ import cupy as cp import pyscf from pyscf import lib, gto, df -from gpu4pyscf.gto.int3c1e_ip import int1e_grids_ip1, int1e_grids_ip2 +from gpu4pyscf.gto.int3c1e_ip import int1e_grids_ip1, int1e_grids_ip2, int1e_grids_ip2_charge_contracted def setUpModule(): global mol_sph, mol_cart, grid_points, integral_threshold, density_contraction_threshold, charge_contraction_threshold @@ -74,8 +74,8 @@ def test_int1e_grids_ip_full_tensor_cart(self): test_int1e_dA = int1e_grids_ip1(mol, grid_points) test_int1e_dC = int1e_grids_ip2(mol, grid_points) - test_int1e_dA = test_int1e_dA.transpose(0, 3, 2, 1) - test_int1e_dC = test_int1e_dC.transpose(0, 3, 2, 1) + test_int1e_dA = test_int1e_dA.transpose(0, 2, 3, 1) + test_int1e_dC = test_int1e_dC.transpose(0, 2, 3, 1) np.testing.assert_allclose(ref_int1e_dA, test_int1e_dA, atol = integral_threshold) np.testing.assert_allclose(ref_int1e_dC, test_int1e_dC, atol = integral_threshold) @@ -94,8 +94,8 @@ def test_int1e_grids_ip_full_tensor_sph(self): test_int1e_dA = int1e_grids_ip1(mol, grid_points) test_int1e_dC = int1e_grids_ip2(mol, grid_points) - test_int1e_dA = test_int1e_dA.transpose(0, 3, 2, 1) - test_int1e_dC = test_int1e_dC.transpose(0, 3, 2, 1) + test_int1e_dA = test_int1e_dA.transpose(0, 2, 3, 1) + test_int1e_dC = test_int1e_dC.transpose(0, 2, 3, 1) np.testing.assert_allclose(ref_int1e_dA, test_int1e_dA, atol = integral_threshold) np.testing.assert_allclose(ref_int1e_dC, test_int1e_dC, atol = integral_threshold) @@ -117,8 +117,8 @@ def test_int1e_grids_ip_full_tensor_gaussian_charge(self): test_int1e_dA = int1e_grids_ip1(mol, grid_points, charge_exponents = charge_exponents) test_int1e_dC = int1e_grids_ip2(mol, grid_points, charge_exponents = charge_exponents) - test_int1e_dA = test_int1e_dA.transpose(0, 3, 2, 1) - test_int1e_dC = test_int1e_dC.transpose(0, 3, 2, 1) + test_int1e_dA = test_int1e_dA.transpose(0, 2, 3, 1) + test_int1e_dC = test_int1e_dC.transpose(0, 2, 3, 1) np.testing.assert_allclose(ref_int1e_dA, test_int1e_dA, atol = integral_threshold) np.testing.assert_allclose(ref_int1e_dC, test_int1e_dC, atol = integral_threshold) @@ -141,8 +141,8 @@ def test_int1e_grids_ip_full_tensor_omega(self): test_int1e_dA = int1e_grids_ip1(mol, grid_points) test_int1e_dC = int1e_grids_ip2(mol, grid_points) - test_int1e_dA = test_int1e_dA.transpose(0, 3, 2, 1) - test_int1e_dC = test_int1e_dC.transpose(0, 3, 2, 1) + test_int1e_dA = 
test_int1e_dA.transpose(0, 2, 3, 1) + test_int1e_dC = test_int1e_dC.transpose(0, 2, 3, 1) np.testing.assert_allclose(ref_int1e_dA, test_int1e_dA, atol = integral_threshold) np.testing.assert_allclose(ref_int1e_dC, test_int1e_dC, atol = integral_threshold) @@ -168,8 +168,8 @@ def test_int1e_grids_ip_full_tensor_gaussian_charge_omega(self): test_int1e_dA = int1e_grids_ip1(mol, grid_points, charge_exponents = charge_exponents) test_int1e_dC = int1e_grids_ip2(mol, grid_points, charge_exponents = charge_exponents) - test_int1e_dA = test_int1e_dA.transpose(0, 3, 2, 1) - test_int1e_dC = test_int1e_dC.transpose(0, 3, 2, 1) + test_int1e_dA = test_int1e_dA.transpose(0, 2, 3, 1) + test_int1e_dC = test_int1e_dC.transpose(0, 2, 3, 1) np.testing.assert_allclose(ref_int1e_dA, test_int1e_dA, atol = integral_threshold) np.testing.assert_allclose(ref_int1e_dC, test_int1e_dC, atol = integral_threshold) @@ -314,6 +314,55 @@ def test_int1e_grids_ip_contracted_gaussian_charge_omega(self): cp.testing.assert_allclose(ref_int1e_dA, test_int1e_dA, atol = integral_threshold) cp.testing.assert_allclose(ref_int1e_dC, test_int1e_dC, atol = integral_threshold) + def test_int1e_grids_ip2_charge_contracted(self): + np.random.seed(12346) + charges = np.random.uniform(-2.0, 2.0, grid_points.shape[0]) + + mol = mol_sph + fakemol = gto.fakemol_for_charges(grid_points) + + int3c2e_ip2 = mol._add_suffix('int3c2e_ip2') + cintopt = gto.moleintor.make_cintopt(mol._atm, mol._bas, mol._env, int3c2e_ip2) + q_nj = df.incore.aux_e2(mol, fakemol, intor=int3c2e_ip2, aosym='s1', cintopt=cintopt) + + ngrids = grid_points.shape[0] + n_atom = mol.natm + nao = mol.nao + gridslice = [[ngrids * i // n_atom, ngrids * (i + 1) // n_atom] for i in range(n_atom)] + ref_int1e_dC = np.zeros([n_atom, 3, nao, nao]) + for i_atom in range(n_atom): + g0,g1 = gridslice[i_atom] + ref_int1e_dC[i_atom, :, :, :] += np.einsum('dijq,q->dij', q_nj[:, :, :, g0:g1], charges[g0:g1]) + + test_int1e_dC = cp.zeros([n_atom, 3, nao, nao]) + int1e_grids_ip2_charge_contracted(mol, grid_points, charges, gridslice, test_int1e_dC) + + cp.testing.assert_allclose(ref_int1e_dC, test_int1e_dC, atol = integral_threshold) + + def test_int1e_grids_ip1_density_contracted(self): + np.random.seed(12347) + dm = np.random.uniform(-2.0, 2.0, (mol_sph.nao, mol_sph.nao)) + + mol = mol_sph + fakemol = gto.fakemol_for_charges(grid_points) + + int3c2e_ip1 = mol._add_suffix('int3c2e_ip1') + cintopt = gto.moleintor.make_cintopt(mol._atm, mol._bas, mol._env, int3c2e_ip1) + v_nj = df.incore.aux_e2(mol, fakemol, intor=int3c2e_ip1, aosym='s1', cintopt=cintopt) + + v_nj = np.einsum('dijq,ij->dqi', v_nj, dm) + + ngrids = grid_points.shape[0] + aoslice = np.array(mol.aoslice_by_atom()) + ref_int1e_dA = np.empty([mol.natm, 3, ngrids]) + for i_atom in range(mol.natm): + p0,p1 = aoslice[i_atom, 2:] + ref_int1e_dA[i_atom,:,:] = np.einsum('dqi->dq', v_nj[:,:,p0:p1]) + + test_int1e_dA = int1e_grids_ip1(mol, grid_points, dm = dm) + + cp.testing.assert_allclose(ref_int1e_dA, test_int1e_dA, atol = integral_threshold) + if __name__ == "__main__": print("Full Tests for One Electron Coulomb Integrals") unittest.main() diff --git a/gpu4pyscf/lib/gint/g1e_ip_root_1.cu b/gpu4pyscf/lib/gint/g1e_ip_root_1.cu index d04b1b2f..1cf53f89 100644 --- a/gpu4pyscf/lib/gint/g1e_ip_root_1.cu +++ b/gpu4pyscf/lib/gint/g1e_ip_root_1.cu @@ -210,6 +210,100 @@ static void GINTfill_int3c1e_ip1_charge_contracted_kernel00(double* output, cons atomicAdd(output + (i0 + j0 * stride_j + 2 * stride_ij), deri_dAz_grid_sum); } +__global__ +static 
void GINTfill_int3c1e_ip1_density_contracted_kernel00(double* output, const BasisProdOffsets offsets, const int nprim_ij, + const double* density, const int* aoslice, const int nao, + const double omega, const double* grid_points, const double* charge_exponents) +{ + const int ntasks_ij = offsets.ntasks_ij; + const int ngrids = offsets.ntasks_kl; + const int task_ij = blockIdx.x * blockDim.x + threadIdx.x; + const int task_grid = blockIdx.y * blockDim.y + threadIdx.y; + + if (task_ij >= ntasks_ij || task_grid >= ngrids) { + return; + } + + const int bas_ij = offsets.bas_ij + task_ij; + const int prim_ij = offsets.primitive_ij + task_ij * nprim_ij; + const int* bas_pair2bra = c_bpcache.bas_pair2bra; + const int* bas_pair2ket = c_bpcache.bas_pair2ket; + const int ish = bas_pair2bra[bas_ij]; + const int jsh = bas_pair2ket[bas_ij]; + + const double* __restrict__ a12 = c_bpcache.a12; + const double* __restrict__ e12 = c_bpcache.e12; + const double* __restrict__ x12 = c_bpcache.x12; + const double* __restrict__ y12 = c_bpcache.y12; + const double* __restrict__ z12 = c_bpcache.z12; + + const double* __restrict__ a_exponents = c_bpcache.a1; + const int nbas = c_bpcache.nbas; + const double* __restrict__ bas_x = c_bpcache.bas_coords; + const double* __restrict__ bas_y = bas_x + nbas; + const double* __restrict__ bas_z = bas_y + nbas; + const double Ax = bas_x[ish]; + const double Ay = bas_y[ish]; + const double Az = bas_z[ish]; + + const double* grid_point = grid_points + task_grid * 3; + const double Cx = grid_point[0]; + const double Cy = grid_point[1]; + const double Cz = grid_point[2]; + const double charge_exponent = (charge_exponents != NULL) ? charge_exponents[task_grid] : 0.0; + + double deri_dAx = 0; + double deri_dAy = 0; + double deri_dAz = 0; + for (int ij = prim_ij; ij < prim_ij + nprim_ij; ij++) { + const double aij = a12[ij]; + const double eij = e12[ij]; + const double Px = x12[ij]; + const double Py = y12[ij]; + const double Pz = z12[ij]; + const double PCx = Px - Cx; + const double PCy = Py - Cy; + const double PCz = Pz - Cz; + const double PAx = Px - Ax; + const double PAy = Py - Ay; + const double PAz = Pz - Az; + const double minus_two_a = -2.0 * a_exponents[ij]; + const double one_over_two_p = 0.5 / aij; + double a0 = aij; + const double q_over_p_plus_q = charge_exponent > 0.0 ? charge_exponent / (aij + charge_exponent) : 1.0; + const double sqrt_q_over_p_plus_q = charge_exponent > 0.0 ? sqrt(q_over_p_plus_q) : 1.0; + a0 *= q_over_p_plus_q; + const double theta = omega > 0.0 ? omega * omega / (omega * omega + a0) : 1.0; + const double sqrt_theta = omega > 0.0 ? 
sqrt(theta) : 1.0; + a0 *= theta; + + const double prefactor = 2.0 * M_PI / aij * eij * sqrt_theta * sqrt_q_over_p_plus_q; + const double boys_input = a0 * (PCx * PCx + PCy * PCy + PCz * PCz); + if (boys_input > 3.e-7) { + const double sqrt_boys_input = sqrt(boys_input); + const double R000_0 = SQRTPIE4 / sqrt_boys_input * erf(sqrt_boys_input); + const double R000_1 = -a0 * (R000_0 - exp(-boys_input)) / boys_input; + deri_dAx += prefactor * minus_two_a * (PAx * R000_0 + one_over_two_p * R000_1 * PCx); + deri_dAy += prefactor * minus_two_a * (PAy * R000_0 + one_over_two_p * R000_1 * PCy); + deri_dAz += prefactor * minus_two_a * (PAz * R000_0 + one_over_two_p * R000_1 * PCz); + } + } + + const int* ao_loc = c_bpcache.ao_loc; + const int i0 = ao_loc[ish]; + const int j0 = ao_loc[jsh]; + + const double Dij = density[i0 + j0 * nao]; + deri_dAx *= Dij; + deri_dAy *= Dij; + deri_dAz *= Dij; + + const int i_atom = aoslice[ish]; + atomicAdd(output + (task_grid + ngrids * (i_atom * 3 + 0)), deri_dAx); + atomicAdd(output + (task_grid + ngrids * (i_atom * 3 + 1)), deri_dAy); + atomicAdd(output + (task_grid + ngrids * (i_atom * 3 + 2)), deri_dAz); +} + __global__ static void GINTfill_int3c1e_ip2_density_contracted_kernel00(double* output, const double* density, const HermiteDensityOffsets hermite_density_offsets, const BasisProdOffsets offsets, const int nprim_ij, @@ -282,3 +376,85 @@ static void GINTfill_int3c1e_ip2_density_contracted_kernel00(double* output, con atomicAdd(output + task_grid + 1 * ngrids, deri_dCy_pair_sum); atomicAdd(output + task_grid + 2 * ngrids, deri_dCz_pair_sum); } + +__global__ +static void GINTfill_int3c1e_ip2_charge_contracted_kernel00(double* output, const BasisProdOffsets offsets, const int nprim_ij, + const int stride_j, const int stride_ij, const int ao_offsets_i, const int ao_offsets_j, const int* gridslice, + const double omega, const double* grid_points, const double* charge_exponents) +{ + const int ntasks_ij = offsets.ntasks_ij; + const int ngrids = offsets.ntasks_kl; + const int task_ij = blockIdx.x * blockDim.x + threadIdx.x; + const int task_grid = blockIdx.y * blockDim.y + threadIdx.y; + + if (task_ij >= ntasks_ij || task_grid >= ngrids) { + return; + } + + const int bas_ij = offsets.bas_ij + task_ij; + const int prim_ij = offsets.primitive_ij + task_ij * nprim_ij; + const int* bas_pair2bra = c_bpcache.bas_pair2bra; + const int* bas_pair2ket = c_bpcache.bas_pair2ket; + const int ish = bas_pair2bra[bas_ij]; + const int jsh = bas_pair2ket[bas_ij]; + + const double* __restrict__ a12 = c_bpcache.a12; + const double* __restrict__ e12 = c_bpcache.e12; + const double* __restrict__ x12 = c_bpcache.x12; + const double* __restrict__ y12 = c_bpcache.y12; + const double* __restrict__ z12 = c_bpcache.z12; + + const double* grid_point = grid_points + task_grid * 4; + const double Cx = grid_point[0]; + const double Cy = grid_point[1]; + const double Cz = grid_point[2]; + const double charge_exponent = (charge_exponents != NULL) ? charge_exponents[task_grid] : 0.0; + + double deri_dCx = 0; + double deri_dCy = 0; + double deri_dCz = 0; + for (int ij = prim_ij; ij < prim_ij + nprim_ij; ij++) { + const double aij = a12[ij]; + const double eij = e12[ij]; + const double Px = x12[ij]; + const double Py = y12[ij]; + const double Pz = z12[ij]; + const double PCx = Px - Cx; + const double PCy = Py - Cy; + const double PCz = Pz - Cz; + double a0 = aij; + const double q_over_p_plus_q = charge_exponent > 0.0 ? 
charge_exponent / (aij + charge_exponent) : 1.0; + const double sqrt_q_over_p_plus_q = charge_exponent > 0.0 ? sqrt(q_over_p_plus_q) : 1.0; + a0 *= q_over_p_plus_q; + const double theta = omega > 0.0 ? omega * omega / (omega * omega + a0) : 1.0; + const double sqrt_theta = omega > 0.0 ? sqrt(theta) : 1.0; + a0 *= theta; + + const double prefactor = 2.0 * M_PI / aij * eij * sqrt_theta * sqrt_q_over_p_plus_q; + const double boys_input = a0 * (PCx * PCx + PCy * PCy + PCz * PCz); + if (boys_input > 3.e-7) { + const double sqrt_boys_input = sqrt(boys_input); + const double R000_0 = SQRTPIE4 / sqrt_boys_input * erf(sqrt_boys_input); + const double R000_1 = -a0 * (R000_0 - exp(-boys_input)) / boys_input; + const double R100_0 = R000_1 * PCx; + const double R010_0 = R000_1 * PCy; + const double R001_0 = R000_1 * PCz; + deri_dCx += prefactor * R100_0; + deri_dCy += prefactor * R010_0; + deri_dCz += prefactor * R001_0; + } + } + + const double charge = grid_point[3]; + deri_dCx *= charge; + deri_dCy *= charge; + deri_dCz *= charge; + + const int i_atom = gridslice[task_grid]; + const int* ao_loc = c_bpcache.ao_loc; + const int i0 = ao_loc[ish] - ao_offsets_i; + const int j0 = ao_loc[jsh] - ao_offsets_j; + atomicAdd(output + (i0 + j0 * stride_j + 0 * stride_ij + i_atom * 3 * stride_ij), deri_dCx); + atomicAdd(output + (i0 + j0 * stride_j + 1 * stride_ij + i_atom * 3 * stride_ij), deri_dCy); + atomicAdd(output + (i0 + j0 * stride_j + 2 * stride_ij + i_atom * 3 * stride_ij), deri_dCz); +} diff --git a/gpu4pyscf/lib/gint/g3c1e_ip.cu b/gpu4pyscf/lib/gint/g3c1e_ip.cu index b7524ca2..9a806bef 100644 --- a/gpu4pyscf/lib/gint/g3c1e_ip.cu +++ b/gpu4pyscf/lib/gint/g3c1e_ip.cu @@ -366,6 +366,104 @@ static void GINTfill_int3c1e_ip1_charge_contracted_kernel_general(double* output } } +template +__device__ +static void GINTwrite_int3c1e_ip1_density_contracted(const double* g, double* output, const double minus_two_a, const double* density, const int* aoslice, const int nao, + const int ish, const int jsh, const int i_grid, const int i_l, const int j_l, const int ngrids) +{ + const int* ao_loc = c_bpcache.ao_loc; + + const int i0 = ao_loc[ish]; + const int j0 = ao_loc[jsh]; + + const int i_atom = aoslice[ish]; + + const int *idx = c_idx; + const int *idy = c_idx + TOT_NF; + const int *idz = c_idx + TOT_NF * 2; + + const int g_size = NROOTS * (i_l + 1 + 1) * (j_l + 1); + const double* __restrict__ gx = g; + const double* __restrict__ gy = g + g_size; + const double* __restrict__ gz = g + g_size * 2; + + const int n_density_elements_i = (i_l + 1) * (i_l + 2) / 2; + const int n_density_elements_j = (j_l + 1) * (j_l + 2) / 2; + for (int j = 0; j < n_density_elements_j; j++) { + for (int i = 0; i < n_density_elements_i; i++) { + const int loc_j = c_l_locs[j_l] + j; + const int loc_i = c_l_locs[i_l] + i; + const int ix = idx[loc_i]; + const int iy = idy[loc_i]; + const int iz = idz[loc_i]; + const int jx = idx[loc_j]; + const int jy = idy[loc_j]; + const int jz = idz[loc_j]; + const int gx_offset = ix + jx * (i_l + 1 + 1); + const int gy_offset = iy + jy * (i_l + 1 + 1); + const int gz_offset = iz + jz * (i_l + 1 + 1); + + double deri_dAx = 0; + double deri_dAy = 0; + double deri_dAz = 0; +#pragma unroll + for (int i_root = 0; i_root < NROOTS; i_root++) { + const double gx_0 = gx[gx_offset * NROOTS + i_root]; + const double gy_0 = gy[gy_offset * NROOTS + i_root]; + const double gz_0 = gz[gz_offset * NROOTS + i_root]; + const double dgx_dAx = (ix > 0 ? 
ix * gx[(gx_offset - 1) * NROOTS + i_root] : 0) + minus_two_a * gx[(gx_offset + 1) * NROOTS + i_root]; + const double dgy_dAy = (iy > 0 ? iy * gy[(gy_offset - 1) * NROOTS + i_root] : 0) + minus_two_a * gy[(gy_offset + 1) * NROOTS + i_root]; + const double dgz_dAz = (iz > 0 ? iz * gz[(gz_offset - 1) * NROOTS + i_root] : 0) + minus_two_a * gz[(gz_offset + 1) * NROOTS + i_root]; + deri_dAx += dgx_dAx * gy_0 * gz_0; + deri_dAy += gx_0 * dgy_dAy * gz_0; + deri_dAz += gx_0 * gy_0 * dgz_dAz; + } + const double Dij = density[(i + i0) + (j + j0) * nao]; + deri_dAx *= Dij; + deri_dAy *= Dij; + deri_dAz *= Dij; + atomicAdd(output + (i_grid + ngrids * (i_atom * 3 + 0)), deri_dAx); + atomicAdd(output + (i_grid + ngrids * (i_atom * 3 + 1)), deri_dAy); + atomicAdd(output + (i_grid + ngrids * (i_atom * 3 + 2)), deri_dAz); + } + } +} + +template +__global__ +static void GINTfill_int3c1e_ip1_density_contracted_kernel_general(double* output, const BasisProdOffsets offsets, const int i_l, const int j_l, const int nprim_ij, + const double* density, const int* aoslice, const int nao, + const double omega, const double* grid_points, const double* charge_exponents) +{ + const int ntasks_ij = offsets.ntasks_ij; + const int ngrids = offsets.ntasks_kl; + const int task_ij = blockIdx.x * blockDim.x + threadIdx.x; + const int task_grid = blockIdx.y * blockDim.y + threadIdx.y; + + if (task_ij >= ntasks_ij || task_grid >= ngrids) { + return; + } + + const int bas_ij = offsets.bas_ij + task_ij; + const int prim_ij = offsets.primitive_ij + task_ij * nprim_ij; + const int* bas_pair2bra = c_bpcache.bas_pair2bra; + const int* bas_pair2ket = c_bpcache.bas_pair2ket; + const int ish = bas_pair2bra[bas_ij]; + const int jsh = bas_pair2ket[bas_ij]; + const double* __restrict__ a_exponents = c_bpcache.a1; + + const double* grid_point = grid_points + task_grid * 3; + const double charge_exponent = (charge_exponents != NULL) ? 
charge_exponents[task_grid] : 0.0; + + double g[GSIZE_INT3C_1E]; + + for (int ij = prim_ij; ij < prim_ij+nprim_ij; ++ij) { + GINT_g1e(g, grid_point, ish, jsh, ij, i_l + 1, j_l, charge_exponent, omega); + const double minus_two_a = -2.0 * a_exponents[ij]; + GINTwrite_int3c1e_ip1_density_contracted(g, output, minus_two_a, density, aoslice, nao, ish, jsh, task_grid, i_l, j_l, ngrids); + } +} + template __global__ static void GINTfill_int3c1e_ip2_density_contracted_kernel_general(double* output, const double* density, const HermiteDensityOffsets hermite_density_offsets, @@ -464,3 +562,120 @@ static void GINTfill_int3c1e_ip2_density_contracted_kernel_general(double* outpu atomicAdd(output + task_grid + ngrids * 1, deri_dCy_pair_sum); atomicAdd(output + task_grid + ngrids * 2, deri_dCz_pair_sum); } + +template +__device__ +static void GINTwrite_int3c1e_ip2_charge_contracted(const double* g, double* output, const double minus_two_a, const double* u2, const double* AC, const double prefactor, + const int ish, const int jsh, const int i_grid, const int i_l, const int j_l, + const int stride_j, const int stride_ij, const int ao_offsets_i, const int ao_offsets_j, const int* gridslice, const int ngrids) +{ + const int* ao_loc = c_bpcache.ao_loc; + + const int i0 = ao_loc[ish] - ao_offsets_i; + const int j0 = ao_loc[jsh] - ao_offsets_j; + + const int i_atom = gridslice[i_grid]; + + const int *idx = c_idx; + const int *idy = c_idx + TOT_NF; + const int *idz = c_idx + TOT_NF * 2; + + const int g_size = NROOTS * (i_l + 1 + 1) * (j_l + 1); + const double* __restrict__ gx = g; + const double* __restrict__ gy = g + g_size; + const double* __restrict__ gz = g + g_size * 2; + + const int n_density_elements_i = (i_l + 1) * (i_l + 2) / 2; + const int n_density_elements_j = (j_l + 1) * (j_l + 2) / 2; + for (int j = 0; j < n_density_elements_j; j++) { + for (int i = 0; i < n_density_elements_i; i++) { + const int loc_j = c_l_locs[j_l] + j; + const int loc_i = c_l_locs[i_l] + i; + const int ix = idx[loc_i]; + const int iy = idy[loc_i]; + const int iz = idz[loc_i]; + const int jx = idx[loc_j]; + const int jy = idy[loc_j]; + const int jz = idz[loc_j]; + const int gx_offset = ix + jx * (i_l + 1 + 1); + const int gy_offset = iy + jy * (i_l + 1 + 1); + const int gz_offset = iz + jz * (i_l + 1 + 1); + + double deri_dCx = 0; + double deri_dCy = 0; + double deri_dCz = 0; +#pragma unroll + for (int i_root = 0; i_root < NROOTS; i_root++) { + const double gx_0 = gx[gx_offset * NROOTS + i_root]; + const double gy_0 = gy[gy_offset * NROOTS + i_root]; + const double gz_0 = gz[gz_offset * NROOTS + i_root]; + const double gx_1 = gx[(gx_offset + 1) * NROOTS + i_root]; + const double gy_1 = gy[(gy_offset + 1) * NROOTS + i_root]; + const double gz_1 = gz[(gz_offset + 1) * NROOTS + i_root]; + const double minus_two_u2 = -2.0 * u2[i_root]; + const double dgx_dCx = minus_two_u2 * (gx_1 + AC[0] * gx_0); + const double dgy_dCy = minus_two_u2 * (gy_1 + AC[1] * gy_0); + const double dgz_dCz = minus_two_u2 * (gz_1 + AC[2] * gz_0); + deri_dCx += dgx_dCx * gy_0 * gz_0; + deri_dCy += gx_0 * dgy_dCy * gz_0; + deri_dCz += gx_0 * gy_0 * dgz_dCz; + } + deri_dCx *= prefactor; + deri_dCy *= prefactor; + deri_dCz *= prefactor; + + atomicAdd(output + ((i + i0) + (j + j0) * stride_j + 0 * stride_ij + i_atom * 3 * stride_ij), deri_dCx); + atomicAdd(output + ((i + i0) + (j + j0) * stride_j + 1 * stride_ij + i_atom * 3 * stride_ij), deri_dCy); + atomicAdd(output + ((i + i0) + (j + j0) * stride_j + 2 * stride_ij + i_atom * 3 * stride_ij), deri_dCz); + } + } 
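+    // Each (i, j, xyz) contribution above is accumulated into the per-atom block of
+    // `output`, laid out as [n_atom, 3, nj, ni] with the bra index i running fastest
+    // (the strides [ni, ni*nj] are set up on the Python side).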
+} + +template +__global__ +static void GINTfill_int3c1e_ip2_charge_contracted_kernel_general(double* output, const BasisProdOffsets offsets, const int i_l, const int j_l, const int nprim_ij, + const int stride_j, const int stride_ij, const int ao_offsets_i, const int ao_offsets_j, const int* gridslice, + const double omega, const double* grid_points, const double* charge_exponents) +{ + const int ntasks_ij = offsets.ntasks_ij; + const int ngrids = offsets.ntasks_kl; + const int task_ij = blockIdx.x * blockDim.x + threadIdx.x; + const int task_grid = blockIdx.y * blockDim.y + threadIdx.y; + + if (task_ij >= ntasks_ij || task_grid >= ngrids) { + return; + } + + const int bas_ij = offsets.bas_ij + task_ij; + const int prim_ij = offsets.primitive_ij + task_ij * nprim_ij; + const int* bas_pair2bra = c_bpcache.bas_pair2bra; + const int* bas_pair2ket = c_bpcache.bas_pair2ket; + const int ish = bas_pair2bra[bas_ij]; + const int jsh = bas_pair2ket[bas_ij]; + const double* __restrict__ a_exponents = c_bpcache.a1; + const int nbas = c_bpcache.nbas; + const double* __restrict__ bas_x = c_bpcache.bas_coords; + const double* __restrict__ bas_y = bas_x + nbas; + const double* __restrict__ bas_z = bas_y + nbas; + const double Ax = bas_x[ish]; + const double Ay = bas_y[ish]; + const double Az = bas_z[ish]; + + const double* grid_point = grid_points + task_grid * 4; + const double Cx = grid_point[0]; + const double Cy = grid_point[1]; + const double Cz = grid_point[2]; + const double charge = grid_point[3]; + const double charge_exponent = (charge_exponents != NULL) ? charge_exponents[task_grid] : 0.0; + + const double AC[3] { Ax - Cx, Ay - Cy, Az - Cz }; + + double g[GSIZE_INT3C_1E]; + double u2[NROOTS]; + + for (int ij = prim_ij; ij < prim_ij+nprim_ij; ++ij) { + GINT_g1e_save_u2(g, u2, grid_point, ish, jsh, ij, i_l + 1, j_l, charge_exponent, omega); + const double minus_two_a = -2.0 * a_exponents[ij]; + GINTwrite_int3c1e_ip2_charge_contracted(g, output, minus_two_a, u2, AC, charge, ish, jsh, task_grid, i_l, j_l, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, gridslice, ngrids); + } +} diff --git a/gpu4pyscf/lib/gint/nr_fill_ao_int3c1e_ip.cu b/gpu4pyscf/lib/gint/nr_fill_ao_int3c1e_ip.cu index e0ace197..3ee7c423 100644 --- a/gpu4pyscf/lib/gint/nr_fill_ao_int3c1e_ip.cu +++ b/gpu4pyscf/lib/gint/nr_fill_ao_int3c1e_ip.cu @@ -113,6 +113,55 @@ static int GINTfill_int3c1e_ip1_charge_contracted_tasks(double* output, const Ba return 0; } +static int GINTfill_int3c1e_ip1_density_contracted_tasks(double* output, const BasisProdOffsets offsets, const int i_l, const int j_l, const int nprim_ij, + const double* density, const int* aoslice, const int nao, + const double omega, const double* grid_points, const double* charge_exponents, + const cudaStream_t stream) +{ + const int ntasks_ij = offsets.ntasks_ij; + const int ngrids = offsets.ntasks_kl; + + const dim3 threads(THREADSX, THREADSY); + const dim3 blocks((ntasks_ij+THREADSX-1)/THREADSX, (ngrids+THREADSY-1)/THREADSY); + const int type_ij = i_l * 10 + j_l; + switch (type_ij) { + case 00: GINTfill_int3c1e_ip1_density_contracted_kernel00<<>>(output, offsets, nprim_ij, density, aoslice, nao, omega, grid_points, charge_exponents); break; + // case 01: GINTfill_int3c1e_ip1_density_contracted_kernel_expanded<0, 1> <<>>(output, offsets, nprim_ij, density, aoslice, nao, omega, grid_points, charge_exponents); break; + // case 02: GINTfill_int3c1e_ip1_density_contracted_kernel_expanded<0, 2> <<>>(output, offsets, nprim_ij, density, aoslice, nao, omega, grid_points, 
charge_exponents); break; + // case 03: GINTfill_int3c1e_ip1_density_contracted_kernel_expanded<0, 3> <<>>(output, offsets, nprim_ij, density, shell, nao, omega, grid_points, charge_exponents); break; + // case 04: GINTfill_int3c1e_ip1_density_contracted_kernel_expanded<0, 4> <<>>(output, offsets, nprim_ij, density, aoslice, nao, omega, grid_points, charge_exponents); break; + // case 10: GINTfill_int3c1e_ip1_density_contracted_kernel_expanded<1, 0> <<>>(output, offsets, nprim_ij, density, aoslice, nao, omega, grid_points, charge_exponents); break; + // case 11: GINTfill_int3c1e_ip1_density_contracted_kernel_expanded<1, 1> <<>>(output, offsets, nprim_ij, density, aoslice, nao, omega, grid_points, charge_exponents); break; + // case 12: GINTfill_int3c1e_ip1_density_contracted_kernel_expanded<1, 2> <<>>(output, offsets, nprim_ij, density, aoslice, nao, omega, grid_points, charge_exponents); break; + // case 13: GINTfill_int3c1e_ip1_density_contracted_kernel_expanded<1, 3> <<>>(output, offsets, nprim_ij, density, aoslice, nao, omega, grid_points, charge_exponents); break; + // case 20: GINTfill_int3c1e_ip1_density_contracted_kernel_expanded<2, 0> <<>>(output, offsets, nprim_ij, density, aoslice, nao, omega, grid_points, charge_exponents); break; + // case 21: GINTfill_int3c1e_ip1_density_contracted_kernel_expanded<2, 1> <<>>(output, offsets, nprim_ij, density, aoslice, nao, omega, grid_points, charge_exponents); break; + // case 22: GINTfill_int3c1e_ip1_density_contracted_kernel_expanded<2, 2> <<>>(output, offsets, nprim_ij, density, aoslice, nao, omega, grid_points, charge_exponents); break; + // case 30: GINTfill_int3c1e_ip1_density_contracted_kernel_expanded<3, 0> <<>>(output, offsets, nprim_ij, density, aoslice, nao, omega, grid_points, charge_exponents); break; + // case 31: GINTfill_int3c1e_ip1_density_contracted_kernel_expanded<3, 1> <<>>(output, offsets, nprim_ij, density, aoslice, nao, omega, grid_points, charge_exponents); break; + // case 40: GINTfill_int3c1e_ip1_density_contracted_kernel_expanded<4, 0> <<>>(output, offsets, nprim_ij, density, aoslice, nao, omega, grid_points, charge_exponents); break; + default: + const int nrys_roots = (i_l + j_l + 1) / 2 + 1; + switch (nrys_roots) { + case 1: GINTfill_int3c1e_ip1_density_contracted_kernel_general<1, GSIZE5_INT3C_1E> <<>>(output, offsets, i_l, j_l, nprim_ij, density, aoslice, nao, omega, grid_points, charge_exponents); break; + case 2: GINTfill_int3c1e_ip1_density_contracted_kernel_general<2, GSIZE4_INT3C_1E> <<>>(output, offsets, i_l, j_l, nprim_ij, density, aoslice, nao, omega, grid_points, charge_exponents); break; + case 3: GINTfill_int3c1e_ip1_density_contracted_kernel_general<3, GSIZE5_INT3C_1E> <<>>(output, offsets, i_l, j_l, nprim_ij, density, aoslice, nao, omega, grid_points, charge_exponents); break; + case 4: GINTfill_int3c1e_ip1_density_contracted_kernel_general<4, GSIZE4_INT3C_1E> <<>>(output, offsets, i_l, j_l, nprim_ij, density, aoslice, nao, omega, grid_points, charge_exponents); break; + case 5: GINTfill_int3c1e_ip1_density_contracted_kernel_general<5, GSIZE5_INT3C_1E> <<>>(output, offsets, i_l, j_l, nprim_ij, density, aoslice, nao, omega, grid_points, charge_exponents); break; + default: + fprintf(stderr, "type_ij = %d, nrys_roots = %d out of range\n", type_ij, nrys_roots); + return 1; + } + } + + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) { + fprintf(stderr, "CUDA Error in %s: %s\n", __func__, cudaGetErrorString(err)); + return 1; + } + return 0; +} + static int 
GINTfill_int3c1e_ip2_density_contracted_tasks(double* output, const double* density, const HermiteDensityOffsets hermite_density_offsets, const BasisProdOffsets offsets, const int i_l, const int j_l, const int nprim_ij, const double omega, const double* grid_points, const double* charge_exponents, @@ -147,12 +196,62 @@ static int GINTfill_int3c1e_ip2_density_contracted_tasks(double* output, const d return 0; } +static int GINTfill_int3c1e_ip2_charge_contracted_tasks(double* output, const BasisProdOffsets offsets, const int i_l, const int j_l, const int nprim_ij, + const int stride_j, const int stride_ij, const int ao_offsets_i, const int ao_offsets_j, + const int* gridslice, + const double omega, const double* grid_points, const double* charge_exponents, + const cudaStream_t stream) +{ + const int ntasks_ij = offsets.ntasks_ij; + const int ngrids = offsets.ntasks_kl; + + const dim3 threads(THREADSX, THREADSY); + const dim3 blocks((ntasks_ij+THREADSX-1)/THREADSX, (ngrids+THREADSY-1)/THREADSY); + const int type_ij = i_l * 10 + j_l; + switch (type_ij) { + case 00: GINTfill_int3c1e_ip2_charge_contracted_kernel00<<>>(output, offsets, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, gridslice, omega, grid_points, charge_exponents); break; + // case 01: GINTfill_int3c1e_ip2_charge_contracted_kernel_expanded<0, 1> <<>>(output, offsets, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, gridslice, omega, grid_points, charge_exponents); break; + // case 02: GINTfill_int3c1e_ip2_charge_contracted_kernel_expanded<0, 2> <<>>(output, offsets, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, gridslice, omega, grid_points, charge_exponents); break; + // case 03: GINTfill_int3c1e_ip2_charge_contracted_kernel_expanded<0, 3> <<>>(output, offsets, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, gridslice, omega, grid_points, charge_exponents); break; + // case 04: GINTfill_int3c1e_ip2_charge_contracted_kernel_expanded<0, 4> <<>>(output, offsets, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, gridslice, omega, grid_points, charge_exponents); break; + // case 10: GINTfill_int3c1e_ip2_charge_contracted_kernel_expanded<1, 0> <<>>(output, offsets, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, gridslice, omega, grid_points, charge_exponents); break; + // case 11: GINTfill_int3c1e_ip2_charge_contracted_kernel_expanded<1, 1> <<>>(output, offsets, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, gridslice, omega, grid_points, charge_exponents); break; + // case 12: GINTfill_int3c1e_ip2_charge_contracted_kernel_expanded<1, 2> <<>>(output, offsets, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, gridslice, omega, grid_points, charge_exponents); break; + // case 13: GINTfill_int3c1e_ip2_charge_contracted_kernel_expanded<1, 3> <<>>(output, offsets, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, gridslice, omega, grid_points, charge_exponents); break; + // case 20: GINTfill_int3c1e_ip2_charge_contracted_kernel_expanded<2, 0> <<>>(output, offsets, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, gridslice, omega, grid_points, charge_exponents); break; + // case 21: GINTfill_int3c1e_ip2_charge_contracted_kernel_expanded<2, 1> <<>>(output, offsets, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, gridslice, omega, grid_points, charge_exponents); break; + // case 22: GINTfill_int3c1e_ip2_charge_contracted_kernel_expanded<2, 2> <<>>(output, offsets, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, 
gridslice, omega, grid_points, charge_exponents); break; + // case 30: GINTfill_int3c1e_ip2_charge_contracted_kernel_expanded<3, 0> <<>>(output, offsets, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, gridslice, omega, grid_points, charge_exponents); break; + // case 31: GINTfill_int3c1e_ip2_charge_contracted_kernel_expanded<3, 1> <<>>(output, offsets, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, gridslice, omega, grid_points, charge_exponents); break; + // case 40: GINTfill_int3c1e_ip2_charge_contracted_kernel_expanded<4, 0> <<>>(output, offsets, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, gridslice, omega, grid_points, charge_exponents); break; + default: + const int nrys_roots = (i_l + j_l + 1) / 2 + 1; + switch (nrys_roots) { + case 1: GINTfill_int3c1e_ip2_charge_contracted_kernel_general<1, GSIZE5_INT3C_1E> <<>>(output, offsets, i_l, j_l, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, gridslice, omega, grid_points, charge_exponents); break; + case 2: GINTfill_int3c1e_ip2_charge_contracted_kernel_general<2, GSIZE4_INT3C_1E> <<>>(output, offsets, i_l, j_l, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, gridslice, omega, grid_points, charge_exponents); break; + case 3: GINTfill_int3c1e_ip2_charge_contracted_kernel_general<3, GSIZE5_INT3C_1E> <<>>(output, offsets, i_l, j_l, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, gridslice, omega, grid_points, charge_exponents); break; + case 4: GINTfill_int3c1e_ip2_charge_contracted_kernel_general<4, GSIZE4_INT3C_1E> <<>>(output, offsets, i_l, j_l, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, gridslice, omega, grid_points, charge_exponents); break; + case 5: GINTfill_int3c1e_ip2_charge_contracted_kernel_general<5, GSIZE5_INT3C_1E> <<>>(output, offsets, i_l, j_l, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, gridslice, omega, grid_points, charge_exponents); break; + default: + fprintf(stderr, "type_ij = %d, nrys_roots = %d out of range\n", type_ij, nrys_roots); + return 1; + } + } + + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) { + fprintf(stderr, "CUDA Error in %s: %s\n", __func__, cudaGetErrorString(err)); + return 1; + } + return 0; +} + extern "C" { int GINTfill_int3c1e_ip(const cudaStream_t stream, const BasisProdCache* bpcache, const double* grid_points, const double* charge_exponents, const int ngrids, double* integrals, const int* strides, const int* ao_offsets, - const int* bins_locs_ij, int nbins, + const int* bins_locs_ij, const int nbins, const int cp_ij_id, const double omega) { const ContractionProdType *cp_ij = bpcache->cptype + cp_ij_id; @@ -198,11 +297,63 @@ int GINTfill_int3c1e_ip(const cudaStream_t stream, const BasisProdCache* bpcache return 0; } +int GINTfill_int3c1e_ip1_density_contracted(const cudaStream_t stream, const BasisProdCache* bpcache, + const double* grid_points, const double* charge_exponents, const int ngrids, + double* integral_charge_contracted, + const int* bins_locs_ij, const int nbins, + const int cp_ij_id, + const double* density, const int* aoslice, const int nao, + const double omega) +{ + const ContractionProdType *cp_ij = bpcache->cptype + cp_ij_id; + const int i_l = cp_ij->l_bra; + const int j_l = cp_ij->l_ket; + const int nrys_roots = (i_l + j_l + 1) / 2 + 1; + const int nprim_ij = cp_ij->nprim_12; + + if (nrys_roots > MAX_NROOTS_INT3C_1E + 1) { + fprintf(stderr, "nrys_roots = %d too high\n", nrys_roots); + return 2; + } + + checkCudaErrors(cudaMemcpyToSymbol(c_bpcache, bpcache, 
sizeof(BasisProdCache))); + + const int* bas_pairs_locs = bpcache->bas_pairs_locs; + const int* primitive_pairs_locs = bpcache->primitive_pairs_locs; + for (int ij_bin = 0; ij_bin < nbins; ij_bin++) { + const int bas_ij0 = bins_locs_ij[ij_bin]; + const int bas_ij1 = bins_locs_ij[ij_bin + 1]; + const int ntasks_ij = bas_ij1 - bas_ij0; + if (ntasks_ij <= 0) { + continue; + } + + BasisProdOffsets offsets; + offsets.ntasks_ij = ntasks_ij; + offsets.ntasks_kl = ngrids; + offsets.bas_ij = bas_pairs_locs[cp_ij_id] + bas_ij0; + offsets.bas_kl = -1; + offsets.primitive_ij = primitive_pairs_locs[cp_ij_id] + bas_ij0 * nprim_ij; + offsets.primitive_kl = -1; + + const int err = GINTfill_int3c1e_ip1_density_contracted_tasks(integral_charge_contracted, offsets, i_l, j_l, nprim_ij, + density, aoslice, nao, + omega, grid_points, charge_exponents, stream); + + if (err != 0) { + return err; + } + } + + return 0; +} + + int GINTfill_int3c1e_ip1_charge_contracted(const cudaStream_t stream, const BasisProdCache* bpcache, const double* grid_points, const double* charge_exponents, const int ngrids, double* integral_charge_contracted, const int* strides, const int* ao_offsets, - const int* bins_locs_ij, int nbins, + const int* bins_locs_ij, const int nbins, const int cp_ij_id, const double omega, const int n_charge_sum_per_thread) { const ContractionProdType *cp_ij = bpcache->cptype + cp_ij_id; @@ -252,7 +403,7 @@ int GINTfill_int3c1e_ip2_density_contracted(const cudaStream_t stream, const Bas const double* grid_points, const double* charge_exponents, const int ngrids, const double* dm_pair_ordered, const int* density_offset, double* integral_density_contracted, - const int* bins_locs_ij, int nbins, + const int* bins_locs_ij, const int nbins, const int cp_ij_id, const double omega, const int n_pair_sum_per_thread) { const ContractionProdType *cp_ij = bpcache->cptype + cp_ij_id; @@ -302,4 +453,56 @@ int GINTfill_int3c1e_ip2_density_contracted(const cudaStream_t stream, const Bas return 0; } + +int GINTfill_int3c1e_ip2_charge_contracted(const cudaStream_t stream, const BasisProdCache* bpcache, + const double* grid_points, const double* charge_exponents, const int ngrids, + double* integral_charge_contracted, + const int* strides, const int* ao_offsets, + const int* bins_locs_ij, const int nbins, + const int cp_ij_id, + const int* gridslice, + const double omega) +{ + const ContractionProdType *cp_ij = bpcache->cptype + cp_ij_id; + const int i_l = cp_ij->l_bra; + const int j_l = cp_ij->l_ket; + const int nrys_roots = (i_l + j_l + 1) / 2 + 1; + const int nprim_ij = cp_ij->nprim_12; + + if (nrys_roots > MAX_NROOTS_INT3C_1E + 1) { + fprintf(stderr, "nrys_roots = %d too high\n", nrys_roots); + return 2; + } + + checkCudaErrors(cudaMemcpyToSymbol(c_bpcache, bpcache, sizeof(BasisProdCache))); + + const int* bas_pairs_locs = bpcache->bas_pairs_locs; + const int* primitive_pairs_locs = bpcache->primitive_pairs_locs; + for (int ij_bin = 0; ij_bin < nbins; ij_bin++) { + const int bas_ij0 = bins_locs_ij[ij_bin]; + const int bas_ij1 = bins_locs_ij[ij_bin + 1]; + const int ntasks_ij = bas_ij1 - bas_ij0; + if (ntasks_ij <= 0) { + continue; + } + + BasisProdOffsets offsets; + offsets.ntasks_ij = ntasks_ij; + offsets.ntasks_kl = ngrids; + offsets.bas_ij = bas_pairs_locs[cp_ij_id] + bas_ij0; + offsets.bas_kl = -1; + offsets.primitive_ij = primitive_pairs_locs[cp_ij_id] + bas_ij0 * nprim_ij; + offsets.primitive_kl = -1; + + const int err = GINTfill_int3c1e_ip2_charge_contracted_tasks(integral_charge_contracted, offsets, i_l, j_l, 
nprim_ij, + strides[0], strides[1], ao_offsets[0], ao_offsets[1], + gridslice, omega, grid_points, charge_exponents, stream); + + if (err != 0) { + return err; + } + } + + return 0; +} } diff --git a/gpu4pyscf/qmmm/pbc/itrf.py b/gpu4pyscf/qmmm/pbc/itrf.py index 236e3e10..1b098c8f 100644 --- a/gpu4pyscf/qmmm/pbc/itrf.py +++ b/gpu4pyscf/qmmm/pbc/itrf.py @@ -1009,7 +1009,7 @@ def calculate_h1e(self, h1_gpu): nao = mol.nao if mm_mol.charge_model == 'gaussian' and len(coords) != 0: expnts = cp.hstack([mm_mol.get_zetas()] * len(Ls))[mask] - g_qm += int1e_grids_ip1(mol, coords, charges = charges, charge_exponents = expnts).transpose(0,2,1) + g_qm += int1e_grids_ip1(mol, coords, charges = charges, charge_exponents = expnts) elif mm_mol.charge_model == 'point' and len(coords) != 0: raise RuntimeError("Not tested yet") max_memory = self.max_memory - lib.current_memory()[0] diff --git a/gpu4pyscf/solvent/hessian/pcm.py b/gpu4pyscf/solvent/hessian/pcm.py index ba7cd5cf..538cb859 100644 --- a/gpu4pyscf/solvent/hessian/pcm.py +++ b/gpu4pyscf/solvent/hessian/pcm.py @@ -22,13 +22,16 @@ from pyscf import lib, gto from gpu4pyscf import scf from gpu4pyscf.solvent.pcm import PI -from gpu4pyscf.solvent.grad.pcm import grad_qv, grad_solver, grad_nuc +from gpu4pyscf.solvent.grad.pcm import grad_qv, grad_solver, grad_nuc, get_dD_dS, get_dF_dA, get_dSii from gpu4pyscf.df import int3c2e -from gpu4pyscf.lib.cupy_helper import contract from gpu4pyscf.lib import logger from gpu4pyscf.hessian.jk import _ao2mo +from gpu4pyscf.gto.int3c1e_ip import int1e_grids_ip1, int1e_grids_ip2 +from gpu4pyscf.gto import int3c1e +from gpu4pyscf.gto.int3c1e import int1e_grids def hess_nuc(pcmobj): + raise NotImplementedError("Not tested") if not pcmobj._intermediates: pcmobj.build() mol = pcmobj.mol @@ -150,76 +153,282 @@ def pcm_grad_scanner(mol): pcmobj.reset(pmol) return de -def fd_grad_vmat(pcmobj, dm, mo_coeff, mo_occ, atmlst=None, verbose=None): - ''' - dv_solv / da - slow version with finite difference - ''' - log = logger.new_logger(pcmobj, verbose) - t1 = log.init_timer() - pmol = pcmobj.mol.copy() - mol = pmol.copy() - if atmlst is None: - atmlst = range(mol.natm) - nao, nmo = mo_coeff.shape - mocc = mo_coeff[:,mo_occ>0] - nocc = mocc.shape[1] - coords = mol.atom_coords(unit='Bohr') - def pcm_vmat_scanner(mol): - pcmobj.reset(mol) - e, v = pcmobj._get_vind(dm) - return v +def get_dqsym_dx_fix_vgrids(pcmobj, atmlst, inverse_K): + assert pcmobj._intermediates is not None - mol.verbose = 0 - vmat = cupy.empty([len(atmlst), 3, nao, nocc]) - eps = 1e-3 - for i0, ia in enumerate(atmlst): - for ix in range(3): - dv = numpy.zeros_like(coords) - dv[ia,ix] = eps - mol.set_geom_(coords + dv, unit='Bohr') - vmat0 = pcm_vmat_scanner(mol) + gridslice = pcmobj.surface['gslice_by_atom'] + v_grids = pcmobj._intermediates['v_grids'] + A = pcmobj._intermediates['A'] + D = pcmobj._intermediates['D'] + S = pcmobj._intermediates['S'] + R = pcmobj._intermediates['R'] + q_sym = pcmobj._intermediates['q_sym'] + f_epsilon = pcmobj._intermediates['f_epsilon'] - mol.set_geom_(coords - dv, unit='Bohr') - vmat1 = pcm_vmat_scanner(mol) + ngrids = q_sym.shape[0] - grad_vmat = (vmat0 - vmat1)/2.0/eps - grad_vmat = contract("ij,jq->iq", grad_vmat, mocc) - grad_vmat = contract("iq,ip->pq", grad_vmat, mo_coeff) - vmat[i0,ix] = grad_vmat - t1 = log.timer_debug1('computing solvent grad veff', *t1) - pcmobj.reset(pmol) - return vmat + def get_dS_dot_q(dS, dSii, q, atmlst, gridslice): + output = cupy.einsum('diA,i->Adi', dSii[:,:,atmlst], q) + for i_atom in atmlst: 
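+        # Moving atom i_atom displaces only its own surface grid points, so dS/dx
+        # enters through the rows g0:g1 of those points and, with opposite sign,
+        # through the columns g0:g1; the diagonal term is handled by dSii above.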
+ g0,g1 = gridslice[i_atom] + output[i_atom, :, g0:g1] += cupy.einsum('dij,j->di', dS[:,g0:g1,:], q) + output[i_atom, :, :] -= cupy.einsum('dij,j->di', dS[:,:,g0:g1], q[g0:g1]) + return output + def get_dST_dot_q(dS, dSii, q, atmlst, gridslice): + return get_dS_dot_q(-dS.transpose(0,2,1), dSii, q, atmlst, gridslice) + + def get_dA_dot_q(dA, q, atmlst, gridslice): + return cupy.einsum('diA,i->Adi', dA[:,:,atmlst], q) + + def get_dD_dot_q(dD, q, atmlst, gridslice): + output = cupy.zeros([len(atmlst), 3, ngrids]) + for i_atom in atmlst: + g0,g1 = gridslice[i_atom] + output[i_atom, :, g0:g1] += cupy.einsum('dij,j->di', dD[:,g0:g1,:], q) + output[i_atom, :, :] -= cupy.einsum('dij,j->di', dD[:,:,g0:g1], q[g0:g1]) + return output + def get_dDT_dot_q(dD, q, atmlst, gridslice): + return get_dD_dot_q(-dD.transpose(0,2,1), q, atmlst, gridslice) + + if pcmobj.method.upper() in ['C-PCM', 'CPCM', 'COSMO']: + _, dS = get_dD_dS(pcmobj.surface, with_D=False, with_S=True) + dF, _ = get_dF_dA(pcmobj.surface) + dSii = get_dSii(pcmobj.surface, dF) + dF = None + + # dR = 0, dK = dS + dSdx_dot_q = get_dS_dot_q(dS, dSii, q_sym, atmlst, gridslice) + + dqdx_fix_Vq = cupy.einsum('ij,Adj->Adi', inverse_K, dSdx_dot_q) + + elif pcmobj.method.upper() in ['IEF-PCM', 'IEFPCM', 'SMD']: + dF, dA = get_dF_dA(pcmobj.surface) + dSii = get_dSii(pcmobj.surface, dF) + dF = None + + dD, dS = get_dD_dS(pcmobj.surface, with_D=True, with_S=True) + + # dR = f_eps/(2*pi) * (dD*A + D*dA) + # dK = dS - f_eps/(2*pi) * (dD*A*S + D*dA*S + D*A*dS) + f_eps_over_2pi = f_epsilon/(2.0*PI) + + q = inverse_K @ R @ v_grids + dSdx_dot_q = get_dS_dot_q(dS, dSii, q, atmlst, gridslice) + + DA = D*A + dKdx_dot_q = dSdx_dot_q - f_eps_over_2pi * cupy.einsum('ij,Adj->Adi', DA, dSdx_dot_q) + + dAdx_dot_Sq = get_dA_dot_q(dA, S @ q, atmlst, gridslice) + dKdx_dot_q -= f_eps_over_2pi * cupy.einsum('ij,Adj->Adi', D, dAdx_dot_Sq) + + AS = (A * S.T).T # It's just diag(A) @ S + dDdx_dot_ASq = get_dD_dot_q(dD, AS @ q, atmlst, gridslice) + dKdx_dot_q -= f_eps_over_2pi * dDdx_dot_ASq + + dqdx_fix_Vq = -cupy.einsum('ij,Adj->Adi', inverse_K, dKdx_dot_q) + + dAdx_dot_V = get_dA_dot_q(dA, v_grids, atmlst, gridslice) + + dDdx_dot_AV = get_dD_dot_q(dD, A * v_grids, atmlst, gridslice) + + dRdx_dot_V = f_eps_over_2pi * (dDdx_dot_AV + cupy.einsum('ij,Adj->Adi', D, dAdx_dot_V)) + dqdx_fix_Vq += cupy.einsum('ij,Adj->Adi', inverse_K, dRdx_dot_V) + + invKT_V = inverse_K.T @ v_grids + dDdxT_dot_invKT_V = get_dDT_dot_q(dD, invKT_V, atmlst, gridslice) + + DT_invKT_V = D.T @ invKT_V + dAdxT_dot_DT_invKT_V = get_dA_dot_q(dA, DT_invKT_V, atmlst, gridslice) + dqdx_fix_Vq += f_eps_over_2pi * (cupy.einsum('i,Adi->Adi', A, dDdxT_dot_invKT_V) + dAdxT_dot_DT_invKT_V) + + dSdxT_dot_invKT_V = get_dST_dot_q(dS, dSii, invKT_V, atmlst, gridslice) + dKdxT_dot_invKT_V = dSdxT_dot_invKT_V + + dKdxT_dot_invKT_V -= f_eps_over_2pi * cupy.einsum('ij,Adj->Adi', AS.T, dDdxT_dot_invKT_V) + dKdxT_dot_invKT_V -= f_eps_over_2pi * cupy.einsum('ij,Adj->Adi', S.T, dAdxT_dot_DT_invKT_V) + + dSdxT_dot_AT_DT_invKT_V = get_dST_dot_q(dS, dSii, DA.T @ invKT_V, atmlst, gridslice) + dKdxT_dot_invKT_V -= f_eps_over_2pi * dSdxT_dot_AT_DT_invKT_V + + dqdx_fix_Vq += -cupy.einsum('ij,Adj->Adi', R.T @ inverse_K.T, dKdxT_dot_invKT_V) + + dqdx_fix_Vq *= -0.5 + + elif pcmobj.method.upper() in ['SS(V)PE']: + dF, dA = get_dF_dA(pcmobj.surface) + dSii = get_dSii(pcmobj.surface, dF) + dF = None + + dD, dS = get_dD_dS(pcmobj.surface, with_D=True, with_S=True) -""" -def analytic_grad_vmat(pcmobj, mo_coeff, mo_occ, atmlst=None, 
verbose=None): + f_eps_over_4pi = f_epsilon/(4.0*PI) + + def dK_dot_q(q): + dSdx_dot_q = get_dS_dot_q(dS, dSii, q, atmlst, gridslice) + + DA = D*A + dKdx_dot_q = dSdx_dot_q - f_eps_over_4pi * cupy.einsum('ij,Adj->Adi', DA, dSdx_dot_q) + + dAdx_dot_Sq = get_dA_dot_q(dA, S @ q, atmlst, gridslice) + dKdx_dot_q -= f_eps_over_4pi * cupy.einsum('ij,Adj->Adi', D, dAdx_dot_Sq) + + AS = (A * S.T).T # It's just diag(A) @ S + dDdx_dot_ASq = get_dD_dot_q(dD, AS @ q, atmlst, gridslice) + dKdx_dot_q -= f_eps_over_4pi * dDdx_dot_ASq + + dDdxT_dot_q = get_dDT_dot_q(dD, q, atmlst, gridslice) + dKdx_dot_q -= f_eps_over_4pi * cupy.einsum('ij,Adj->Adi', AS.T, dDdxT_dot_q) + + dAdxT_dot_DT_q = get_dA_dot_q(dA, D.T @ q, atmlst, gridslice) + dKdx_dot_q -= f_eps_over_4pi * cupy.einsum('ij,Adj->Adi', S.T, dAdxT_dot_DT_q) + + dSdxT_dot_AT_DT_q = get_dST_dot_q(dS, dSii, DA.T @ q, atmlst, gridslice) + dKdx_dot_q -= f_eps_over_4pi * dSdxT_dot_AT_DT_q + + return dKdx_dot_q + + f_eps_over_2pi = f_epsilon/(2.0*PI) + + q = inverse_K @ R @ v_grids + dKdx_dot_q = dK_dot_q(q) + dqdx_fix_Vq = -cupy.einsum('ij,Adj->Adi', inverse_K, dKdx_dot_q) + + dAdx_dot_V = get_dA_dot_q(dA, v_grids, atmlst, gridslice) + + dDdx_dot_AV = get_dD_dot_q(dD, A * v_grids, atmlst, gridslice) + + dRdx_dot_V = f_eps_over_2pi * (dDdx_dot_AV + cupy.einsum('ij,Adj->Adi', D, dAdx_dot_V)) + dqdx_fix_Vq += cupy.einsum('ij,Adj->Adi', inverse_K, dRdx_dot_V) + + invKT_V = inverse_K.T @ v_grids + dDdxT_dot_invKT_V = get_dDT_dot_q(dD, invKT_V, atmlst, gridslice) + + DT_invKT_V = D.T @ invKT_V + dAdxT_dot_DT_invKT_V = get_dA_dot_q(dA, DT_invKT_V, atmlst, gridslice) + dqdx_fix_Vq += f_eps_over_2pi * (cupy.einsum('i,Adi->Adi', A, dDdxT_dot_invKT_V) + dAdxT_dot_DT_invKT_V) + + dKdx_dot_invKT_V = dK_dot_q(invKT_V) + dqdx_fix_Vq += -cupy.einsum('ij,Adj->Adi', R.T @ inverse_K.T, dKdx_dot_invKT_V) + + dqdx_fix_Vq *= -0.5 + + else: + raise RuntimeError(f"Unknown implicit solvent model: {pcmobj.method}") + + return dqdx_fix_Vq + +def get_dqsym_dx_fix_K_R(pcmobj, dm, atmlst, inverse_K, intopt_derivative): + assert pcmobj._intermediates is not None + + mol = pcmobj.mol + gridslice = pcmobj.surface['gslice_by_atom'] + charge_exp = pcmobj.surface['charge_exp'] + grid_coords = pcmobj.surface['grid_coords'] + R = pcmobj._intermediates['R'] + + atom_coords = mol.atom_coords(unit='B') + atom_charges = numpy.asarray(mol.atom_charges(), dtype=numpy.float64) + atom_coords = atom_coords[atmlst] + atom_charges = atom_charges[atmlst] + fakemol_nuc = gto.fakemol_for_charges(atom_coords) + fakemol = gto.fakemol_for_charges(grid_coords.get(), expnt=charge_exp.get()**2) + int2c2e_ip1 = mol._add_suffix('int2c2e_ip1') + v_ng_ip1 = gto.mole.intor_cross(int2c2e_ip1, fakemol_nuc, fakemol) + v_ng_ip1 = cupy.array(v_ng_ip1) + dV_on_charge_dx = cupy.einsum('dAq,A->Adq', v_ng_ip1, atom_charges) + + v_ng_ip2 = gto.mole.intor_cross(int2c2e_ip1, fakemol, fakemol_nuc) + v_ng_ip2 = cupy.array(v_ng_ip2) + for i_atom in atmlst: + g0,g1 = gridslice[i_atom] + dV_on_charge_dx[i_atom,:,g0:g1] += cupy.einsum('dqA,A->dq', v_ng_ip2[:,g0:g1,:], atom_charges) + + dIdA = int1e_grids_ip1(mol, grid_coords, dm = dm + dm.T, intopt = intopt_derivative, charge_exponents = charge_exp**2) + dV_on_charge_dx[atmlst,:,:] -= dIdA[atmlst,:,:] + + dIdC = int1e_grids_ip2(mol, grid_coords, intopt = intopt_derivative, dm = dm, charge_exponents = charge_exp**2) + for i_atom in atmlst: + g0,g1 = gridslice[i_atom] + dV_on_charge_dx[i_atom,:,g0:g1] -= dIdC[:,g0:g1] + + KR_symmetrized = 0.5 * (inverse_K @ R + R.T @ inverse_K.T) + 
dqdx_fix_K_R = cupy.einsum('ij,Adj->Adi', KR_symmetrized, dV_on_charge_dx) + + return dqdx_fix_K_R + +def get_dqsym_dx(pcmobj, dm, atmlst, intopt_derivative): + K = pcmobj._intermediates['K'] + inverse_K = cupy.linalg.inv(K) + return get_dqsym_dx_fix_vgrids(pcmobj, atmlst, inverse_K) + get_dqsym_dx_fix_K_R(pcmobj, dm, atmlst, inverse_K, intopt_derivative) + +def analytic_grad_vmat(pcmobj, dm, mo_coeff, mo_occ, atmlst=None, verbose=None): ''' dv_solv / da - slow version with finite difference ''' + if not pcmobj._intermediates: + pcmobj.build() + dm_cache = pcmobj._intermediates.get('dm', None) + if dm_cache is not None and cupy.linalg.norm(dm_cache - dm) < 1e-10: + pass + else: + pcmobj._get_vind(dm) + mol = pcmobj.mol log = logger.new_logger(pcmobj, verbose) t1 = log.init_timer() - pmol = pcmobj.mol.copy() - mol = pmol.copy() - if atmlst is None: - atmlst = range(mol.natm) + nao, nmo = mo_coeff.shape mocc = mo_coeff[:,mo_occ>0] nocc = mocc.shape[1] - dm = cupy.dot(mocc, mocc.T) * 2 - coords = mol.atom_coords(unit='Bohr') - # TODO: add those contributions - # contribution due to _get_v - # contribution due to linear solver - # contribution due to _get_vmat + if atmlst is None: + atmlst = range(mol.natm) + + gridslice = pcmobj.surface['gslice_by_atom'] + charge_exp = pcmobj.surface['charge_exp'] + grid_coords = pcmobj.surface['grid_coords'] + q_sym = pcmobj._intermediates['q_sym'] - vmat = cupy.zeros([len(atmlst), 3, nao, nocc]) + aoslice = mol.aoslice_by_atom() + aoslice = numpy.array(aoslice) + + intopt_fock = int3c1e.VHFOpt(mol) + intopt_fock.build(cutoff = 1e-14, aosym = True) + intopt_derivative = int3c1e.VHFOpt(mol) + intopt_derivative.build(cutoff = 1e-14, aosym = False) + + dIdx_mo = cupy.empty([len(atmlst), 3, nmo, nocc]) + + dIdA = int1e_grids_ip1(mol, grid_coords, charges = q_sym, intopt = intopt_derivative, charge_exponents = charge_exp**2) + for i_atom in atmlst: + p0,p1 = aoslice[i_atom, 2:] + # dIdx[i_atom, :, :, :] = 0 + # dIdx[i_atom, :, p0:p1, :] += dIdA[:, p0:p1, :] + # dIdx[i_atom, :, :, p0:p1] += dIdA[:, p0:p1, :].transpose(0,2,1) + dIdA_mo = dIdA[:, p0:p1, :] @ mocc + dIdA_mo = cupy.einsum('ip,dpj->dij', mo_coeff[p0:p1, :].T, dIdA_mo) + dIdB_mo = dIdA[:, p0:p1, :].transpose(0,2,1) @ mocc[p0:p1, :] + dIdB_mo = cupy.einsum('ip,dpj->dij', mo_coeff.T, dIdB_mo) + dIdx_mo[i_atom, :, :, :] = dIdA_mo + dIdB_mo + + for i_atom in atmlst: + g0,g1 = gridslice[i_atom] + dIdC = int1e_grids_ip2(mol, grid_coords[g0:g1,:], charges = q_sym[g0:g1], + intopt = intopt_derivative, charge_exponents = charge_exp[g0:g1]**2) + dIdC_mo = dIdC @ mocc + dIdC_mo = cupy.einsum('ip,dpj->dij', mo_coeff.T, dIdC_mo) + dIdx_mo[i_atom, :, :, :] += dIdC_mo + + dV_on_molecule_dx_mo = dIdx_mo + + dqdx = get_dqsym_dx(pcmobj, dm, atmlst, intopt_derivative) + for i_atom in atmlst: + for i_xyz in range(3): + dIdx_from_dqdx = int1e_grids(mol, grid_coords, charges = dqdx[i_atom, i_xyz, :], + intopt = intopt_fock, charge_exponents = charge_exp**2) + dV_on_molecule_dx_mo[i_atom, i_xyz, :, :] += mo_coeff.T @ dIdx_from_dqdx @ mocc t1 = log.timer_debug1('computing solvent grad veff', *t1) - pcmobj.reset(pmol) - return vmat -""" + return dV_on_molecule_dx_mo def make_hess_object(hess_method): if hess_method.base.with_solvent.frozen: @@ -274,7 +483,7 @@ def make_h1(self, mo_coeff, mo_occ, chkfile=None, atmlst=None, verbose=None): h1ao = super().make_h1(mo_coeff, mo_occ, atmlst=atmlst, verbose=verbose) if isinstance(self.base, scf.hf.RHF): dm = self.base.make_rdm1(ao_repr=True) - dv = fd_grad_vmat(self.base.with_solvent, 
dm, mo_coeff, mo_occ, atmlst=atmlst, verbose=verbose) + dv = analytic_grad_vmat(self.base.with_solvent, dm, mo_coeff, mo_occ, atmlst=atmlst, verbose=verbose) for i0, ia in enumerate(atmlst): h1ao[i0] += dv[i0] return h1ao @@ -283,8 +492,8 @@ def make_h1(self, mo_coeff, mo_occ, chkfile=None, atmlst=None, verbose=None): solvent = self.base.with_solvent dm = self.base.make_rdm1(ao_repr=True) dm = dm[0] + dm[1] - dva = fd_grad_vmat(solvent, dm, mo_coeff[0], mo_occ[0], atmlst=atmlst, verbose=verbose) - dvb = fd_grad_vmat(solvent, dm, mo_coeff[1], mo_occ[1], atmlst=atmlst, verbose=verbose) + dva = analytic_grad_vmat(solvent, dm, mo_coeff[0], mo_occ[0], atmlst=atmlst, verbose=verbose) + dvb = analytic_grad_vmat(solvent, dm, mo_coeff[1], mo_occ[1], atmlst=atmlst, verbose=verbose) for i0, ia in enumerate(atmlst): h1aoa[i0] += dva[i0] h1aob[i0] += dvb[i0] diff --git a/gpu4pyscf/solvent/hessian/smd.py b/gpu4pyscf/solvent/hessian/smd.py index 644c0e3f..49897d74 100644 --- a/gpu4pyscf/solvent/hessian/smd.py +++ b/gpu4pyscf/solvent/hessian/smd.py @@ -154,7 +154,7 @@ def make_h1(self, mo_coeff, mo_occ, chkfile=None, atmlst=None, verbose=None): h1ao = super().make_h1(mo_coeff, mo_occ, atmlst=atmlst, verbose=verbose) if isinstance(self.base, scf.hf.RHF): dm = self.base.make_rdm1(ao_repr=True) - dv = pcm_hess.fd_grad_vmat(self.base.with_solvent, dm, mo_coeff, mo_occ, atmlst=atmlst, verbose=verbose) + dv = pcm_hess.analytic_grad_vmat(self.base.with_solvent, dm, mo_coeff, mo_occ, atmlst=atmlst, verbose=verbose) for i0, ia in enumerate(atmlst): h1ao[i0] += dv[i0] return h1ao @@ -163,8 +163,8 @@ def make_h1(self, mo_coeff, mo_occ, chkfile=None, atmlst=None, verbose=None): solvent = self.base.with_solvent dm = self.base.make_rdm1(ao_repr=True) dm = dm[0] + dm[1] - dva = pcm_hess.fd_grad_vmat(solvent, dm, mo_coeff[0], mo_occ[0], atmlst=atmlst, verbose=verbose) - dvb = pcm_hess.fd_grad_vmat(solvent, dm, mo_coeff[1], mo_occ[1], atmlst=atmlst, verbose=verbose) + dva = pcm_hess.analytic_grad_vmat(solvent, dm, mo_coeff[0], mo_occ[0], atmlst=atmlst, verbose=verbose) + dvb = pcm_hess.analytic_grad_vmat(solvent, dm, mo_coeff[1], mo_occ[1], atmlst=atmlst, verbose=verbose) for i0, ia in enumerate(atmlst): h1aoa[i0] += dva[i0] h1aob[i0] += dvb[i0] diff --git a/gpu4pyscf/solvent/tests/test_pcm_hessian.py b/gpu4pyscf/solvent/tests/test_pcm_hessian.py index 33bf0e67..c7076f29 100644 --- a/gpu4pyscf/solvent/tests/test_pcm_hessian.py +++ b/gpu4pyscf/solvent/tests/test_pcm_hessian.py @@ -14,12 +14,15 @@ import unittest import numpy as np +import cupy as cp import pyscf import pytest from pyscf import gto from gpu4pyscf.solvent import pcm from gpu4pyscf import scf, dft from packaging import version +from gpu4pyscf.solvent.hessian.pcm import analytic_grad_vmat +from gpu4pyscf.lib.cupy_helper import contract pyscf_25 = version.parse(pyscf.__version__) <= version.parse('2.5.0') @@ -50,7 +53,7 @@ def _make_mf(method='C-PCM', restricted=True, density_fit=True): mf = dft.rks.RKS(mol, xc=xc) else: mf = dft.uks.UKS(mol, xc=xc) - + if density_fit: mf = mf.density_fit() mf = mf.PCM() @@ -89,6 +92,44 @@ def _check_hessian(mf, h, ix=0, iy=0): print(f'Norm of H({ix},{iy}) diff, {np.linalg.norm(h[ix,:,iy,:] - h_fd)}') assert(np.linalg.norm(h[ix,:,iy,:] - h_fd) < tol) +def _fd_grad_vmat(pcmobj, dm, mo_coeff, mo_occ, atmlst=None): + ''' + dv_solv / da + slow version with finite difference + ''' + pmol = pcmobj.mol.copy() + mol = pmol.copy() + if atmlst is None: + atmlst = range(mol.natm) + nao, nmo = mo_coeff.shape + mocc = mo_coeff[:,mo_occ>0] + 
nocc = mocc.shape[1] + coords = mol.atom_coords(unit='Bohr') + def pcm_vmat_scanner(mol): + pcmobj.reset(mol) + e, v = pcmobj._get_vind(dm) + return v + + mol.verbose = 0 + vmat = cp.empty([len(atmlst), 3, nao, nocc]) + eps = 1e-5 + for i0, ia in enumerate(atmlst): + for ix in range(3): + dv = np.zeros_like(coords) + dv[ia,ix] = eps + mol.set_geom_(coords + dv, unit='Bohr') + vmat0 = pcm_vmat_scanner(mol) + + mol.set_geom_(coords - dv, unit='Bohr') + vmat1 = pcm_vmat_scanner(mol) + + grad_vmat = (vmat0 - vmat1)/2.0/eps + grad_vmat = contract("ij,jq->iq", grad_vmat, mocc) + grad_vmat = contract("iq,ip->pq", grad_vmat, mo_coeff) + vmat[i0,ix] = grad_vmat + pcmobj.reset(pmol) + return vmat + @unittest.skipIf(pcm.libsolvent is None, "solvent extension not compiled") class KnownValues(unittest.TestCase): def test_df_hess_cpcm(self): @@ -142,6 +183,48 @@ def test_uks_hess_iefpcm(self): _check_hessian(mf, h, ix=0, iy=0) _check_hessian(mf, h, ix=0, iy=1) + def test_grad_vmat_cpcm(self): + print("testing C-PCM dV_solv/dx") + mf = _make_mf(method='C-PCM') + hobj = mf.Hessian() + + dm = mf.make_rdm1() + mo_coeff = mf.mo_coeff + mo_occ = mf.mo_occ + + test_grad_vmat = analytic_grad_vmat(hobj.base.with_solvent, dm, mo_coeff, mo_occ) + ref_grad_vmat = _fd_grad_vmat(hobj.base.with_solvent, dm, mo_coeff, mo_occ) + + cp.testing.assert_allclose(ref_grad_vmat, test_grad_vmat, atol = 1e-10) + + def test_grad_vmat_iefpcm(self): + print("testing IEF-PCM dV_solv/dx") + mf = _make_mf(method='IEF-PCM') + hobj = mf.Hessian() + + dm = mf.make_rdm1() + mo_coeff = mf.mo_coeff + mo_occ = mf.mo_occ + + test_grad_vmat = analytic_grad_vmat(hobj.base.with_solvent, dm, mo_coeff, mo_occ) + ref_grad_vmat = _fd_grad_vmat(hobj.base.with_solvent, dm, mo_coeff, mo_occ) + + cp.testing.assert_allclose(ref_grad_vmat, test_grad_vmat, atol = 1e-10) + + def test_grad_vmat_ssvpe(self): + print("testing SS(V)PE dV_solv/dx") + mf = _make_mf(method='SS(V)PE') + hobj = mf.Hessian() + + dm = mf.make_rdm1() + mo_coeff = mf.mo_coeff + mo_occ = mf.mo_occ + + test_grad_vmat = analytic_grad_vmat(hobj.base.with_solvent, dm, mo_coeff, mo_occ) + ref_grad_vmat = _fd_grad_vmat(hobj.base.with_solvent, dm, mo_coeff, mo_occ) + + cp.testing.assert_allclose(ref_grad_vmat, test_grad_vmat, atol = 1e-10) + @pytest.mark.skipif(pyscf_25, reason='requires pyscf 2.6 or higher') def test_to_gpu(self): import pyscf @@ -187,7 +270,7 @@ def test_to_cpu(self): mol.basis = 'sto-3g' mol.output = '/dev/null' mol.build(verbose=0) - + mf = dft.RKS(mol, xc='b3lyp').PCM() mf.conv_tol = 1e-12 mf.conv_tol_cpscf = 1e-7 @@ -209,6 +292,7 @@ def test_to_cpu(self): hessobj = hessobj.to_cpu() hess_cpu = hessobj.kernel() assert np.linalg.norm(hess_cpu - hess_gpu) < 1e-5 + if __name__ == "__main__": print("Full Tests for Hessian of PCMs") unittest.main()
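
Note (editorial summary, not part of the diff): the splitting used by the new `get_dqsym_dx` helpers follows the standard matrix-calculus identity for the PCM surface charge, with signs and the q_sym symmetrization handled according to the conventions of `get_dD_dS`:

q = K^{-1} R\, v, \qquad
\frac{\partial q}{\partial x}
  = K^{-1}\!\left(\frac{\partial R}{\partial x}\, v
                  - \frac{\partial K}{\partial x}\, q\right)
  + K^{-1} R\, \frac{\partial v}{\partial x},
\qquad
\frac{\partial K^{-1}}{\partial x} = -K^{-1}\,\frac{\partial K}{\partial x}\,K^{-1}.

The first parenthesis (grid potential v held fixed) is what `get_dqsym_dx_fix_vgrids` assembles for each solvent model, and the last term (K and R held fixed) is `get_dqsym_dx_fix_K_R`; for the symmetrized charge the code additionally carries the transposed K^{-T}, R^{T} contributions and the factor of 0.5 visible above.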
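
A minimal numpy sketch of that first-order splitting (illustrative only; K, R, v below are random stand-ins, not the PCM matrices, and the PCM sign/symmetrization conventions are omitted):

import numpy as np

# Random, well-conditioned stand-ins for the PCM K and R matrices and grid potential v.
rng = np.random.default_rng(0)
n = 8
K = rng.standard_normal((n, n)) + n * np.eye(n)   # diagonally dominant -> safely invertible
R = rng.standard_normal((n, n))
v = rng.standard_normal(n)

# Small perturbations standing in for (dK/dx)*dx, (dR/dx)*dx, (dv/dx)*dx.
h = 1e-6
dK = rng.standard_normal((n, n)) * h
dR = rng.standard_normal((n, n)) * h
dv = rng.standard_normal(n) * h

q0 = np.linalg.solve(K, R @ v)
q1 = np.linalg.solve(K + dK, (R + dR) @ (v + dv))

inv_K = np.linalg.inv(K)
# "fix_vgrids"-type terms (v held fixed) plus "fix_K_R"-type term (K, R held fixed).
dq_fix_vgrids = inv_K @ (dR @ v - dK @ q0)
dq_fix_K_R = inv_K @ (R @ dv)
dq_first_order = dq_fix_vgrids + dq_fix_K_R

# The residual is O(h^2), i.e. orders of magnitude smaller than |q1 - q0| itself.
print(np.linalg.norm((q1 - q0) - dq_first_order), np.linalg.norm(q1 - q0))

This mirrors how the analytic result is validated in the new tests: the analytic derivative is compared against a central finite-difference reference (`_fd_grad_vmat`) rather than computed from it.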