From 9b694fa0419525799194df14092526d3986e8375 Mon Sep 17 00:00:00 2001 From: henryw7 <60555480+henryw7@users.noreply.github.com> Date: Wed, 8 Jan 2025 23:09:54 -0800 Subject: [PATCH] Analytical PCM righthand side of CPHF, analytic derivative of V_ia(q) (#298) * C-PCM righthand side of CPHF, analytic derivative of V_munu(q), trash implementation that works * dVia/dx works for IEFPCM * SSVPE is different from ICEPCM without contraction * Small mistake * Remove ngrids * nao * nao memory bottleneck of pcm V_munu matrix derivative * Fix linter error * Remove natm * 3 * nao * nao memory bottleneck, now the bottleneck is natm * 3 * nmo * nocc. Also bug fix here and there * Remove the numerical implementation of dVia/dx * Change finite difference implementation to test * Remove abuse use of cp.einsum and cp.zeros * Reorganize code for 2nd derivative term --- gpu4pyscf/gto/int3c1e_ip.py | 191 +++++++++++- gpu4pyscf/gto/moleintor.py | 58 ---- gpu4pyscf/gto/tests/test_int1e_grids_ip.py | 71 ++++- gpu4pyscf/lib/gint/g1e_ip_root_1.cu | 176 +++++++++++ gpu4pyscf/lib/gint/g3c1e_ip.cu | 215 +++++++++++++ gpu4pyscf/lib/gint/nr_fill_ao_int3c1e_ip.cu | 209 ++++++++++++- gpu4pyscf/qmmm/pbc/itrf.py | 2 +- gpu4pyscf/solvent/hessian/pcm.py | 327 ++++++++++++++++---- gpu4pyscf/solvent/hessian/smd.py | 6 +- gpu4pyscf/solvent/tests/test_pcm_hessian.py | 88 +++++- 10 files changed, 1197 insertions(+), 146 deletions(-) delete mode 100644 gpu4pyscf/gto/moleintor.py diff --git a/gpu4pyscf/gto/int3c1e_ip.py b/gpu4pyscf/gto/int3c1e_ip.py index c2aeb1be..8b47adce 100644 --- a/gpu4pyscf/gto/int3c1e_ip.py +++ b/gpu4pyscf/gto/int3c1e_ip.py @@ -105,7 +105,7 @@ def get_int3c1e_ip(mol, grids, charge_exponents, intopt): int3c_angular_slice = cart2sph(int3c_angular_slice, axis=2, ang=lj) int3c_angular_slice = cart2sph(int3c_angular_slice, axis=3, ang=li) - int3c_grid_slice[:, :, j0:j1, i0:i1] = int3c_angular_slice + int3c_grid_slice[:, :, i0:i1, j0:j1] = int3c_angular_slice.transpose(0,1,3,2) ao_idx = np.argsort(intopt._ao_idx) grid_idx = np.arange(p1-p0) @@ -193,10 +193,69 @@ def get_int3c1e_ip1_charge_contracted(mol, grids, charge_exponents, charges, int int1e_angular_slice = cart2sph(int1e_angular_slice, axis=1, ang=lj) int1e_angular_slice = cart2sph(int1e_angular_slice, axis=2, ang=li) - int1e_charge_contracted[:, j0:j1, i0:i1] = int1e_angular_slice + int1e_charge_contracted[:, i0:i1, j0:j1] = int1e_angular_slice.transpose(0,2,1) return intopt.unsort_orbitals(int1e_charge_contracted, axis=[1,2]) +def get_int3c1e_ip1_density_contracted(mol, grids, charge_exponents, dm, intopt): + omega = mol.omega + assert omega >= 0.0, "Short-range one electron integrals with GPU acceleration is not implemented." 
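+    # Contract the bra-derivative integrals (d/dA mu | 1/|r-C| | nu) with the density
+    # matrix inside the CUDA kernel, accumulating per atom of the bra shell, so the
+    # result has shape (natm, 3, ngrids) and no natm * 3 * nao * nao intermediate is formed.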
+ + ngrids = grids.shape[0] + grids = cp.asarray(grids, order='C') + if charge_exponents is not None: + charge_exponents = cp.asarray(charge_exponents, order='C') + + dm = cp.asarray(dm) + assert dm.ndim == 2 + assert dm.shape[0] == dm.shape[1] and dm.shape[0] == mol.nao + + dm = intopt.sort_orbitals(dm, [0,1]) + if not mol.cart: + cart2sph_transformation_matrix = intopt.cart2sph + # TODO: This part is inefficient (O(N^3)), should be changed to the O(N^2) algorithm + dm = cart2sph_transformation_matrix @ dm @ cart2sph_transformation_matrix.T + dm = dm.flatten(order='F') # Column major order matches (i + j * n_ao) access pattern in the C function + + nao = intopt._sorted_mol.nao + + i_atom_of_each_shell = intopt._sorted_mol._bas[:, ATOM_OF] + i_atom_of_each_shell = cp.array(i_atom_of_each_shell, dtype=np.int32) + + ip1_per_atom = cp.zeros([mol.natm, 3, ngrids]) + + for cp_ij_id, _ in enumerate(intopt.log_qs): + stream = cp.cuda.get_current_stream() + + log_q_ij = intopt.log_qs[cp_ij_id] + + nbins = 1 + bins_locs_ij = np.array([0, len(log_q_ij)], dtype=np.int32) + + charge_exponents_pointer = c_null_ptr() + if charge_exponents is not None: + charge_exponents_pointer = charge_exponents.data.ptr + + err = libgint.GINTfill_int3c1e_ip1_density_contracted( + ctypes.cast(stream.ptr, ctypes.c_void_p), + intopt.bpcache, + ctypes.cast(grids.data.ptr, ctypes.c_void_p), + ctypes.cast(charge_exponents_pointer, ctypes.c_void_p), + ctypes.c_int(ngrids), + ctypes.cast(ip1_per_atom.data.ptr, ctypes.c_void_p), + bins_locs_ij.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(nbins), + ctypes.c_int(cp_ij_id), + ctypes.cast(dm.data.ptr, ctypes.c_void_p), + ctypes.cast(i_atom_of_each_shell.data.ptr, ctypes.c_void_p), + ctypes.c_int(nao), + ctypes.c_double(omega)) + + if err != 0: + raise RuntimeError('GINTfill_int3c1e_charge_contracted failed') + + return ip1_per_atom + def get_int3c1e_ip2_density_contracted(mol, grids, charge_exponents, dm, intopt): omega = mol.omega assert omega >= 0.0, "Short-range one electron integrals with GPU acceleration is not implemented." @@ -290,6 +349,82 @@ def get_int3c1e_ip2_density_contracted(mol, grids, charge_exponents, dm, intopt) return int3c_density_contracted +def get_int3c1e_ip2_charge_contracted(mol, grids, charge_exponents, charges, gridslice, output, intopt): + omega = mol.omega + assert omega >= 0.0, "Short-range one electron integrals with GPU acceleration is not implemented." 
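+    # Contract the operator-derivative integrals (mu | d/dC 1/|r-C| | nu) with the grid
+    # charges q_C, accumulating each grid point into the atom it belongs to (given by
+    # gridslice), so `output` of shape (natm, 3, nao, nao) is filled in place.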
+ + ngrids = grids.shape[0] + grids = cp.asarray(grids, order='C') + if charge_exponents is not None: + charge_exponents = cp.asarray(charge_exponents, order='C') + + assert charges.ndim == 1 and charges.shape[0] == grids.shape[0] + charges = cp.asarray(charges).astype(np.float64) + + charges = charges.reshape([-1, 1], order='C') + grids = cp.concatenate([grids, charges], axis=1) + + n_atom = len(gridslice) + i_atom_of_each_charge = [[i_atom] * (gridslice[i_atom][1] - gridslice[i_atom][0]) for i_atom in range(n_atom)] + i_atom_of_each_charge = sum(i_atom_of_each_charge, []) + i_atom_of_each_charge = cp.array(i_atom_of_each_charge, dtype=np.int32) + + assert isinstance(output, cp.ndarray) + assert output.shape == (n_atom, 3, mol.nao, mol.nao) + + for cp_ij_id, _ in enumerate(intopt.log_qs): + cpi = intopt.cp_idx[cp_ij_id] + cpj = intopt.cp_jdx[cp_ij_id] + li = intopt.angular[cpi] + lj = intopt.angular[cpj] + + stream = cp.cuda.get_current_stream() + + log_q_ij = intopt.log_qs[cp_ij_id] + + nbins = 1 + bins_locs_ij = np.array([0, len(log_q_ij)], dtype=np.int32) + + i0, i1 = intopt.cart_ao_loc[cpi], intopt.cart_ao_loc[cpi+1] + j0, j1 = intopt.cart_ao_loc[cpj], intopt.cart_ao_loc[cpj+1] + ni = i1 - i0 + nj = j1 - j0 + + ao_offsets = np.array([i0, j0], dtype=np.int32) + strides = np.array([ni, ni*nj], dtype=np.int32) + + charge_exponents_pointer = c_null_ptr() + if charge_exponents is not None: + charge_exponents_pointer = charge_exponents.data.ptr + + int1e_angular_slice = cp.zeros([n_atom, 3, j1-j0, i1-i0], order='C') + + err = libgint.GINTfill_int3c1e_ip2_charge_contracted( + ctypes.cast(stream.ptr, ctypes.c_void_p), + intopt.bpcache, + ctypes.cast(grids.data.ptr, ctypes.c_void_p), + ctypes.cast(charge_exponents_pointer, ctypes.c_void_p), + ctypes.c_int(ngrids), + ctypes.cast(int1e_angular_slice.data.ptr, ctypes.c_void_p), + strides.ctypes.data_as(ctypes.c_void_p), + ao_offsets.ctypes.data_as(ctypes.c_void_p), + bins_locs_ij.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(nbins), + ctypes.c_int(cp_ij_id), + ctypes.cast(i_atom_of_each_charge.data.ptr, ctypes.c_void_p), + ctypes.c_double(omega)) + + if err != 0: + raise RuntimeError('GINTfill_int3c1e_charge_contracted failed') + + i0, i1 = intopt.ao_loc[cpi], intopt.ao_loc[cpi+1] + j0, j1 = intopt.ao_loc[cpj], intopt.ao_loc[cpj+1] + if not mol.cart: + int1e_angular_slice = cart2sph(int1e_angular_slice, axis=2, ang=lj) + int1e_angular_slice = cart2sph(int1e_angular_slice, axis=3, ang=li) + + output[np.ix_(range(n_atom), range(3), intopt._ao_idx[i0:i1], intopt._ao_idx[j0:j1])] += int1e_angular_slice.transpose(0,1,3,2) + def get_int3c1e_ip1_charge_and_density_contracted(mol, grids, charge_exponents, dm, charges, intopt): dm = cp.asarray(dm) if dm.ndim == 3: @@ -302,7 +437,7 @@ def get_int3c1e_ip1_charge_and_density_contracted(mol, grids, charge_exponents, assert dm.shape[0] == dm.shape[1] and dm.shape[0] == mol.nao int3c_ip1 = get_int3c1e_ip1_charge_contracted(mol, grids, charge_exponents, charges, intopt) - int3c_ip1 = cp.einsum('xji,ij->xi', int3c_ip1, dm) + int3c_ip1 = cp.einsum('xij,ij->xi', int3c_ip1, dm) return int3c_ip1 def get_int3c1e_ip2_charge_and_density_contracted(mol, grids, charge_exponents, dm, charges, intopt): @@ -319,13 +454,18 @@ def int1e_grids_ip1(mol, grids, charge_exponents=None, dm=None, charges=None, di $$\left(\frac{\partial}{\partial \vec{A}} \mu \middle| \frac{1}{|\vec{r} - \vec{C}|} \middle| \nu\right)$$ where $\mu(\vec{r})$ centers at $\vec{A}$ and $\nu(\vec{r})$ centers at $\vec{B}$. 
- If charges is not None, the function computes the following contraction: + If charges is not None and density is None, the function computes the following contraction: $$\sum_{C}^{n_{charge}} q_C \left(\frac{\partial}{\partial \vec{A}} \mu \middle| \frac{1}{|\vec{r} - \vec{C}|} \middle| \nu\right)$$ where $q_C$ is the charge centered at $\vec{C}$. If charges is not None and dm is not None, the function computes the following contraction: $$\sum_\nu^{n_{ao}} D_{\mu\nu} \sum_{C}^{n_{charge}} q_C \left(\frac{\partial}{\partial \vec{A}} \mu \middle| \frac{1}{|\vec{r} - \vec{C}|} \middle| \nu\right)$$ + + If dm is not None and charges is None, the function computes the following contraction: + $$\sum_{\mu \in \{\text{AO of atom A}\}} \sum_\nu^{n_{ao}} D_{\mu\nu} + \left(\frac{\partial}{\partial \vec{A}} \mu \middle| \frac{1}{|\vec{r} - \vec{C}|} \middle| \nu\right)$$ + The output dimension is $(n_{atom}, 3, n_{charge})$. ''' assert grids is not None @@ -340,12 +480,14 @@ def int1e_grids_ip1(mol, grids, charge_exponents=None, dm=None, charges=None, di if dm is None and charges is None: return get_int3c1e_ip(mol, grids, charge_exponents, intopt)[0] - else: - assert charges is not None + elif charges is not None: if dm is not None: return get_int3c1e_ip1_charge_and_density_contracted(mol, grids, charge_exponents, dm, charges, intopt) else: return get_int3c1e_ip1_charge_contracted(mol, grids, charge_exponents, charges, intopt) + else: + assert dm is not None + return get_int3c1e_ip1_density_contracted(mol, grids, charge_exponents, dm, intopt) def int1e_grids_ip2(mol, grids, charge_exponents=None, dm=None, charges=None, direct_scf_tol=1e-13, intopt=None): r''' @@ -353,12 +495,16 @@ def int1e_grids_ip2(mol, grids, charge_exponents=None, dm=None, charges=None, di $$\left(\mu \middle| \frac{\partial}{\partial \vec{C}} \frac{1}{|\vec{r} - \vec{C}|} \middle| \nu\right)$$ where $\mu(\vec{r})$ centers at $\vec{A}$ and $\nu(\vec{r})$ centers at $\vec{B}$. - If dm is not None, the function computes the following contraction: + If dm is not None and charges is None, the function computes the following contraction: $$\sum_{\mu, \nu}^{n_{ao}} D_{\mu\nu} \left(\mu \middle| \frac{\partial}{\partial \vec{C}} \frac{1}{|\vec{r} - \vec{C}|} \middle| \nu\right)$$ If dm is not None and charges is not None, the function computes the following contraction: $$q_C \sum_{\mu, \nu}^{n_{ao}} D_{\mu\nu} \left(\mu \middle| \frac{\partial}{\partial \vec{C}} \frac{1}{|\vec{r} - \vec{C}|} \middle| \nu\right)$$ where $q_C$ is the charge centered at $\vec{C}$. + + If charges is not None and dm is None, the function computes the following contraction: + $$\sum_{C}^{n_{charge}} q_C \left(\mu \middle| \frac{\partial}{\partial \vec{C}} \frac{1}{|\vec{r} - \vec{C}|} \middle| \nu\right)$$ + Notice that this summation should not be performed if the charges originates from different atomic centers. 
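+
+    A minimal usage sketch for the charges-only case (assuming `mol`, `grids` of shape
+    (ngrids, 3) and `charges` of shape (ngrids,) are already defined):
+
+    >>> dV_dC = int1e_grids_ip2(mol, grids, charges = charges)
+    >>> dV_dC.shape   # (3, nao, nao), summed over all grid charges
+
+    If the charges belong to different atomic centers, use
+    int1e_grids_ip2_charge_contracted with a per-atom gridslice instead.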
''' assert grids is not None @@ -373,9 +519,36 @@ def int1e_grids_ip2(mol, grids, charge_exponents=None, dm=None, charges=None, di if dm is None and charges is None: return get_int3c1e_ip(mol, grids, charge_exponents, intopt)[1] - else: - assert dm is not None + elif dm is not None: if charges is not None: return get_int3c1e_ip2_charge_and_density_contracted(mol, grids, charge_exponents, dm, charges, intopt) else: return get_int3c1e_ip2_density_contracted(mol, grids, charge_exponents, dm, intopt) + else: + assert charges is not None + output = cp.zeros([1, 3, mol.nao, mol.nao]) + get_int3c1e_ip2_charge_contracted(mol, grids, charge_exponents, charges, [[0, grids.shape[0]]], output, intopt) + return output.reshape([3, mol.nao, mol.nao]) + +def int1e_grids_ip2_charge_contracted(mol, grids, charges, gridslice, output, charge_exponents=None, direct_scf_tol=1e-13, intopt=None): + r''' + This function computes the following contraction: + $$\sum_{C \in \{\text{grid attached to atom A}\}} q_C + \left(\mu \middle| \frac{\partial}{\partial \vec{C}} \frac{1}{|\vec{r} - \vec{C}|} \middle| \nu\right)$$ + where $q_C$ is the charge centered at $\vec{C}$. The output dimension is $(n_{atom}, 3, n_{ao}, n_{ao})$. + ''' + assert grids is not None + assert charges is not None + assert gridslice is not None + assert output is not None + + if intopt is None: + intopt = VHFOpt(mol) + intopt.build(direct_scf_tol, aosym=False) + else: + assert isinstance(intopt, VHFOpt), \ + f"Please make sure intopt is a {VHFOpt.__module__}.{VHFOpt.__name__} object." + assert hasattr(intopt, "density_offset"), "Please call build() function for VHFOpt object first." + assert not intopt.aosym + + return get_int3c1e_ip2_charge_contracted(mol, grids, charge_exponents, charges, gridslice, output, intopt) diff --git a/gpu4pyscf/gto/moleintor.py b/gpu4pyscf/gto/moleintor.py deleted file mode 100644 index f386aed2..00000000 --- a/gpu4pyscf/gto/moleintor.py +++ /dev/null @@ -1,58 +0,0 @@ -# Copyright 2021-2024 The PySCF Developers. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import ctypes -import cupy as cp -import numpy as np - -from gpu4pyscf.gto.int3c1e import VHFOpt, get_int3c1e, get_int3c1e_density_contracted, get_int3c1e_charge_contracted -from gpu4pyscf.gto.int3c1e_ip import get_int3c1e_ip, get_int3c1e_ip_contracted - -def intor(mol, intor, grids, charge_exponents=None, dm=None, charges=None, direct_scf_tol=1e-13, intopt=None): - assert grids is not None - - if intopt is None: - intopt = VHFOpt(mol) - aosym = False if 'ip' in intor else True - intopt.build(direct_scf_tol, aosym=aosym) - else: - assert isinstance(intopt, VHFOpt), \ - f"Please make sure intopt is a {VHFOpt.__module__}.{VHFOpt.__name__} object." - assert hasattr(intopt, "density_offset"), "Please call build() function for VHFOpt object first." - - if intor == 'int1e_grids': - assert dm is None or charges is None, \ - "Are you sure you want to contract the one electron integrals with both charge and density? 
" + \ - "If so, pass in density, obtain the result with n_charge and contract with the charges yourself." - assert intopt.aosym - - if dm is None and charges is None: - return get_int3c1e(mol, grids, charge_exponents, intopt) - elif dm is not None: - return get_int3c1e_density_contracted(mol, grids, charge_exponents, dm, intopt) - elif charges is not None: - return get_int3c1e_charge_contracted(mol, grids, charge_exponents, charges, intopt) - else: - raise ValueError(f"Logic error in {__file__} {__name__}") - elif intor == 'int1e_grids_ip': - assert not intopt.aosym - - if dm is None and charges is None: - return get_int3c1e_ip(mol, grids, charge_exponents, intopt) - else: - assert dm is not None - assert charges is not None - return get_int3c1e_ip_contracted(mol, grids, charge_exponents, dm, charges, intopt) - else: - raise NotImplementedError(f"GPU intor {intor} is not implemented.") diff --git a/gpu4pyscf/gto/tests/test_int1e_grids_ip.py b/gpu4pyscf/gto/tests/test_int1e_grids_ip.py index e77f30ec..56f87e4b 100644 --- a/gpu4pyscf/gto/tests/test_int1e_grids_ip.py +++ b/gpu4pyscf/gto/tests/test_int1e_grids_ip.py @@ -18,7 +18,7 @@ import cupy as cp import pyscf from pyscf import lib, gto, df -from gpu4pyscf.gto.int3c1e_ip import int1e_grids_ip1, int1e_grids_ip2 +from gpu4pyscf.gto.int3c1e_ip import int1e_grids_ip1, int1e_grids_ip2, int1e_grids_ip2_charge_contracted def setUpModule(): global mol_sph, mol_cart, grid_points, integral_threshold, density_contraction_threshold, charge_contraction_threshold @@ -74,8 +74,8 @@ def test_int1e_grids_ip_full_tensor_cart(self): test_int1e_dA = int1e_grids_ip1(mol, grid_points) test_int1e_dC = int1e_grids_ip2(mol, grid_points) - test_int1e_dA = test_int1e_dA.transpose(0, 3, 2, 1) - test_int1e_dC = test_int1e_dC.transpose(0, 3, 2, 1) + test_int1e_dA = test_int1e_dA.transpose(0, 2, 3, 1) + test_int1e_dC = test_int1e_dC.transpose(0, 2, 3, 1) np.testing.assert_allclose(ref_int1e_dA, test_int1e_dA, atol = integral_threshold) np.testing.assert_allclose(ref_int1e_dC, test_int1e_dC, atol = integral_threshold) @@ -94,8 +94,8 @@ def test_int1e_grids_ip_full_tensor_sph(self): test_int1e_dA = int1e_grids_ip1(mol, grid_points) test_int1e_dC = int1e_grids_ip2(mol, grid_points) - test_int1e_dA = test_int1e_dA.transpose(0, 3, 2, 1) - test_int1e_dC = test_int1e_dC.transpose(0, 3, 2, 1) + test_int1e_dA = test_int1e_dA.transpose(0, 2, 3, 1) + test_int1e_dC = test_int1e_dC.transpose(0, 2, 3, 1) np.testing.assert_allclose(ref_int1e_dA, test_int1e_dA, atol = integral_threshold) np.testing.assert_allclose(ref_int1e_dC, test_int1e_dC, atol = integral_threshold) @@ -117,8 +117,8 @@ def test_int1e_grids_ip_full_tensor_gaussian_charge(self): test_int1e_dA = int1e_grids_ip1(mol, grid_points, charge_exponents = charge_exponents) test_int1e_dC = int1e_grids_ip2(mol, grid_points, charge_exponents = charge_exponents) - test_int1e_dA = test_int1e_dA.transpose(0, 3, 2, 1) - test_int1e_dC = test_int1e_dC.transpose(0, 3, 2, 1) + test_int1e_dA = test_int1e_dA.transpose(0, 2, 3, 1) + test_int1e_dC = test_int1e_dC.transpose(0, 2, 3, 1) np.testing.assert_allclose(ref_int1e_dA, test_int1e_dA, atol = integral_threshold) np.testing.assert_allclose(ref_int1e_dC, test_int1e_dC, atol = integral_threshold) @@ -141,8 +141,8 @@ def test_int1e_grids_ip_full_tensor_omega(self): test_int1e_dA = int1e_grids_ip1(mol, grid_points) test_int1e_dC = int1e_grids_ip2(mol, grid_points) - test_int1e_dA = test_int1e_dA.transpose(0, 3, 2, 1) - test_int1e_dC = test_int1e_dC.transpose(0, 3, 2, 1) + test_int1e_dA = 
test_int1e_dA.transpose(0, 2, 3, 1) + test_int1e_dC = test_int1e_dC.transpose(0, 2, 3, 1) np.testing.assert_allclose(ref_int1e_dA, test_int1e_dA, atol = integral_threshold) np.testing.assert_allclose(ref_int1e_dC, test_int1e_dC, atol = integral_threshold) @@ -168,8 +168,8 @@ def test_int1e_grids_ip_full_tensor_gaussian_charge_omega(self): test_int1e_dA = int1e_grids_ip1(mol, grid_points, charge_exponents = charge_exponents) test_int1e_dC = int1e_grids_ip2(mol, grid_points, charge_exponents = charge_exponents) - test_int1e_dA = test_int1e_dA.transpose(0, 3, 2, 1) - test_int1e_dC = test_int1e_dC.transpose(0, 3, 2, 1) + test_int1e_dA = test_int1e_dA.transpose(0, 2, 3, 1) + test_int1e_dC = test_int1e_dC.transpose(0, 2, 3, 1) np.testing.assert_allclose(ref_int1e_dA, test_int1e_dA, atol = integral_threshold) np.testing.assert_allclose(ref_int1e_dC, test_int1e_dC, atol = integral_threshold) @@ -314,6 +314,55 @@ def test_int1e_grids_ip_contracted_gaussian_charge_omega(self): cp.testing.assert_allclose(ref_int1e_dA, test_int1e_dA, atol = integral_threshold) cp.testing.assert_allclose(ref_int1e_dC, test_int1e_dC, atol = integral_threshold) + def test_int1e_grids_ip2_charge_contracted(self): + np.random.seed(12346) + charges = np.random.uniform(-2.0, 2.0, grid_points.shape[0]) + + mol = mol_sph + fakemol = gto.fakemol_for_charges(grid_points) + + int3c2e_ip2 = mol._add_suffix('int3c2e_ip2') + cintopt = gto.moleintor.make_cintopt(mol._atm, mol._bas, mol._env, int3c2e_ip2) + q_nj = df.incore.aux_e2(mol, fakemol, intor=int3c2e_ip2, aosym='s1', cintopt=cintopt) + + ngrids = grid_points.shape[0] + n_atom = mol.natm + nao = mol.nao + gridslice = [[ngrids * i // n_atom, ngrids * (i + 1) // n_atom] for i in range(n_atom)] + ref_int1e_dC = np.zeros([n_atom, 3, nao, nao]) + for i_atom in range(n_atom): + g0,g1 = gridslice[i_atom] + ref_int1e_dC[i_atom, :, :, :] += np.einsum('dijq,q->dij', q_nj[:, :, :, g0:g1], charges[g0:g1]) + + test_int1e_dC = cp.zeros([n_atom, 3, nao, nao]) + int1e_grids_ip2_charge_contracted(mol, grid_points, charges, gridslice, test_int1e_dC) + + cp.testing.assert_allclose(ref_int1e_dC, test_int1e_dC, atol = integral_threshold) + + def test_int1e_grids_ip1_density_contracted(self): + np.random.seed(12347) + dm = np.random.uniform(-2.0, 2.0, (mol_sph.nao, mol_sph.nao)) + + mol = mol_sph + fakemol = gto.fakemol_for_charges(grid_points) + + int3c2e_ip1 = mol._add_suffix('int3c2e_ip1') + cintopt = gto.moleintor.make_cintopt(mol._atm, mol._bas, mol._env, int3c2e_ip1) + v_nj = df.incore.aux_e2(mol, fakemol, intor=int3c2e_ip1, aosym='s1', cintopt=cintopt) + + v_nj = np.einsum('dijq,ij->dqi', v_nj, dm) + + ngrids = grid_points.shape[0] + aoslice = np.array(mol.aoslice_by_atom()) + ref_int1e_dA = np.empty([mol.natm, 3, ngrids]) + for i_atom in range(mol.natm): + p0,p1 = aoslice[i_atom, 2:] + ref_int1e_dA[i_atom,:,:] = np.einsum('dqi->dq', v_nj[:,:,p0:p1]) + + test_int1e_dA = int1e_grids_ip1(mol, grid_points, dm = dm) + + cp.testing.assert_allclose(ref_int1e_dA, test_int1e_dA, atol = integral_threshold) + if __name__ == "__main__": print("Full Tests for One Electron Coulomb Integrals") unittest.main() diff --git a/gpu4pyscf/lib/gint/g1e_ip_root_1.cu b/gpu4pyscf/lib/gint/g1e_ip_root_1.cu index d04b1b2f..1cf53f89 100644 --- a/gpu4pyscf/lib/gint/g1e_ip_root_1.cu +++ b/gpu4pyscf/lib/gint/g1e_ip_root_1.cu @@ -210,6 +210,100 @@ static void GINTfill_int3c1e_ip1_charge_contracted_kernel00(double* output, cons atomicAdd(output + (i0 + j0 * stride_j + 2 * stride_ij), deri_dAz_grid_sum); } +__global__ +static 
void GINTfill_int3c1e_ip1_density_contracted_kernel00(double* output, const BasisProdOffsets offsets, const int nprim_ij, + const double* density, const int* aoslice, const int nao, + const double omega, const double* grid_points, const double* charge_exponents) +{ + const int ntasks_ij = offsets.ntasks_ij; + const int ngrids = offsets.ntasks_kl; + const int task_ij = blockIdx.x * blockDim.x + threadIdx.x; + const int task_grid = blockIdx.y * blockDim.y + threadIdx.y; + + if (task_ij >= ntasks_ij || task_grid >= ngrids) { + return; + } + + const int bas_ij = offsets.bas_ij + task_ij; + const int prim_ij = offsets.primitive_ij + task_ij * nprim_ij; + const int* bas_pair2bra = c_bpcache.bas_pair2bra; + const int* bas_pair2ket = c_bpcache.bas_pair2ket; + const int ish = bas_pair2bra[bas_ij]; + const int jsh = bas_pair2ket[bas_ij]; + + const double* __restrict__ a12 = c_bpcache.a12; + const double* __restrict__ e12 = c_bpcache.e12; + const double* __restrict__ x12 = c_bpcache.x12; + const double* __restrict__ y12 = c_bpcache.y12; + const double* __restrict__ z12 = c_bpcache.z12; + + const double* __restrict__ a_exponents = c_bpcache.a1; + const int nbas = c_bpcache.nbas; + const double* __restrict__ bas_x = c_bpcache.bas_coords; + const double* __restrict__ bas_y = bas_x + nbas; + const double* __restrict__ bas_z = bas_y + nbas; + const double Ax = bas_x[ish]; + const double Ay = bas_y[ish]; + const double Az = bas_z[ish]; + + const double* grid_point = grid_points + task_grid * 3; + const double Cx = grid_point[0]; + const double Cy = grid_point[1]; + const double Cz = grid_point[2]; + const double charge_exponent = (charge_exponents != NULL) ? charge_exponents[task_grid] : 0.0; + + double deri_dAx = 0; + double deri_dAy = 0; + double deri_dAz = 0; + for (int ij = prim_ij; ij < prim_ij + nprim_ij; ij++) { + const double aij = a12[ij]; + const double eij = e12[ij]; + const double Px = x12[ij]; + const double Py = y12[ij]; + const double Pz = z12[ij]; + const double PCx = Px - Cx; + const double PCy = Py - Cy; + const double PCz = Pz - Cz; + const double PAx = Px - Ax; + const double PAy = Py - Ay; + const double PAz = Pz - Az; + const double minus_two_a = -2.0 * a_exponents[ij]; + const double one_over_two_p = 0.5 / aij; + double a0 = aij; + const double q_over_p_plus_q = charge_exponent > 0.0 ? charge_exponent / (aij + charge_exponent) : 1.0; + const double sqrt_q_over_p_plus_q = charge_exponent > 0.0 ? sqrt(q_over_p_plus_q) : 1.0; + a0 *= q_over_p_plus_q; + const double theta = omega > 0.0 ? omega * omega / (omega * omega + a0) : 1.0; + const double sqrt_theta = omega > 0.0 ? 
sqrt(theta) : 1.0; + a0 *= theta; + + const double prefactor = 2.0 * M_PI / aij * eij * sqrt_theta * sqrt_q_over_p_plus_q; + const double boys_input = a0 * (PCx * PCx + PCy * PCy + PCz * PCz); + if (boys_input > 3.e-7) { + const double sqrt_boys_input = sqrt(boys_input); + const double R000_0 = SQRTPIE4 / sqrt_boys_input * erf(sqrt_boys_input); + const double R000_1 = -a0 * (R000_0 - exp(-boys_input)) / boys_input; + deri_dAx += prefactor * minus_two_a * (PAx * R000_0 + one_over_two_p * R000_1 * PCx); + deri_dAy += prefactor * minus_two_a * (PAy * R000_0 + one_over_two_p * R000_1 * PCy); + deri_dAz += prefactor * minus_two_a * (PAz * R000_0 + one_over_two_p * R000_1 * PCz); + } + } + + const int* ao_loc = c_bpcache.ao_loc; + const int i0 = ao_loc[ish]; + const int j0 = ao_loc[jsh]; + + const double Dij = density[i0 + j0 * nao]; + deri_dAx *= Dij; + deri_dAy *= Dij; + deri_dAz *= Dij; + + const int i_atom = aoslice[ish]; + atomicAdd(output + (task_grid + ngrids * (i_atom * 3 + 0)), deri_dAx); + atomicAdd(output + (task_grid + ngrids * (i_atom * 3 + 1)), deri_dAy); + atomicAdd(output + (task_grid + ngrids * (i_atom * 3 + 2)), deri_dAz); +} + __global__ static void GINTfill_int3c1e_ip2_density_contracted_kernel00(double* output, const double* density, const HermiteDensityOffsets hermite_density_offsets, const BasisProdOffsets offsets, const int nprim_ij, @@ -282,3 +376,85 @@ static void GINTfill_int3c1e_ip2_density_contracted_kernel00(double* output, con atomicAdd(output + task_grid + 1 * ngrids, deri_dCy_pair_sum); atomicAdd(output + task_grid + 2 * ngrids, deri_dCz_pair_sum); } + +__global__ +static void GINTfill_int3c1e_ip2_charge_contracted_kernel00(double* output, const BasisProdOffsets offsets, const int nprim_ij, + const int stride_j, const int stride_ij, const int ao_offsets_i, const int ao_offsets_j, const int* gridslice, + const double omega, const double* grid_points, const double* charge_exponents) +{ + const int ntasks_ij = offsets.ntasks_ij; + const int ngrids = offsets.ntasks_kl; + const int task_ij = blockIdx.x * blockDim.x + threadIdx.x; + const int task_grid = blockIdx.y * blockDim.y + threadIdx.y; + + if (task_ij >= ntasks_ij || task_grid >= ngrids) { + return; + } + + const int bas_ij = offsets.bas_ij + task_ij; + const int prim_ij = offsets.primitive_ij + task_ij * nprim_ij; + const int* bas_pair2bra = c_bpcache.bas_pair2bra; + const int* bas_pair2ket = c_bpcache.bas_pair2ket; + const int ish = bas_pair2bra[bas_ij]; + const int jsh = bas_pair2ket[bas_ij]; + + const double* __restrict__ a12 = c_bpcache.a12; + const double* __restrict__ e12 = c_bpcache.e12; + const double* __restrict__ x12 = c_bpcache.x12; + const double* __restrict__ y12 = c_bpcache.y12; + const double* __restrict__ z12 = c_bpcache.z12; + + const double* grid_point = grid_points + task_grid * 4; + const double Cx = grid_point[0]; + const double Cy = grid_point[1]; + const double Cz = grid_point[2]; + const double charge_exponent = (charge_exponents != NULL) ? charge_exponents[task_grid] : 0.0; + + double deri_dCx = 0; + double deri_dCy = 0; + double deri_dCz = 0; + for (int ij = prim_ij; ij < prim_ij + nprim_ij; ij++) { + const double aij = a12[ij]; + const double eij = e12[ij]; + const double Px = x12[ij]; + const double Py = y12[ij]; + const double Pz = z12[ij]; + const double PCx = Px - Cx; + const double PCy = Py - Cy; + const double PCz = Pz - Cz; + double a0 = aij; + const double q_over_p_plus_q = charge_exponent > 0.0 ? 
charge_exponent / (aij + charge_exponent) : 1.0; + const double sqrt_q_over_p_plus_q = charge_exponent > 0.0 ? sqrt(q_over_p_plus_q) : 1.0; + a0 *= q_over_p_plus_q; + const double theta = omega > 0.0 ? omega * omega / (omega * omega + a0) : 1.0; + const double sqrt_theta = omega > 0.0 ? sqrt(theta) : 1.0; + a0 *= theta; + + const double prefactor = 2.0 * M_PI / aij * eij * sqrt_theta * sqrt_q_over_p_plus_q; + const double boys_input = a0 * (PCx * PCx + PCy * PCy + PCz * PCz); + if (boys_input > 3.e-7) { + const double sqrt_boys_input = sqrt(boys_input); + const double R000_0 = SQRTPIE4 / sqrt_boys_input * erf(sqrt_boys_input); + const double R000_1 = -a0 * (R000_0 - exp(-boys_input)) / boys_input; + const double R100_0 = R000_1 * PCx; + const double R010_0 = R000_1 * PCy; + const double R001_0 = R000_1 * PCz; + deri_dCx += prefactor * R100_0; + deri_dCy += prefactor * R010_0; + deri_dCz += prefactor * R001_0; + } + } + + const double charge = grid_point[3]; + deri_dCx *= charge; + deri_dCy *= charge; + deri_dCz *= charge; + + const int i_atom = gridslice[task_grid]; + const int* ao_loc = c_bpcache.ao_loc; + const int i0 = ao_loc[ish] - ao_offsets_i; + const int j0 = ao_loc[jsh] - ao_offsets_j; + atomicAdd(output + (i0 + j0 * stride_j + 0 * stride_ij + i_atom * 3 * stride_ij), deri_dCx); + atomicAdd(output + (i0 + j0 * stride_j + 1 * stride_ij + i_atom * 3 * stride_ij), deri_dCy); + atomicAdd(output + (i0 + j0 * stride_j + 2 * stride_ij + i_atom * 3 * stride_ij), deri_dCz); +} diff --git a/gpu4pyscf/lib/gint/g3c1e_ip.cu b/gpu4pyscf/lib/gint/g3c1e_ip.cu index b7524ca2..9a806bef 100644 --- a/gpu4pyscf/lib/gint/g3c1e_ip.cu +++ b/gpu4pyscf/lib/gint/g3c1e_ip.cu @@ -366,6 +366,104 @@ static void GINTfill_int3c1e_ip1_charge_contracted_kernel_general(double* output } } +template +__device__ +static void GINTwrite_int3c1e_ip1_density_contracted(const double* g, double* output, const double minus_two_a, const double* density, const int* aoslice, const int nao, + const int ish, const int jsh, const int i_grid, const int i_l, const int j_l, const int ngrids) +{ + const int* ao_loc = c_bpcache.ao_loc; + + const int i0 = ao_loc[ish]; + const int j0 = ao_loc[jsh]; + + const int i_atom = aoslice[ish]; + + const int *idx = c_idx; + const int *idy = c_idx + TOT_NF; + const int *idz = c_idx + TOT_NF * 2; + + const int g_size = NROOTS * (i_l + 1 + 1) * (j_l + 1); + const double* __restrict__ gx = g; + const double* __restrict__ gy = g + g_size; + const double* __restrict__ gz = g + g_size * 2; + + const int n_density_elements_i = (i_l + 1) * (i_l + 2) / 2; + const int n_density_elements_j = (j_l + 1) * (j_l + 2) / 2; + for (int j = 0; j < n_density_elements_j; j++) { + for (int i = 0; i < n_density_elements_i; i++) { + const int loc_j = c_l_locs[j_l] + j; + const int loc_i = c_l_locs[i_l] + i; + const int ix = idx[loc_i]; + const int iy = idy[loc_i]; + const int iz = idz[loc_i]; + const int jx = idx[loc_j]; + const int jy = idy[loc_j]; + const int jz = idz[loc_j]; + const int gx_offset = ix + jx * (i_l + 1 + 1); + const int gy_offset = iy + jy * (i_l + 1 + 1); + const int gz_offset = iz + jz * (i_l + 1 + 1); + + double deri_dAx = 0; + double deri_dAy = 0; + double deri_dAz = 0; +#pragma unroll + for (int i_root = 0; i_root < NROOTS; i_root++) { + const double gx_0 = gx[gx_offset * NROOTS + i_root]; + const double gy_0 = gy[gy_offset * NROOTS + i_root]; + const double gz_0 = gz[gz_offset * NROOTS + i_root]; + const double dgx_dAx = (ix > 0 ? 
ix * gx[(gx_offset - 1) * NROOTS + i_root] : 0) + minus_two_a * gx[(gx_offset + 1) * NROOTS + i_root]; + const double dgy_dAy = (iy > 0 ? iy * gy[(gy_offset - 1) * NROOTS + i_root] : 0) + minus_two_a * gy[(gy_offset + 1) * NROOTS + i_root]; + const double dgz_dAz = (iz > 0 ? iz * gz[(gz_offset - 1) * NROOTS + i_root] : 0) + minus_two_a * gz[(gz_offset + 1) * NROOTS + i_root]; + deri_dAx += dgx_dAx * gy_0 * gz_0; + deri_dAy += gx_0 * dgy_dAy * gz_0; + deri_dAz += gx_0 * gy_0 * dgz_dAz; + } + const double Dij = density[(i + i0) + (j + j0) * nao]; + deri_dAx *= Dij; + deri_dAy *= Dij; + deri_dAz *= Dij; + atomicAdd(output + (i_grid + ngrids * (i_atom * 3 + 0)), deri_dAx); + atomicAdd(output + (i_grid + ngrids * (i_atom * 3 + 1)), deri_dAy); + atomicAdd(output + (i_grid + ngrids * (i_atom * 3 + 2)), deri_dAz); + } + } +} + +template +__global__ +static void GINTfill_int3c1e_ip1_density_contracted_kernel_general(double* output, const BasisProdOffsets offsets, const int i_l, const int j_l, const int nprim_ij, + const double* density, const int* aoslice, const int nao, + const double omega, const double* grid_points, const double* charge_exponents) +{ + const int ntasks_ij = offsets.ntasks_ij; + const int ngrids = offsets.ntasks_kl; + const int task_ij = blockIdx.x * blockDim.x + threadIdx.x; + const int task_grid = blockIdx.y * blockDim.y + threadIdx.y; + + if (task_ij >= ntasks_ij || task_grid >= ngrids) { + return; + } + + const int bas_ij = offsets.bas_ij + task_ij; + const int prim_ij = offsets.primitive_ij + task_ij * nprim_ij; + const int* bas_pair2bra = c_bpcache.bas_pair2bra; + const int* bas_pair2ket = c_bpcache.bas_pair2ket; + const int ish = bas_pair2bra[bas_ij]; + const int jsh = bas_pair2ket[bas_ij]; + const double* __restrict__ a_exponents = c_bpcache.a1; + + const double* grid_point = grid_points + task_grid * 3; + const double charge_exponent = (charge_exponents != NULL) ? 
charge_exponents[task_grid] : 0.0; + + double g[GSIZE_INT3C_1E]; + + for (int ij = prim_ij; ij < prim_ij+nprim_ij; ++ij) { + GINT_g1e(g, grid_point, ish, jsh, ij, i_l + 1, j_l, charge_exponent, omega); + const double minus_two_a = -2.0 * a_exponents[ij]; + GINTwrite_int3c1e_ip1_density_contracted(g, output, minus_two_a, density, aoslice, nao, ish, jsh, task_grid, i_l, j_l, ngrids); + } +} + template __global__ static void GINTfill_int3c1e_ip2_density_contracted_kernel_general(double* output, const double* density, const HermiteDensityOffsets hermite_density_offsets, @@ -464,3 +562,120 @@ static void GINTfill_int3c1e_ip2_density_contracted_kernel_general(double* outpu atomicAdd(output + task_grid + ngrids * 1, deri_dCy_pair_sum); atomicAdd(output + task_grid + ngrids * 2, deri_dCz_pair_sum); } + +template +__device__ +static void GINTwrite_int3c1e_ip2_charge_contracted(const double* g, double* output, const double minus_two_a, const double* u2, const double* AC, const double prefactor, + const int ish, const int jsh, const int i_grid, const int i_l, const int j_l, + const int stride_j, const int stride_ij, const int ao_offsets_i, const int ao_offsets_j, const int* gridslice, const int ngrids) +{ + const int* ao_loc = c_bpcache.ao_loc; + + const int i0 = ao_loc[ish] - ao_offsets_i; + const int j0 = ao_loc[jsh] - ao_offsets_j; + + const int i_atom = gridslice[i_grid]; + + const int *idx = c_idx; + const int *idy = c_idx + TOT_NF; + const int *idz = c_idx + TOT_NF * 2; + + const int g_size = NROOTS * (i_l + 1 + 1) * (j_l + 1); + const double* __restrict__ gx = g; + const double* __restrict__ gy = g + g_size; + const double* __restrict__ gz = g + g_size * 2; + + const int n_density_elements_i = (i_l + 1) * (i_l + 2) / 2; + const int n_density_elements_j = (j_l + 1) * (j_l + 2) / 2; + for (int j = 0; j < n_density_elements_j; j++) { + for (int i = 0; i < n_density_elements_i; i++) { + const int loc_j = c_l_locs[j_l] + j; + const int loc_i = c_l_locs[i_l] + i; + const int ix = idx[loc_i]; + const int iy = idy[loc_i]; + const int iz = idz[loc_i]; + const int jx = idx[loc_j]; + const int jy = idy[loc_j]; + const int jz = idz[loc_j]; + const int gx_offset = ix + jx * (i_l + 1 + 1); + const int gy_offset = iy + jy * (i_l + 1 + 1); + const int gz_offset = iz + jz * (i_l + 1 + 1); + + double deri_dCx = 0; + double deri_dCy = 0; + double deri_dCz = 0; +#pragma unroll + for (int i_root = 0; i_root < NROOTS; i_root++) { + const double gx_0 = gx[gx_offset * NROOTS + i_root]; + const double gy_0 = gy[gy_offset * NROOTS + i_root]; + const double gz_0 = gz[gz_offset * NROOTS + i_root]; + const double gx_1 = gx[(gx_offset + 1) * NROOTS + i_root]; + const double gy_1 = gy[(gy_offset + 1) * NROOTS + i_root]; + const double gz_1 = gz[(gz_offset + 1) * NROOTS + i_root]; + const double minus_two_u2 = -2.0 * u2[i_root]; + const double dgx_dCx = minus_two_u2 * (gx_1 + AC[0] * gx_0); + const double dgy_dCy = minus_two_u2 * (gy_1 + AC[1] * gy_0); + const double dgz_dCz = minus_two_u2 * (gz_1 + AC[2] * gz_0); + deri_dCx += dgx_dCx * gy_0 * gz_0; + deri_dCy += gx_0 * dgy_dCy * gz_0; + deri_dCz += gx_0 * gy_0 * dgz_dCz; + } + deri_dCx *= prefactor; + deri_dCy *= prefactor; + deri_dCz *= prefactor; + + atomicAdd(output + ((i + i0) + (j + j0) * stride_j + 0 * stride_ij + i_atom * 3 * stride_ij), deri_dCx); + atomicAdd(output + ((i + i0) + (j + j0) * stride_j + 1 * stride_ij + i_atom * 3 * stride_ij), deri_dCy); + atomicAdd(output + ((i + i0) + (j + j0) * stride_j + 2 * stride_ij + i_atom * 3 * stride_ij), deri_dCz); + } + } 
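+    // Each (i, j, xyz) contribution above is accumulated into the per-atom block of
+    // `output`, laid out as [n_atom, 3, nj, ni] with the bra index i running fastest
+    // (the strides [ni, ni*nj] are set up on the Python side).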
+} + +template +__global__ +static void GINTfill_int3c1e_ip2_charge_contracted_kernel_general(double* output, const BasisProdOffsets offsets, const int i_l, const int j_l, const int nprim_ij, + const int stride_j, const int stride_ij, const int ao_offsets_i, const int ao_offsets_j, const int* gridslice, + const double omega, const double* grid_points, const double* charge_exponents) +{ + const int ntasks_ij = offsets.ntasks_ij; + const int ngrids = offsets.ntasks_kl; + const int task_ij = blockIdx.x * blockDim.x + threadIdx.x; + const int task_grid = blockIdx.y * blockDim.y + threadIdx.y; + + if (task_ij >= ntasks_ij || task_grid >= ngrids) { + return; + } + + const int bas_ij = offsets.bas_ij + task_ij; + const int prim_ij = offsets.primitive_ij + task_ij * nprim_ij; + const int* bas_pair2bra = c_bpcache.bas_pair2bra; + const int* bas_pair2ket = c_bpcache.bas_pair2ket; + const int ish = bas_pair2bra[bas_ij]; + const int jsh = bas_pair2ket[bas_ij]; + const double* __restrict__ a_exponents = c_bpcache.a1; + const int nbas = c_bpcache.nbas; + const double* __restrict__ bas_x = c_bpcache.bas_coords; + const double* __restrict__ bas_y = bas_x + nbas; + const double* __restrict__ bas_z = bas_y + nbas; + const double Ax = bas_x[ish]; + const double Ay = bas_y[ish]; + const double Az = bas_z[ish]; + + const double* grid_point = grid_points + task_grid * 4; + const double Cx = grid_point[0]; + const double Cy = grid_point[1]; + const double Cz = grid_point[2]; + const double charge = grid_point[3]; + const double charge_exponent = (charge_exponents != NULL) ? charge_exponents[task_grid] : 0.0; + + const double AC[3] { Ax - Cx, Ay - Cy, Az - Cz }; + + double g[GSIZE_INT3C_1E]; + double u2[NROOTS]; + + for (int ij = prim_ij; ij < prim_ij+nprim_ij; ++ij) { + GINT_g1e_save_u2(g, u2, grid_point, ish, jsh, ij, i_l + 1, j_l, charge_exponent, omega); + const double minus_two_a = -2.0 * a_exponents[ij]; + GINTwrite_int3c1e_ip2_charge_contracted(g, output, minus_two_a, u2, AC, charge, ish, jsh, task_grid, i_l, j_l, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, gridslice, ngrids); + } +} diff --git a/gpu4pyscf/lib/gint/nr_fill_ao_int3c1e_ip.cu b/gpu4pyscf/lib/gint/nr_fill_ao_int3c1e_ip.cu index e0ace197..3ee7c423 100644 --- a/gpu4pyscf/lib/gint/nr_fill_ao_int3c1e_ip.cu +++ b/gpu4pyscf/lib/gint/nr_fill_ao_int3c1e_ip.cu @@ -113,6 +113,55 @@ static int GINTfill_int3c1e_ip1_charge_contracted_tasks(double* output, const Ba return 0; } +static int GINTfill_int3c1e_ip1_density_contracted_tasks(double* output, const BasisProdOffsets offsets, const int i_l, const int j_l, const int nprim_ij, + const double* density, const int* aoslice, const int nao, + const double omega, const double* grid_points, const double* charge_exponents, + const cudaStream_t stream) +{ + const int ntasks_ij = offsets.ntasks_ij; + const int ngrids = offsets.ntasks_kl; + + const dim3 threads(THREADSX, THREADSY); + const dim3 blocks((ntasks_ij+THREADSX-1)/THREADSX, (ngrids+THREADSY-1)/THREADSY); + const int type_ij = i_l * 10 + j_l; + switch (type_ij) { + case 00: GINTfill_int3c1e_ip1_density_contracted_kernel00<<>>(output, offsets, nprim_ij, density, aoslice, nao, omega, grid_points, charge_exponents); break; + // case 01: GINTfill_int3c1e_ip1_density_contracted_kernel_expanded<0, 1> <<>>(output, offsets, nprim_ij, density, aoslice, nao, omega, grid_points, charge_exponents); break; + // case 02: GINTfill_int3c1e_ip1_density_contracted_kernel_expanded<0, 2> <<>>(output, offsets, nprim_ij, density, aoslice, nao, omega, grid_points, 
charge_exponents); break; + // case 03: GINTfill_int3c1e_ip1_density_contracted_kernel_expanded<0, 3> <<>>(output, offsets, nprim_ij, density, shell, nao, omega, grid_points, charge_exponents); break; + // case 04: GINTfill_int3c1e_ip1_density_contracted_kernel_expanded<0, 4> <<>>(output, offsets, nprim_ij, density, aoslice, nao, omega, grid_points, charge_exponents); break; + // case 10: GINTfill_int3c1e_ip1_density_contracted_kernel_expanded<1, 0> <<>>(output, offsets, nprim_ij, density, aoslice, nao, omega, grid_points, charge_exponents); break; + // case 11: GINTfill_int3c1e_ip1_density_contracted_kernel_expanded<1, 1> <<>>(output, offsets, nprim_ij, density, aoslice, nao, omega, grid_points, charge_exponents); break; + // case 12: GINTfill_int3c1e_ip1_density_contracted_kernel_expanded<1, 2> <<>>(output, offsets, nprim_ij, density, aoslice, nao, omega, grid_points, charge_exponents); break; + // case 13: GINTfill_int3c1e_ip1_density_contracted_kernel_expanded<1, 3> <<>>(output, offsets, nprim_ij, density, aoslice, nao, omega, grid_points, charge_exponents); break; + // case 20: GINTfill_int3c1e_ip1_density_contracted_kernel_expanded<2, 0> <<>>(output, offsets, nprim_ij, density, aoslice, nao, omega, grid_points, charge_exponents); break; + // case 21: GINTfill_int3c1e_ip1_density_contracted_kernel_expanded<2, 1> <<>>(output, offsets, nprim_ij, density, aoslice, nao, omega, grid_points, charge_exponents); break; + // case 22: GINTfill_int3c1e_ip1_density_contracted_kernel_expanded<2, 2> <<>>(output, offsets, nprim_ij, density, aoslice, nao, omega, grid_points, charge_exponents); break; + // case 30: GINTfill_int3c1e_ip1_density_contracted_kernel_expanded<3, 0> <<>>(output, offsets, nprim_ij, density, aoslice, nao, omega, grid_points, charge_exponents); break; + // case 31: GINTfill_int3c1e_ip1_density_contracted_kernel_expanded<3, 1> <<>>(output, offsets, nprim_ij, density, aoslice, nao, omega, grid_points, charge_exponents); break; + // case 40: GINTfill_int3c1e_ip1_density_contracted_kernel_expanded<4, 0> <<>>(output, offsets, nprim_ij, density, aoslice, nao, omega, grid_points, charge_exponents); break; + default: + const int nrys_roots = (i_l + j_l + 1) / 2 + 1; + switch (nrys_roots) { + case 1: GINTfill_int3c1e_ip1_density_contracted_kernel_general<1, GSIZE5_INT3C_1E> <<>>(output, offsets, i_l, j_l, nprim_ij, density, aoslice, nao, omega, grid_points, charge_exponents); break; + case 2: GINTfill_int3c1e_ip1_density_contracted_kernel_general<2, GSIZE4_INT3C_1E> <<>>(output, offsets, i_l, j_l, nprim_ij, density, aoslice, nao, omega, grid_points, charge_exponents); break; + case 3: GINTfill_int3c1e_ip1_density_contracted_kernel_general<3, GSIZE5_INT3C_1E> <<>>(output, offsets, i_l, j_l, nprim_ij, density, aoslice, nao, omega, grid_points, charge_exponents); break; + case 4: GINTfill_int3c1e_ip1_density_contracted_kernel_general<4, GSIZE4_INT3C_1E> <<>>(output, offsets, i_l, j_l, nprim_ij, density, aoslice, nao, omega, grid_points, charge_exponents); break; + case 5: GINTfill_int3c1e_ip1_density_contracted_kernel_general<5, GSIZE5_INT3C_1E> <<>>(output, offsets, i_l, j_l, nprim_ij, density, aoslice, nao, omega, grid_points, charge_exponents); break; + default: + fprintf(stderr, "type_ij = %d, nrys_roots = %d out of range\n", type_ij, nrys_roots); + return 1; + } + } + + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) { + fprintf(stderr, "CUDA Error in %s: %s\n", __func__, cudaGetErrorString(err)); + return 1; + } + return 0; +} + static int 
GINTfill_int3c1e_ip2_density_contracted_tasks(double* output, const double* density, const HermiteDensityOffsets hermite_density_offsets, const BasisProdOffsets offsets, const int i_l, const int j_l, const int nprim_ij, const double omega, const double* grid_points, const double* charge_exponents, @@ -147,12 +196,62 @@ static int GINTfill_int3c1e_ip2_density_contracted_tasks(double* output, const d return 0; } +static int GINTfill_int3c1e_ip2_charge_contracted_tasks(double* output, const BasisProdOffsets offsets, const int i_l, const int j_l, const int nprim_ij, + const int stride_j, const int stride_ij, const int ao_offsets_i, const int ao_offsets_j, + const int* gridslice, + const double omega, const double* grid_points, const double* charge_exponents, + const cudaStream_t stream) +{ + const int ntasks_ij = offsets.ntasks_ij; + const int ngrids = offsets.ntasks_kl; + + const dim3 threads(THREADSX, THREADSY); + const dim3 blocks((ntasks_ij+THREADSX-1)/THREADSX, (ngrids+THREADSY-1)/THREADSY); + const int type_ij = i_l * 10 + j_l; + switch (type_ij) { + case 00: GINTfill_int3c1e_ip2_charge_contracted_kernel00<<>>(output, offsets, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, gridslice, omega, grid_points, charge_exponents); break; + // case 01: GINTfill_int3c1e_ip2_charge_contracted_kernel_expanded<0, 1> <<>>(output, offsets, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, gridslice, omega, grid_points, charge_exponents); break; + // case 02: GINTfill_int3c1e_ip2_charge_contracted_kernel_expanded<0, 2> <<>>(output, offsets, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, gridslice, omega, grid_points, charge_exponents); break; + // case 03: GINTfill_int3c1e_ip2_charge_contracted_kernel_expanded<0, 3> <<>>(output, offsets, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, gridslice, omega, grid_points, charge_exponents); break; + // case 04: GINTfill_int3c1e_ip2_charge_contracted_kernel_expanded<0, 4> <<>>(output, offsets, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, gridslice, omega, grid_points, charge_exponents); break; + // case 10: GINTfill_int3c1e_ip2_charge_contracted_kernel_expanded<1, 0> <<>>(output, offsets, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, gridslice, omega, grid_points, charge_exponents); break; + // case 11: GINTfill_int3c1e_ip2_charge_contracted_kernel_expanded<1, 1> <<>>(output, offsets, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, gridslice, omega, grid_points, charge_exponents); break; + // case 12: GINTfill_int3c1e_ip2_charge_contracted_kernel_expanded<1, 2> <<>>(output, offsets, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, gridslice, omega, grid_points, charge_exponents); break; + // case 13: GINTfill_int3c1e_ip2_charge_contracted_kernel_expanded<1, 3> <<>>(output, offsets, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, gridslice, omega, grid_points, charge_exponents); break; + // case 20: GINTfill_int3c1e_ip2_charge_contracted_kernel_expanded<2, 0> <<>>(output, offsets, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, gridslice, omega, grid_points, charge_exponents); break; + // case 21: GINTfill_int3c1e_ip2_charge_contracted_kernel_expanded<2, 1> <<>>(output, offsets, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, gridslice, omega, grid_points, charge_exponents); break; + // case 22: GINTfill_int3c1e_ip2_charge_contracted_kernel_expanded<2, 2> <<>>(output, offsets, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, 
gridslice, omega, grid_points, charge_exponents); break; + // case 30: GINTfill_int3c1e_ip2_charge_contracted_kernel_expanded<3, 0> <<>>(output, offsets, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, gridslice, omega, grid_points, charge_exponents); break; + // case 31: GINTfill_int3c1e_ip2_charge_contracted_kernel_expanded<3, 1> <<>>(output, offsets, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, gridslice, omega, grid_points, charge_exponents); break; + // case 40: GINTfill_int3c1e_ip2_charge_contracted_kernel_expanded<4, 0> <<>>(output, offsets, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, gridslice, omega, grid_points, charge_exponents); break; + default: + const int nrys_roots = (i_l + j_l + 1) / 2 + 1; + switch (nrys_roots) { + case 1: GINTfill_int3c1e_ip2_charge_contracted_kernel_general<1, GSIZE5_INT3C_1E> <<>>(output, offsets, i_l, j_l, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, gridslice, omega, grid_points, charge_exponents); break; + case 2: GINTfill_int3c1e_ip2_charge_contracted_kernel_general<2, GSIZE4_INT3C_1E> <<>>(output, offsets, i_l, j_l, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, gridslice, omega, grid_points, charge_exponents); break; + case 3: GINTfill_int3c1e_ip2_charge_contracted_kernel_general<3, GSIZE5_INT3C_1E> <<>>(output, offsets, i_l, j_l, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, gridslice, omega, grid_points, charge_exponents); break; + case 4: GINTfill_int3c1e_ip2_charge_contracted_kernel_general<4, GSIZE4_INT3C_1E> <<>>(output, offsets, i_l, j_l, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, gridslice, omega, grid_points, charge_exponents); break; + case 5: GINTfill_int3c1e_ip2_charge_contracted_kernel_general<5, GSIZE5_INT3C_1E> <<>>(output, offsets, i_l, j_l, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, gridslice, omega, grid_points, charge_exponents); break; + default: + fprintf(stderr, "type_ij = %d, nrys_roots = %d out of range\n", type_ij, nrys_roots); + return 1; + } + } + + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) { + fprintf(stderr, "CUDA Error in %s: %s\n", __func__, cudaGetErrorString(err)); + return 1; + } + return 0; +} + extern "C" { int GINTfill_int3c1e_ip(const cudaStream_t stream, const BasisProdCache* bpcache, const double* grid_points, const double* charge_exponents, const int ngrids, double* integrals, const int* strides, const int* ao_offsets, - const int* bins_locs_ij, int nbins, + const int* bins_locs_ij, const int nbins, const int cp_ij_id, const double omega) { const ContractionProdType *cp_ij = bpcache->cptype + cp_ij_id; @@ -198,11 +297,63 @@ int GINTfill_int3c1e_ip(const cudaStream_t stream, const BasisProdCache* bpcache return 0; } +int GINTfill_int3c1e_ip1_density_contracted(const cudaStream_t stream, const BasisProdCache* bpcache, + const double* grid_points, const double* charge_exponents, const int ngrids, + double* integral_charge_contracted, + const int* bins_locs_ij, const int nbins, + const int cp_ij_id, + const double* density, const int* aoslice, const int nao, + const double omega) +{ + const ContractionProdType *cp_ij = bpcache->cptype + cp_ij_id; + const int i_l = cp_ij->l_bra; + const int j_l = cp_ij->l_ket; + const int nrys_roots = (i_l + j_l + 1) / 2 + 1; + const int nprim_ij = cp_ij->nprim_12; + + if (nrys_roots > MAX_NROOTS_INT3C_1E + 1) { + fprintf(stderr, "nrys_roots = %d too high\n", nrys_roots); + return 2; + } + + checkCudaErrors(cudaMemcpyToSymbol(c_bpcache, bpcache, 
sizeof(BasisProdCache))); + + const int* bas_pairs_locs = bpcache->bas_pairs_locs; + const int* primitive_pairs_locs = bpcache->primitive_pairs_locs; + for (int ij_bin = 0; ij_bin < nbins; ij_bin++) { + const int bas_ij0 = bins_locs_ij[ij_bin]; + const int bas_ij1 = bins_locs_ij[ij_bin + 1]; + const int ntasks_ij = bas_ij1 - bas_ij0; + if (ntasks_ij <= 0) { + continue; + } + + BasisProdOffsets offsets; + offsets.ntasks_ij = ntasks_ij; + offsets.ntasks_kl = ngrids; + offsets.bas_ij = bas_pairs_locs[cp_ij_id] + bas_ij0; + offsets.bas_kl = -1; + offsets.primitive_ij = primitive_pairs_locs[cp_ij_id] + bas_ij0 * nprim_ij; + offsets.primitive_kl = -1; + + const int err = GINTfill_int3c1e_ip1_density_contracted_tasks(integral_charge_contracted, offsets, i_l, j_l, nprim_ij, + density, aoslice, nao, + omega, grid_points, charge_exponents, stream); + + if (err != 0) { + return err; + } + } + + return 0; +} + + int GINTfill_int3c1e_ip1_charge_contracted(const cudaStream_t stream, const BasisProdCache* bpcache, const double* grid_points, const double* charge_exponents, const int ngrids, double* integral_charge_contracted, const int* strides, const int* ao_offsets, - const int* bins_locs_ij, int nbins, + const int* bins_locs_ij, const int nbins, const int cp_ij_id, const double omega, const int n_charge_sum_per_thread) { const ContractionProdType *cp_ij = bpcache->cptype + cp_ij_id; @@ -252,7 +403,7 @@ int GINTfill_int3c1e_ip2_density_contracted(const cudaStream_t stream, const Bas const double* grid_points, const double* charge_exponents, const int ngrids, const double* dm_pair_ordered, const int* density_offset, double* integral_density_contracted, - const int* bins_locs_ij, int nbins, + const int* bins_locs_ij, const int nbins, const int cp_ij_id, const double omega, const int n_pair_sum_per_thread) { const ContractionProdType *cp_ij = bpcache->cptype + cp_ij_id; @@ -302,4 +453,56 @@ int GINTfill_int3c1e_ip2_density_contracted(const cudaStream_t stream, const Bas return 0; } + +int GINTfill_int3c1e_ip2_charge_contracted(const cudaStream_t stream, const BasisProdCache* bpcache, + const double* grid_points, const double* charge_exponents, const int ngrids, + double* integral_charge_contracted, + const int* strides, const int* ao_offsets, + const int* bins_locs_ij, const int nbins, + const int cp_ij_id, + const int* gridslice, + const double omega) +{ + const ContractionProdType *cp_ij = bpcache->cptype + cp_ij_id; + const int i_l = cp_ij->l_bra; + const int j_l = cp_ij->l_ket; + const int nrys_roots = (i_l + j_l + 1) / 2 + 1; + const int nprim_ij = cp_ij->nprim_12; + + if (nrys_roots > MAX_NROOTS_INT3C_1E + 1) { + fprintf(stderr, "nrys_roots = %d too high\n", nrys_roots); + return 2; + } + + checkCudaErrors(cudaMemcpyToSymbol(c_bpcache, bpcache, sizeof(BasisProdCache))); + + const int* bas_pairs_locs = bpcache->bas_pairs_locs; + const int* primitive_pairs_locs = bpcache->primitive_pairs_locs; + for (int ij_bin = 0; ij_bin < nbins; ij_bin++) { + const int bas_ij0 = bins_locs_ij[ij_bin]; + const int bas_ij1 = bins_locs_ij[ij_bin + 1]; + const int ntasks_ij = bas_ij1 - bas_ij0; + if (ntasks_ij <= 0) { + continue; + } + + BasisProdOffsets offsets; + offsets.ntasks_ij = ntasks_ij; + offsets.ntasks_kl = ngrids; + offsets.bas_ij = bas_pairs_locs[cp_ij_id] + bas_ij0; + offsets.bas_kl = -1; + offsets.primitive_ij = primitive_pairs_locs[cp_ij_id] + bas_ij0 * nprim_ij; + offsets.primitive_kl = -1; + + const int err = GINTfill_int3c1e_ip2_charge_contracted_tasks(integral_charge_contracted, offsets, i_l, j_l, 
nprim_ij, + strides[0], strides[1], ao_offsets[0], ao_offsets[1], + gridslice, omega, grid_points, charge_exponents, stream); + + if (err != 0) { + return err; + } + } + + return 0; +} } diff --git a/gpu4pyscf/qmmm/pbc/itrf.py b/gpu4pyscf/qmmm/pbc/itrf.py index 236e3e10..1b098c8f 100644 --- a/gpu4pyscf/qmmm/pbc/itrf.py +++ b/gpu4pyscf/qmmm/pbc/itrf.py @@ -1009,7 +1009,7 @@ def calculate_h1e(self, h1_gpu): nao = mol.nao if mm_mol.charge_model == 'gaussian' and len(coords) != 0: expnts = cp.hstack([mm_mol.get_zetas()] * len(Ls))[mask] - g_qm += int1e_grids_ip1(mol, coords, charges = charges, charge_exponents = expnts).transpose(0,2,1) + g_qm += int1e_grids_ip1(mol, coords, charges = charges, charge_exponents = expnts) elif mm_mol.charge_model == 'point' and len(coords) != 0: raise RuntimeError("Not tested yet") max_memory = self.max_memory - lib.current_memory()[0] diff --git a/gpu4pyscf/solvent/hessian/pcm.py b/gpu4pyscf/solvent/hessian/pcm.py index ba7cd5cf..538cb859 100644 --- a/gpu4pyscf/solvent/hessian/pcm.py +++ b/gpu4pyscf/solvent/hessian/pcm.py @@ -22,13 +22,16 @@ from pyscf import lib, gto from gpu4pyscf import scf from gpu4pyscf.solvent.pcm import PI -from gpu4pyscf.solvent.grad.pcm import grad_qv, grad_solver, grad_nuc +from gpu4pyscf.solvent.grad.pcm import grad_qv, grad_solver, grad_nuc, get_dD_dS, get_dF_dA, get_dSii from gpu4pyscf.df import int3c2e -from gpu4pyscf.lib.cupy_helper import contract from gpu4pyscf.lib import logger from gpu4pyscf.hessian.jk import _ao2mo +from gpu4pyscf.gto.int3c1e_ip import int1e_grids_ip1, int1e_grids_ip2 +from gpu4pyscf.gto import int3c1e +from gpu4pyscf.gto.int3c1e import int1e_grids def hess_nuc(pcmobj): + raise NotImplementedError("Not tested") if not pcmobj._intermediates: pcmobj.build() mol = pcmobj.mol @@ -150,76 +153,282 @@ def pcm_grad_scanner(mol): pcmobj.reset(pmol) return de -def fd_grad_vmat(pcmobj, dm, mo_coeff, mo_occ, atmlst=None, verbose=None): - ''' - dv_solv / da - slow version with finite difference - ''' - log = logger.new_logger(pcmobj, verbose) - t1 = log.init_timer() - pmol = pcmobj.mol.copy() - mol = pmol.copy() - if atmlst is None: - atmlst = range(mol.natm) - nao, nmo = mo_coeff.shape - mocc = mo_coeff[:,mo_occ>0] - nocc = mocc.shape[1] - coords = mol.atom_coords(unit='Bohr') - def pcm_vmat_scanner(mol): - pcmobj.reset(mol) - e, v = pcmobj._get_vind(dm) - return v +def get_dqsym_dx_fix_vgrids(pcmobj, atmlst, inverse_K): + assert pcmobj._intermediates is not None - mol.verbose = 0 - vmat = cupy.empty([len(atmlst), 3, nao, nocc]) - eps = 1e-3 - for i0, ia in enumerate(atmlst): - for ix in range(3): - dv = numpy.zeros_like(coords) - dv[ia,ix] = eps - mol.set_geom_(coords + dv, unit='Bohr') - vmat0 = pcm_vmat_scanner(mol) + gridslice = pcmobj.surface['gslice_by_atom'] + v_grids = pcmobj._intermediates['v_grids'] + A = pcmobj._intermediates['A'] + D = pcmobj._intermediates['D'] + S = pcmobj._intermediates['S'] + R = pcmobj._intermediates['R'] + q_sym = pcmobj._intermediates['q_sym'] + f_epsilon = pcmobj._intermediates['f_epsilon'] - mol.set_geom_(coords - dv, unit='Bohr') - vmat1 = pcm_vmat_scanner(mol) + ngrids = q_sym.shape[0] - grad_vmat = (vmat0 - vmat1)/2.0/eps - grad_vmat = contract("ij,jq->iq", grad_vmat, mocc) - grad_vmat = contract("iq,ip->pq", grad_vmat, mo_coeff) - vmat[i0,ix] = grad_vmat - t1 = log.timer_debug1('computing solvent grad veff', *t1) - pcmobj.reset(pmol) - return vmat + def get_dS_dot_q(dS, dSii, q, atmlst, gridslice): + output = cupy.einsum('diA,i->Adi', dSii[:,:,atmlst], q) + for i_atom in atmlst: 
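+        # Moving atom i_atom displaces only its own surface grid points, so dS/dx
+        # enters through the rows g0:g1 of those points and, with opposite sign,
+        # through the columns g0:g1; the diagonal term is handled by dSii above.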
+ g0,g1 = gridslice[i_atom] + output[i_atom, :, g0:g1] += cupy.einsum('dij,j->di', dS[:,g0:g1,:], q) + output[i_atom, :, :] -= cupy.einsum('dij,j->di', dS[:,:,g0:g1], q[g0:g1]) + return output + def get_dST_dot_q(dS, dSii, q, atmlst, gridslice): + return get_dS_dot_q(-dS.transpose(0,2,1), dSii, q, atmlst, gridslice) + + def get_dA_dot_q(dA, q, atmlst, gridslice): + return cupy.einsum('diA,i->Adi', dA[:,:,atmlst], q) + + def get_dD_dot_q(dD, q, atmlst, gridslice): + output = cupy.zeros([len(atmlst), 3, ngrids]) + for i_atom in atmlst: + g0,g1 = gridslice[i_atom] + output[i_atom, :, g0:g1] += cupy.einsum('dij,j->di', dD[:,g0:g1,:], q) + output[i_atom, :, :] -= cupy.einsum('dij,j->di', dD[:,:,g0:g1], q[g0:g1]) + return output + def get_dDT_dot_q(dD, q, atmlst, gridslice): + return get_dD_dot_q(-dD.transpose(0,2,1), q, atmlst, gridslice) + + if pcmobj.method.upper() in ['C-PCM', 'CPCM', 'COSMO']: + _, dS = get_dD_dS(pcmobj.surface, with_D=False, with_S=True) + dF, _ = get_dF_dA(pcmobj.surface) + dSii = get_dSii(pcmobj.surface, dF) + dF = None + + # dR = 0, dK = dS + dSdx_dot_q = get_dS_dot_q(dS, dSii, q_sym, atmlst, gridslice) + + dqdx_fix_Vq = cupy.einsum('ij,Adj->Adi', inverse_K, dSdx_dot_q) + + elif pcmobj.method.upper() in ['IEF-PCM', 'IEFPCM', 'SMD']: + dF, dA = get_dF_dA(pcmobj.surface) + dSii = get_dSii(pcmobj.surface, dF) + dF = None + + dD, dS = get_dD_dS(pcmobj.surface, with_D=True, with_S=True) + + # dR = f_eps/(2*pi) * (dD*A + D*dA) + # dK = dS - f_eps/(2*pi) * (dD*A*S + D*dA*S + D*A*dS) + f_eps_over_2pi = f_epsilon/(2.0*PI) + + q = inverse_K @ R @ v_grids + dSdx_dot_q = get_dS_dot_q(dS, dSii, q, atmlst, gridslice) + + DA = D*A + dKdx_dot_q = dSdx_dot_q - f_eps_over_2pi * cupy.einsum('ij,Adj->Adi', DA, dSdx_dot_q) + + dAdx_dot_Sq = get_dA_dot_q(dA, S @ q, atmlst, gridslice) + dKdx_dot_q -= f_eps_over_2pi * cupy.einsum('ij,Adj->Adi', D, dAdx_dot_Sq) + + AS = (A * S.T).T # It's just diag(A) @ S + dDdx_dot_ASq = get_dD_dot_q(dD, AS @ q, atmlst, gridslice) + dKdx_dot_q -= f_eps_over_2pi * dDdx_dot_ASq + + dqdx_fix_Vq = -cupy.einsum('ij,Adj->Adi', inverse_K, dKdx_dot_q) + + dAdx_dot_V = get_dA_dot_q(dA, v_grids, atmlst, gridslice) + + dDdx_dot_AV = get_dD_dot_q(dD, A * v_grids, atmlst, gridslice) + + dRdx_dot_V = f_eps_over_2pi * (dDdx_dot_AV + cupy.einsum('ij,Adj->Adi', D, dAdx_dot_V)) + dqdx_fix_Vq += cupy.einsum('ij,Adj->Adi', inverse_K, dRdx_dot_V) + + invKT_V = inverse_K.T @ v_grids + dDdxT_dot_invKT_V = get_dDT_dot_q(dD, invKT_V, atmlst, gridslice) + + DT_invKT_V = D.T @ invKT_V + dAdxT_dot_DT_invKT_V = get_dA_dot_q(dA, DT_invKT_V, atmlst, gridslice) + dqdx_fix_Vq += f_eps_over_2pi * (cupy.einsum('i,Adi->Adi', A, dDdxT_dot_invKT_V) + dAdxT_dot_DT_invKT_V) + + dSdxT_dot_invKT_V = get_dST_dot_q(dS, dSii, invKT_V, atmlst, gridslice) + dKdxT_dot_invKT_V = dSdxT_dot_invKT_V + + dKdxT_dot_invKT_V -= f_eps_over_2pi * cupy.einsum('ij,Adj->Adi', AS.T, dDdxT_dot_invKT_V) + dKdxT_dot_invKT_V -= f_eps_over_2pi * cupy.einsum('ij,Adj->Adi', S.T, dAdxT_dot_DT_invKT_V) + + dSdxT_dot_AT_DT_invKT_V = get_dST_dot_q(dS, dSii, DA.T @ invKT_V, atmlst, gridslice) + dKdxT_dot_invKT_V -= f_eps_over_2pi * dSdxT_dot_AT_DT_invKT_V + + dqdx_fix_Vq += -cupy.einsum('ij,Adj->Adi', R.T @ inverse_K.T, dKdxT_dot_invKT_V) + + dqdx_fix_Vq *= -0.5 + + elif pcmobj.method.upper() in ['SS(V)PE']: + dF, dA = get_dF_dA(pcmobj.surface) + dSii = get_dSii(pcmobj.surface, dF) + dF = None + + dD, dS = get_dD_dS(pcmobj.surface, with_D=True, with_S=True) -""" -def analytic_grad_vmat(pcmobj, mo_coeff, mo_occ, atmlst=None, 
verbose=None): + f_eps_over_4pi = f_epsilon/(4.0*PI) + + def dK_dot_q(q): + dSdx_dot_q = get_dS_dot_q(dS, dSii, q, atmlst, gridslice) + + DA = D*A + dKdx_dot_q = dSdx_dot_q - f_eps_over_4pi * cupy.einsum('ij,Adj->Adi', DA, dSdx_dot_q) + + dAdx_dot_Sq = get_dA_dot_q(dA, S @ q, atmlst, gridslice) + dKdx_dot_q -= f_eps_over_4pi * cupy.einsum('ij,Adj->Adi', D, dAdx_dot_Sq) + + AS = (A * S.T).T # It's just diag(A) @ S + dDdx_dot_ASq = get_dD_dot_q(dD, AS @ q, atmlst, gridslice) + dKdx_dot_q -= f_eps_over_4pi * dDdx_dot_ASq + + dDdxT_dot_q = get_dDT_dot_q(dD, q, atmlst, gridslice) + dKdx_dot_q -= f_eps_over_4pi * cupy.einsum('ij,Adj->Adi', AS.T, dDdxT_dot_q) + + dAdxT_dot_DT_q = get_dA_dot_q(dA, D.T @ q, atmlst, gridslice) + dKdx_dot_q -= f_eps_over_4pi * cupy.einsum('ij,Adj->Adi', S.T, dAdxT_dot_DT_q) + + dSdxT_dot_AT_DT_q = get_dST_dot_q(dS, dSii, DA.T @ q, atmlst, gridslice) + dKdx_dot_q -= f_eps_over_4pi * dSdxT_dot_AT_DT_q + + return dKdx_dot_q + + f_eps_over_2pi = f_epsilon/(2.0*PI) + + q = inverse_K @ R @ v_grids + dKdx_dot_q = dK_dot_q(q) + dqdx_fix_Vq = -cupy.einsum('ij,Adj->Adi', inverse_K, dKdx_dot_q) + + dAdx_dot_V = get_dA_dot_q(dA, v_grids, atmlst, gridslice) + + dDdx_dot_AV = get_dD_dot_q(dD, A * v_grids, atmlst, gridslice) + + dRdx_dot_V = f_eps_over_2pi * (dDdx_dot_AV + cupy.einsum('ij,Adj->Adi', D, dAdx_dot_V)) + dqdx_fix_Vq += cupy.einsum('ij,Adj->Adi', inverse_K, dRdx_dot_V) + + invKT_V = inverse_K.T @ v_grids + dDdxT_dot_invKT_V = get_dDT_dot_q(dD, invKT_V, atmlst, gridslice) + + DT_invKT_V = D.T @ invKT_V + dAdxT_dot_DT_invKT_V = get_dA_dot_q(dA, DT_invKT_V, atmlst, gridslice) + dqdx_fix_Vq += f_eps_over_2pi * (cupy.einsum('i,Adi->Adi', A, dDdxT_dot_invKT_V) + dAdxT_dot_DT_invKT_V) + + dKdx_dot_invKT_V = dK_dot_q(invKT_V) + dqdx_fix_Vq += -cupy.einsum('ij,Adj->Adi', R.T @ inverse_K.T, dKdx_dot_invKT_V) + + dqdx_fix_Vq *= -0.5 + + else: + raise RuntimeError(f"Unknown implicit solvent model: {pcmobj.method}") + + return dqdx_fix_Vq + +def get_dqsym_dx_fix_K_R(pcmobj, dm, atmlst, inverse_K, intopt_derivative): + assert pcmobj._intermediates is not None + + mol = pcmobj.mol + gridslice = pcmobj.surface['gslice_by_atom'] + charge_exp = pcmobj.surface['charge_exp'] + grid_coords = pcmobj.surface['grid_coords'] + R = pcmobj._intermediates['R'] + + atom_coords = mol.atom_coords(unit='B') + atom_charges = numpy.asarray(mol.atom_charges(), dtype=numpy.float64) + atom_coords = atom_coords[atmlst] + atom_charges = atom_charges[atmlst] + fakemol_nuc = gto.fakemol_for_charges(atom_coords) + fakemol = gto.fakemol_for_charges(grid_coords.get(), expnt=charge_exp.get()**2) + int2c2e_ip1 = mol._add_suffix('int2c2e_ip1') + v_ng_ip1 = gto.mole.intor_cross(int2c2e_ip1, fakemol_nuc, fakemol) + v_ng_ip1 = cupy.array(v_ng_ip1) + dV_on_charge_dx = cupy.einsum('dAq,A->Adq', v_ng_ip1, atom_charges) + + v_ng_ip2 = gto.mole.intor_cross(int2c2e_ip1, fakemol, fakemol_nuc) + v_ng_ip2 = cupy.array(v_ng_ip2) + for i_atom in atmlst: + g0,g1 = gridslice[i_atom] + dV_on_charge_dx[i_atom,:,g0:g1] += cupy.einsum('dqA,A->dq', v_ng_ip2[:,g0:g1,:], atom_charges) + + dIdA = int1e_grids_ip1(mol, grid_coords, dm = dm + dm.T, intopt = intopt_derivative, charge_exponents = charge_exp**2) + dV_on_charge_dx[atmlst,:,:] -= dIdA[atmlst,:,:] + + dIdC = int1e_grids_ip2(mol, grid_coords, intopt = intopt_derivative, dm = dm, charge_exponents = charge_exp**2) + for i_atom in atmlst: + g0,g1 = gridslice[i_atom] + dV_on_charge_dx[i_atom,:,g0:g1] -= dIdC[:,g0:g1] + + KR_symmetrized = 0.5 * (inverse_K @ R + R.T @ inverse_K.T) + 
dqdx_fix_K_R = cupy.einsum('ij,Adj->Adi', KR_symmetrized, dV_on_charge_dx) + + return dqdx_fix_K_R + +def get_dqsym_dx(pcmobj, dm, atmlst, intopt_derivative): + K = pcmobj._intermediates['K'] + inverse_K = cupy.linalg.inv(K) + return get_dqsym_dx_fix_vgrids(pcmobj, atmlst, inverse_K) + get_dqsym_dx_fix_K_R(pcmobj, dm, atmlst, inverse_K, intopt_derivative) + +def analytic_grad_vmat(pcmobj, dm, mo_coeff, mo_occ, atmlst=None, verbose=None): ''' dv_solv / da - slow version with finite difference ''' + if not pcmobj._intermediates: + pcmobj.build() + dm_cache = pcmobj._intermediates.get('dm', None) + if dm_cache is not None and cupy.linalg.norm(dm_cache - dm) < 1e-10: + pass + else: + pcmobj._get_vind(dm) + mol = pcmobj.mol log = logger.new_logger(pcmobj, verbose) t1 = log.init_timer() - pmol = pcmobj.mol.copy() - mol = pmol.copy() - if atmlst is None: - atmlst = range(mol.natm) + nao, nmo = mo_coeff.shape mocc = mo_coeff[:,mo_occ>0] nocc = mocc.shape[1] - dm = cupy.dot(mocc, mocc.T) * 2 - coords = mol.atom_coords(unit='Bohr') - # TODO: add those contributions - # contribution due to _get_v - # contribution due to linear solver - # contribution due to _get_vmat + if atmlst is None: + atmlst = range(mol.natm) + + gridslice = pcmobj.surface['gslice_by_atom'] + charge_exp = pcmobj.surface['charge_exp'] + grid_coords = pcmobj.surface['grid_coords'] + q_sym = pcmobj._intermediates['q_sym'] - vmat = cupy.zeros([len(atmlst), 3, nao, nocc]) + aoslice = mol.aoslice_by_atom() + aoslice = numpy.array(aoslice) + + intopt_fock = int3c1e.VHFOpt(mol) + intopt_fock.build(cutoff = 1e-14, aosym = True) + intopt_derivative = int3c1e.VHFOpt(mol) + intopt_derivative.build(cutoff = 1e-14, aosym = False) + + dIdx_mo = cupy.empty([len(atmlst), 3, nmo, nocc]) + + dIdA = int1e_grids_ip1(mol, grid_coords, charges = q_sym, intopt = intopt_derivative, charge_exponents = charge_exp**2) + for i_atom in atmlst: + p0,p1 = aoslice[i_atom, 2:] + # dIdx[i_atom, :, :, :] = 0 + # dIdx[i_atom, :, p0:p1, :] += dIdA[:, p0:p1, :] + # dIdx[i_atom, :, :, p0:p1] += dIdA[:, p0:p1, :].transpose(0,2,1) + dIdA_mo = dIdA[:, p0:p1, :] @ mocc + dIdA_mo = cupy.einsum('ip,dpj->dij', mo_coeff[p0:p1, :].T, dIdA_mo) + dIdB_mo = dIdA[:, p0:p1, :].transpose(0,2,1) @ mocc[p0:p1, :] + dIdB_mo = cupy.einsum('ip,dpj->dij', mo_coeff.T, dIdB_mo) + dIdx_mo[i_atom, :, :, :] = dIdA_mo + dIdB_mo + + for i_atom in atmlst: + g0,g1 = gridslice[i_atom] + dIdC = int1e_grids_ip2(mol, grid_coords[g0:g1,:], charges = q_sym[g0:g1], + intopt = intopt_derivative, charge_exponents = charge_exp[g0:g1]**2) + dIdC_mo = dIdC @ mocc + dIdC_mo = cupy.einsum('ip,dpj->dij', mo_coeff.T, dIdC_mo) + dIdx_mo[i_atom, :, :, :] += dIdC_mo + + dV_on_molecule_dx_mo = dIdx_mo + + dqdx = get_dqsym_dx(pcmobj, dm, atmlst, intopt_derivative) + for i_atom in atmlst: + for i_xyz in range(3): + dIdx_from_dqdx = int1e_grids(mol, grid_coords, charges = dqdx[i_atom, i_xyz, :], + intopt = intopt_fock, charge_exponents = charge_exp**2) + dV_on_molecule_dx_mo[i_atom, i_xyz, :, :] += mo_coeff.T @ dIdx_from_dqdx @ mocc t1 = log.timer_debug1('computing solvent grad veff', *t1) - pcmobj.reset(pmol) - return vmat -""" + return dV_on_molecule_dx_mo def make_hess_object(hess_method): if hess_method.base.with_solvent.frozen: @@ -274,7 +483,7 @@ def make_h1(self, mo_coeff, mo_occ, chkfile=None, atmlst=None, verbose=None): h1ao = super().make_h1(mo_coeff, mo_occ, atmlst=atmlst, verbose=verbose) if isinstance(self.base, scf.hf.RHF): dm = self.base.make_rdm1(ao_repr=True) - dv = fd_grad_vmat(self.base.with_solvent, 
dm, mo_coeff, mo_occ, atmlst=atmlst, verbose=verbose) + dv = analytic_grad_vmat(self.base.with_solvent, dm, mo_coeff, mo_occ, atmlst=atmlst, verbose=verbose) for i0, ia in enumerate(atmlst): h1ao[i0] += dv[i0] return h1ao @@ -283,8 +492,8 @@ def make_h1(self, mo_coeff, mo_occ, chkfile=None, atmlst=None, verbose=None): solvent = self.base.with_solvent dm = self.base.make_rdm1(ao_repr=True) dm = dm[0] + dm[1] - dva = fd_grad_vmat(solvent, dm, mo_coeff[0], mo_occ[0], atmlst=atmlst, verbose=verbose) - dvb = fd_grad_vmat(solvent, dm, mo_coeff[1], mo_occ[1], atmlst=atmlst, verbose=verbose) + dva = analytic_grad_vmat(solvent, dm, mo_coeff[0], mo_occ[0], atmlst=atmlst, verbose=verbose) + dvb = analytic_grad_vmat(solvent, dm, mo_coeff[1], mo_occ[1], atmlst=atmlst, verbose=verbose) for i0, ia in enumerate(atmlst): h1aoa[i0] += dva[i0] h1aob[i0] += dvb[i0] diff --git a/gpu4pyscf/solvent/hessian/smd.py b/gpu4pyscf/solvent/hessian/smd.py index 644c0e3f..49897d74 100644 --- a/gpu4pyscf/solvent/hessian/smd.py +++ b/gpu4pyscf/solvent/hessian/smd.py @@ -154,7 +154,7 @@ def make_h1(self, mo_coeff, mo_occ, chkfile=None, atmlst=None, verbose=None): h1ao = super().make_h1(mo_coeff, mo_occ, atmlst=atmlst, verbose=verbose) if isinstance(self.base, scf.hf.RHF): dm = self.base.make_rdm1(ao_repr=True) - dv = pcm_hess.fd_grad_vmat(self.base.with_solvent, dm, mo_coeff, mo_occ, atmlst=atmlst, verbose=verbose) + dv = pcm_hess.analytic_grad_vmat(self.base.with_solvent, dm, mo_coeff, mo_occ, atmlst=atmlst, verbose=verbose) for i0, ia in enumerate(atmlst): h1ao[i0] += dv[i0] return h1ao @@ -163,8 +163,8 @@ def make_h1(self, mo_coeff, mo_occ, chkfile=None, atmlst=None, verbose=None): solvent = self.base.with_solvent dm = self.base.make_rdm1(ao_repr=True) dm = dm[0] + dm[1] - dva = pcm_hess.fd_grad_vmat(solvent, dm, mo_coeff[0], mo_occ[0], atmlst=atmlst, verbose=verbose) - dvb = pcm_hess.fd_grad_vmat(solvent, dm, mo_coeff[1], mo_occ[1], atmlst=atmlst, verbose=verbose) + dva = pcm_hess.analytic_grad_vmat(solvent, dm, mo_coeff[0], mo_occ[0], atmlst=atmlst, verbose=verbose) + dvb = pcm_hess.analytic_grad_vmat(solvent, dm, mo_coeff[1], mo_occ[1], atmlst=atmlst, verbose=verbose) for i0, ia in enumerate(atmlst): h1aoa[i0] += dva[i0] h1aob[i0] += dvb[i0] diff --git a/gpu4pyscf/solvent/tests/test_pcm_hessian.py b/gpu4pyscf/solvent/tests/test_pcm_hessian.py index 33bf0e67..c7076f29 100644 --- a/gpu4pyscf/solvent/tests/test_pcm_hessian.py +++ b/gpu4pyscf/solvent/tests/test_pcm_hessian.py @@ -14,12 +14,15 @@ import unittest import numpy as np +import cupy as cp import pyscf import pytest from pyscf import gto from gpu4pyscf.solvent import pcm from gpu4pyscf import scf, dft from packaging import version +from gpu4pyscf.solvent.hessian.pcm import analytic_grad_vmat +from gpu4pyscf.lib.cupy_helper import contract pyscf_25 = version.parse(pyscf.__version__) <= version.parse('2.5.0') @@ -50,7 +53,7 @@ def _make_mf(method='C-PCM', restricted=True, density_fit=True): mf = dft.rks.RKS(mol, xc=xc) else: mf = dft.uks.UKS(mol, xc=xc) - + if density_fit: mf = mf.density_fit() mf = mf.PCM() @@ -89,6 +92,44 @@ def _check_hessian(mf, h, ix=0, iy=0): print(f'Norm of H({ix},{iy}) diff, {np.linalg.norm(h[ix,:,iy,:] - h_fd)}') assert(np.linalg.norm(h[ix,:,iy,:] - h_fd) < tol) +def _fd_grad_vmat(pcmobj, dm, mo_coeff, mo_occ, atmlst=None): + ''' + dv_solv / da + slow version with finite difference + ''' + pmol = pcmobj.mol.copy() + mol = pmol.copy() + if atmlst is None: + atmlst = range(mol.natm) + nao, nmo = mo_coeff.shape + mocc = mo_coeff[:,mo_occ>0] + 
nocc = mocc.shape[1] + coords = mol.atom_coords(unit='Bohr') + def pcm_vmat_scanner(mol): + pcmobj.reset(mol) + e, v = pcmobj._get_vind(dm) + return v + + mol.verbose = 0 + vmat = cp.empty([len(atmlst), 3, nao, nocc]) + eps = 1e-5 + for i0, ia in enumerate(atmlst): + for ix in range(3): + dv = np.zeros_like(coords) + dv[ia,ix] = eps + mol.set_geom_(coords + dv, unit='Bohr') + vmat0 = pcm_vmat_scanner(mol) + + mol.set_geom_(coords - dv, unit='Bohr') + vmat1 = pcm_vmat_scanner(mol) + + grad_vmat = (vmat0 - vmat1)/2.0/eps + grad_vmat = contract("ij,jq->iq", grad_vmat, mocc) + grad_vmat = contract("iq,ip->pq", grad_vmat, mo_coeff) + vmat[i0,ix] = grad_vmat + pcmobj.reset(pmol) + return vmat + @unittest.skipIf(pcm.libsolvent is None, "solvent extension not compiled") class KnownValues(unittest.TestCase): def test_df_hess_cpcm(self): @@ -142,6 +183,48 @@ def test_uks_hess_iefpcm(self): _check_hessian(mf, h, ix=0, iy=0) _check_hessian(mf, h, ix=0, iy=1) + def test_grad_vmat_cpcm(self): + print("testing C-PCM dV_solv/dx") + mf = _make_mf(method='C-PCM') + hobj = mf.Hessian() + + dm = mf.make_rdm1() + mo_coeff = mf.mo_coeff + mo_occ = mf.mo_occ + + test_grad_vmat = analytic_grad_vmat(hobj.base.with_solvent, dm, mo_coeff, mo_occ) + ref_grad_vmat = _fd_grad_vmat(hobj.base.with_solvent, dm, mo_coeff, mo_occ) + + cp.testing.assert_allclose(ref_grad_vmat, test_grad_vmat, atol = 1e-10) + + def test_grad_vmat_iefpcm(self): + print("testing IEF-PCM dV_solv/dx") + mf = _make_mf(method='IEF-PCM') + hobj = mf.Hessian() + + dm = mf.make_rdm1() + mo_coeff = mf.mo_coeff + mo_occ = mf.mo_occ + + test_grad_vmat = analytic_grad_vmat(hobj.base.with_solvent, dm, mo_coeff, mo_occ) + ref_grad_vmat = _fd_grad_vmat(hobj.base.with_solvent, dm, mo_coeff, mo_occ) + + cp.testing.assert_allclose(ref_grad_vmat, test_grad_vmat, atol = 1e-10) + + def test_grad_vmat_ssvpe(self): + print("testing SS(V)PE dV_solv/dx") + mf = _make_mf(method='SS(V)PE') + hobj = mf.Hessian() + + dm = mf.make_rdm1() + mo_coeff = mf.mo_coeff + mo_occ = mf.mo_occ + + test_grad_vmat = analytic_grad_vmat(hobj.base.with_solvent, dm, mo_coeff, mo_occ) + ref_grad_vmat = _fd_grad_vmat(hobj.base.with_solvent, dm, mo_coeff, mo_occ) + + cp.testing.assert_allclose(ref_grad_vmat, test_grad_vmat, atol = 1e-10) + @pytest.mark.skipif(pyscf_25, reason='requires pyscf 2.6 or higher') def test_to_gpu(self): import pyscf @@ -187,7 +270,7 @@ def test_to_cpu(self): mol.basis = 'sto-3g' mol.output = '/dev/null' mol.build(verbose=0) - + mf = dft.RKS(mol, xc='b3lyp').PCM() mf.conv_tol = 1e-12 mf.conv_tol_cpscf = 1e-7 @@ -209,6 +292,7 @@ def test_to_cpu(self): hessobj = hessobj.to_cpu() hess_cpu = hessobj.kernel() assert np.linalg.norm(hess_cpu - hess_gpu) < 1e-5 + if __name__ == "__main__": print("Full Tests for Hessian of PCMs") unittest.main()
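
Note (editorial summary, not part of the diff): the splitting used by the new `get_dqsym_dx` helpers follows the standard matrix-calculus identity for the PCM surface charge, with signs and the q_sym symmetrization handled according to the conventions of `get_dD_dS`:

q = K^{-1} R\, v, \qquad
\frac{\partial q}{\partial x}
  = K^{-1}\!\left(\frac{\partial R}{\partial x}\, v
                  - \frac{\partial K}{\partial x}\, q\right)
  + K^{-1} R\, \frac{\partial v}{\partial x},
\qquad
\frac{\partial K^{-1}}{\partial x} = -K^{-1}\,\frac{\partial K}{\partial x}\,K^{-1}.

The first parenthesis (grid potential v held fixed) is what `get_dqsym_dx_fix_vgrids` assembles for each solvent model, and the last term (K and R held fixed) is `get_dqsym_dx_fix_K_R`; for the symmetrized charge the code additionally carries the transposed K^{-T}, R^{T} contributions and the factor of 0.5 visible above.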
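
A minimal numpy sketch of that first-order splitting (illustrative only; K, R, v below are random stand-ins, not the PCM matrices, and the PCM sign/symmetrization conventions are omitted):

import numpy as np

# Random, well-conditioned stand-ins for the PCM K and R matrices and grid potential v.
rng = np.random.default_rng(0)
n = 8
K = rng.standard_normal((n, n)) + n * np.eye(n)   # diagonally dominant -> safely invertible
R = rng.standard_normal((n, n))
v = rng.standard_normal(n)

# Small perturbations standing in for (dK/dx)*dx, (dR/dx)*dx, (dv/dx)*dx.
h = 1e-6
dK = rng.standard_normal((n, n)) * h
dR = rng.standard_normal((n, n)) * h
dv = rng.standard_normal(n) * h

q0 = np.linalg.solve(K, R @ v)
q1 = np.linalg.solve(K + dK, (R + dR) @ (v + dv))

inv_K = np.linalg.inv(K)
# "fix_vgrids"-type terms (v held fixed) plus "fix_K_R"-type term (K, R held fixed).
dq_fix_vgrids = inv_K @ (dR @ v - dK @ q0)
dq_fix_K_R = inv_K @ (R @ dv)
dq_first_order = dq_fix_vgrids + dq_fix_K_R

# The residual is O(h^2), i.e. orders of magnitude smaller than |q1 - q0| itself.
print(np.linalg.norm((q1 - q0) - dq_first_order), np.linalg.norm(q1 - q0))

This mirrors how the analytic result is validated in the new tests: the analytic derivative is compared against a central finite-difference reference (`_fd_grad_vmat`) rather than computed from it.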