diff --git a/.github/workflows/pypi_wheel.yml b/.github/workflows/pypi_wheel.yml
index e350cd40..bf0565af 100644
--- a/.github/workflows/pypi_wheel.yml
+++ b/.github/workflows/pypi_wheel.yml
@@ -28,7 +28,7 @@ jobs:
         ls ${{ github.workspace }}/wheelhouse
     - name: Publish to PyPI
       run: |
-        pip install twine
+        pip install twine==6.0.1
         export TWINE_USERNAME=__token__
         export TWINE_PASSWORD="${{ secrets.PYPI_API_TOKEN }}"
         twine upload --verbose "${{ github.workspace }}/wheelhouse/*"
@@ -51,7 +51,7 @@ jobs:
         ls ${{ github.workspace }}/wheelhouse
     - name: Publish to PyPI
       run: |
-        pip install twine
+        pip install twine==6.0.1
         export TWINE_USERNAME=__token__
         export TWINE_PASSWORD="${{ secrets.PYPI_API_TOKEN }}"
         twine upload --verbose "${{ github.workspace }}/wheelhouse/*"
@@ -66,7 +66,7 @@ jobs:
         python3 setup.py sdist
     - name: Publish to PyPI
       run: |
-        pip install twine
+        pip install twine==6.0.1
         export TWINE_USERNAME=__token__
         export TWINE_PASSWORD="${{ secrets.PYPI_API_TOKEN }}"
         twine upload --verbose "${{ github.workspace }}/dist/*"
diff --git a/CHANGELOG b/CHANGELOG
index 7f747686..a95f5108 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -1,3 +1,12 @@
+v1.3.1 (2025-02-04)
+-------------------
+* New Features
+  - Analytical Hessian for PCM solvent model
+  - Driver for 3c methods (wB97X-3c, r2SCAN-3c, B97-3c, etc.)
+* Improvements
+  - Preconditioner and computational efficiency of Davidson iterations for TDDFT
+
+
 v1.3.0 (2025-01-07)
 -------------------
 * New Features
diff --git a/examples/40-all_electron_scf.py b/examples/40-all_electron_scf.py
new file mode 100644
index 00000000..a33f2953
--- /dev/null
+++ b/examples/40-all_electron_scf.py
@@ -0,0 +1,50 @@
+#!/usr/bin/env python
+# Copyright 2025 The PySCF Developers. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+'''
+All-electron Gamma-point and k-point sampled Hartree-Fock/DFT using density fitting approximation
+'''
+
+import numpy as np
+import pyscf
+
+cell = pyscf.M(
+    a = np.eye(3)*3.5668,
+    atom = '''C 0. 0. 0.
+              C 0.8917 0.8917 0.8917
+              C 1.7834 1.7834 0.
+              C 2.6751 2.6751 0.8917
+              C 1.7834 0. 1.7834
+              C 2.6751 0.8917 2.6751
+              C 0. 
1.7834 1.7834 + C 0.8917 2.6751 2.6751''', + basis = 'ccpvdz', + verbose = 5, +) + +# +# Gamma point HF and DFT +# +mf = cell.RHF().to_gpu().density_fit().run() + +mf = cell.RKS(xc='pbe0').to_gpu().density_fit().run() + +# +# K-point sampled HF and DFT +# +kpts = cell.make_kpts([2,2,2]) +kmf = cell.KRHF(kpts=kpts).to_gpu().density_fit().run() + +kmf = cell.KRKS(xc='pbe0', kpts=kpts).to_gpu().density_fit().run() diff --git a/gpu4pyscf/__config__.py b/gpu4pyscf/__config__.py index 1bb12f85..bfd8d8d8 100644 --- a/gpu4pyscf/__config__.py +++ b/gpu4pyscf/__config__.py @@ -14,11 +14,11 @@ import cupy -_num_devices = cupy.cuda.runtime.getDeviceCount() +num_devices = cupy.cuda.runtime.getDeviceCount() # TODO: switch to non_blocking stream (currently blocked by libxc) -_streams = [None] * _num_devices -for device_id in range(_num_devices): +_streams = [None] * num_devices +for device_id in range(num_devices): with cupy.cuda.Device(device_id): _streams[device_id] = cupy.cuda.stream.Stream(non_blocking=False) @@ -38,11 +38,16 @@ mem_fraction = 0.9 cupy.get_default_memory_pool().set_limit(fraction=mem_fraction) +if props['sharedMemPerBlockOptin'] > 65536: + shm_size = props['sharedMemPerBlockOptin'] +else: + shm_size = props['sharedMemPerBlock'] + # Check P2P data transfer is available _p2p_access = True -if _num_devices > 1: - for src in range(_num_devices): - for dst in range(_num_devices): +if num_devices > 1: + for src in range(num_devices): + for dst in range(num_devices): if src != dst: can_access_peer = cupy.cuda.runtime.deviceCanAccessPeer(src, dst) _p2p_access &= can_access_peer diff --git a/gpu4pyscf/__init__.py b/gpu4pyscf/__init__.py index af2c2982..4cd95fbc 100644 --- a/gpu4pyscf/__init__.py +++ b/gpu4pyscf/__init__.py @@ -12,6 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = '1.3.0' +__version__ = '1.3.1' from . 
import lib, grad, hessian, solvent, scf, dft, tdscf diff --git a/gpu4pyscf/df/df.py b/gpu4pyscf/df/df.py index da61804c..c58c1428 100644 --- a/gpu4pyscf/df/df.py +++ b/gpu4pyscf/df/df.py @@ -25,7 +25,7 @@ from gpu4pyscf.df import int3c2e, df_jk from gpu4pyscf.lib import logger from gpu4pyscf import __config__ -from gpu4pyscf.__config__ import _streams, _num_devices +from gpu4pyscf.__config__ import _streams, num_devices MIN_BLK_SIZE = getattr(__config__, 'min_ao_blksize', 128) ALIGNED = getattr(__config__, 'ao_aligned', 32) @@ -218,7 +218,7 @@ def cholesky_eri_gpu(intopt, mol, auxmol, cd_low, # CDERI will be equally distributed to the devices # Other devices usually have more memory available than Device 0 # CDERI will use up to 40% of the available memory - use_gpu_memory = naux * npairs * 8 < 0.4 * avail_mem * _num_devices + use_gpu_memory = naux * npairs * 8 < 0.4 * avail_mem * num_devices if use_gpu_memory: log.debug("Saving CDERI on GPU") @@ -226,9 +226,9 @@ def cholesky_eri_gpu(intopt, mol, auxmol, cd_low, log.debug("Saving CDERI on CPU") _cderi = {} - aux_blksize = (naux + _num_devices - 1) // _num_devices + aux_blksize = (naux + num_devices - 1) // num_devices aux_blksize = (aux_blksize + ALIGNED - 1) // ALIGNED * ALIGNED - for device_id in range(_num_devices): + for device_id in range(num_devices): p0 = min(aux_blksize*device_id, naux) p1 = min(aux_blksize*(device_id+1), naux) #for device_id, (p0,p1) in enumerate(lib.prange(0, naux, aux_blksize)): @@ -246,16 +246,16 @@ def cholesky_eri_gpu(intopt, mol, auxmol, cd_low, npairs_per_ctr = np.array(npairs_per_ctr) total_task_list = np.argsort(npairs_per_ctr) task_list_per_device = [] - for device_id in range(_num_devices): - task_list_per_device.append(total_task_list[device_id::_num_devices]) + for device_id in range(num_devices): + task_list_per_device.append(total_task_list[device_id::num_devices]) cd_low_f = cupy.array(cd_low, order='F', copy=False) cd_low_f = tag_array(cd_low_f, tag=cd_low.tag) cupy.cuda.get_current_stream().synchronize() futures = [] - with ThreadPoolExecutor(max_workers=_num_devices) as executor: - for device_id in range(_num_devices): + with ThreadPoolExecutor(max_workers=num_devices) as executor: + for device_id in range(num_devices): task_list = task_list_per_device[device_id] future = executor.submit(_cderi_task, intopt, cd_low_f, task_list, _cderi, aux_blksize, omega=omega, sr_only=sr_only, device_id=device_id) @@ -352,7 +352,7 @@ def _cderi_task(intopt, cd_low, task_list, _cderi, aux_blksize, for slice_id, (p0,p1) in enumerate(lib.prange(0, naux, aux_blksize)): tmp = cupy.array(cderi_block[p0:p1], order='C', copy=True) copy_array(tmp, _cderi[slice_id][:p1-p0,ij0:ij1]) - elif _num_devices > 1: + elif num_devices > 1: # Multi-GPU case, copy data to other Devices for dev_id, (p0,p1) in enumerate(lib.prange(0, naux, aux_blksize)): # Making a copy for contiguous data transfer diff --git a/gpu4pyscf/df/df_jk.py b/gpu4pyscf/df/df_jk.py index 5561cf9c..66f1dd49 100644 --- a/gpu4pyscf/df/df_jk.py +++ b/gpu4pyscf/df/df_jk.py @@ -26,7 +26,7 @@ from gpu4pyscf.dft import rks, uks, numint from gpu4pyscf.scf import hf, uhf from gpu4pyscf.df import df, int3c2e -from gpu4pyscf.__config__ import _streams, _num_devices +from gpu4pyscf.__config__ import _streams, num_devices def _pin_memory(array): mem = cupy.cuda.alloc_pinned_memory(array.nbytes) @@ -453,8 +453,8 @@ def get_jk(dfobj, dms_tag, hermi=0, with_j=True, with_k=True, direct_scf_tol=1e- mo_coeff = intopt.sort_orbitals(mo_coeff, axis=[1]) futures = [] - with 
ThreadPoolExecutor(max_workers=_num_devices) as executor: - for device_id in range(_num_devices): + with ThreadPoolExecutor(max_workers=num_devices) as executor: + for device_id in range(num_devices): future = executor.submit( _jk_task_with_mo, dfobj, dms, mo_coeff, mo_occ, @@ -474,8 +474,8 @@ def get_jk(dfobj, dms_tag, hermi=0, with_j=True, with_k=True, direct_scf_tol=1e- mo1s = [intopt.sort_orbitals(mo1, axis=[1]) for mo1 in mo1s] futures = [] - with ThreadPoolExecutor(max_workers=_num_devices) as executor: - for device_id in range(_num_devices): + with ThreadPoolExecutor(max_workers=num_devices) as executor: + for device_id in range(num_devices): future = executor.submit( _jk_task_with_mo1, dfobj, dms, mo1s, occ_coeffs, @@ -486,8 +486,8 @@ def get_jk(dfobj, dms_tag, hermi=0, with_j=True, with_k=True, direct_scf_tol=1e- # general K matrix with density matrix else: futures = [] - with ThreadPoolExecutor(max_workers=_num_devices) as executor: - for device_id in range(_num_devices): + with ThreadPoolExecutor(max_workers=num_devices) as executor: + for device_id in range(num_devices): future = executor.submit( _jk_task_with_dm, dfobj, dms, hermi=hermi, device_id=device_id, diff --git a/gpu4pyscf/df/grad/jk.py b/gpu4pyscf/df/grad/jk.py index 2bbf9d9e..4595af65 100644 --- a/gpu4pyscf/df/grad/jk.py +++ b/gpu4pyscf/df/grad/jk.py @@ -18,7 +18,7 @@ from gpu4pyscf.df.int3c2e import get_int3c2e_ip_jk, VHFOpt, _split_tasks from gpu4pyscf.lib.cupy_helper import contract, concatenate, reduce_to_device from gpu4pyscf.lib import logger -from gpu4pyscf.__config__ import _streams, _num_devices +from gpu4pyscf.__config__ import _streams, num_devices def _jk_task(with_df, dm, orbo, with_j=True, with_k=True, device_id=0): ''' # (L|ij) -> rhoj: (L), rhok: (L|oo) @@ -61,8 +61,8 @@ def get_rhojk(with_df, dm, orbo, with_j=True, with_k=True): ''' futures = [] cupy.cuda.get_current_stream().synchronize() - with ThreadPoolExecutor(max_workers=_num_devices) as executor: - for device_id in range(_num_devices): + with ThreadPoolExecutor(max_workers=num_devices) as executor: + for device_id in range(num_devices): future = executor.submit( _jk_task, with_df, dm, orbo, with_j=with_j, with_k=with_k, device_id=device_id) @@ -161,12 +161,12 @@ def get_grad_vjk(with_df, mol, auxmol, rhoj_cart, dm_cart, rhok_cart, orbo_cart, aux_ao_loc = np.array(intopt.aux_ao_loc) loads = aux_ao_loc[1:] - aux_ao_loc[:-1] - task_list = _split_tasks(loads, _num_devices) + task_list = _split_tasks(loads, num_devices) futures = [] cupy.cuda.get_current_stream().synchronize() - with ThreadPoolExecutor(max_workers=_num_devices) as executor: - for device_id in range(_num_devices): + with ThreadPoolExecutor(max_workers=num_devices) as executor: + for device_id in range(num_devices): future = executor.submit( _jk_ip_task, intopt, rhoj_cart, dm_cart, rhok_cart, orbo_cart, task_list[device_id], with_j=with_j, with_k=with_k, device_id=device_id, omega=omega) diff --git a/gpu4pyscf/df/hessian/jk.py b/gpu4pyscf/df/hessian/jk.py index 40ab3bfd..5baff1d0 100644 --- a/gpu4pyscf/df/hessian/jk.py +++ b/gpu4pyscf/df/hessian/jk.py @@ -23,7 +23,7 @@ from gpu4pyscf.hessian.jk import _ao2mo from gpu4pyscf.lib import logger from gpu4pyscf.lib.cupy_helper import contract, cart2sph, reduce_to_device -from gpu4pyscf.__config__ import _streams, _num_devices +from gpu4pyscf.__config__ import _streams, num_devices NROOT_ON_GPU = 7 @@ -171,8 +171,8 @@ def get_jk(dfobj, dms_tag, mo_coeff, mocc, hermi=0, mo_coeff = [intopt.sort_orbitals(mo, axis=[0]) for mo in mo_coeff] 
futures = [] - with ThreadPoolExecutor(max_workers=_num_devices) as executor: - for device_id in range(_num_devices): + with ThreadPoolExecutor(max_workers=num_devices) as executor: + for device_id in range(num_devices): future = executor.submit( _jk_task_with_mo1, dfobj, dms, mo_coeff, mo1s, occ_coeffs, @@ -415,12 +415,12 @@ def get_int3c2e_hjk(intopt, rhoj, rhok, dm0_tag, with_j=True, with_k=True, ncp_ij = len(intopt.log_qs) tasks = np.array(list(itertools.product(range(ncp_k), range(ncp_ij)))) task_list = [] - for device_id in range(_num_devices): - task_list.append(tasks[device_id::_num_devices]) + for device_id in range(num_devices): + task_list.append(tasks[device_id::num_devices]) cupy.cuda.get_current_stream().synchronize() - with ThreadPoolExecutor(max_workers=_num_devices) as executor: - for device_id in range(_num_devices): + with ThreadPoolExecutor(max_workers=num_devices) as executor: + for device_id in range(num_devices): future = executor.submit( _int3c2e_ipip_tasks, intopt, task_list[device_id], rhoj, rhok, dm0_tag, orbo, with_j=with_j, with_k=with_k, diff --git a/gpu4pyscf/df/hessian/rks.py b/gpu4pyscf/df/hessian/rks.py index e0d5cd90..321c9654 100644 --- a/gpu4pyscf/df/hessian/rks.py +++ b/gpu4pyscf/df/hessian/rks.py @@ -46,8 +46,8 @@ def partial_hess_elec(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, mocc = mo_coeff[:,mo_occ>0] dm0 = numpy.dot(mocc, mocc.T) * 2 - if mf.nlc != '': - raise NotImplementedError + if mf.do_nlc(): + raise NotImplementedError("2nd derivative of NLC is not implemented.") omega, alpha, hyb = mf._numint.rsh_and_hybrid_coeff(mf.xc, spin=mol.spin) with_k = mf._numint.libxc.is_hybrid_xc(mf.xc) diff --git a/gpu4pyscf/df/hessian/uks.py b/gpu4pyscf/df/hessian/uks.py index 059f571c..99661740 100644 --- a/gpu4pyscf/df/hessian/uks.py +++ b/gpu4pyscf/df/hessian/uks.py @@ -48,8 +48,8 @@ def partial_hess_elec(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, moccb = mo_coeff[1][:,mo_occ[1]>0] dm0a = numpy.dot(mocca, mocca.T) dm0b = numpy.dot(moccb, moccb.T) - if mf.nlc != '': - raise NotImplementedError + if mf.do_nlc(): + raise NotImplementedError("2nd derivative of NLC is not implemented.") omega, alpha, hyb = mf._numint.rsh_and_hybrid_coeff(mf.xc, spin=mol.spin) with_k = mf._numint.libxc.is_hybrid_xc(mf.xc) diff --git a/gpu4pyscf/df/int3c2e.py b/gpu4pyscf/df/int3c2e.py index e77e30ca..28e7e49e 100644 --- a/gpu4pyscf/df/int3c2e.py +++ b/gpu4pyscf/df/int3c2e.py @@ -24,7 +24,7 @@ reduce_to_device, copy_array, transpose_sum) from gpu4pyscf.lib import logger from gpu4pyscf.gto.mole import basis_seg_contraction -from gpu4pyscf.__config__ import _num_devices, _streams +from gpu4pyscf.__config__ import num_devices, _streams LMAX_ON_GPU = 8 FREE_CUPY_CACHE = True @@ -824,11 +824,11 @@ def get_int3c2e_jk(mol, auxmol, dm0_tag, with_k=True, omega=None): futures = [] aux_ao_loc = np.array(intopt.aux_ao_loc) loads = aux_ao_loc[1:] - aux_ao_loc[:-1] - task_list = _split_tasks(loads, _num_devices) + task_list = _split_tasks(loads, num_devices) cupy.cuda.get_current_stream().synchronize() - with ThreadPoolExecutor(max_workers=_num_devices) as executor: - for device_id in range(_num_devices): + with ThreadPoolExecutor(max_workers=num_devices) as executor: + for device_id in range(num_devices): future = executor.submit( _int3c2e_jk_task, intopt, task_list[device_id], dm0_tag, orbo, device_id=device_id, omega=omega) @@ -935,11 +935,11 @@ def get_int3c2e_ip1_vjk(intopt, rhoj, rhok, dm0_tag, aoslices, with_j=True, aux_ao_loc = np.array(intopt.aux_ao_loc) loads = 
aux_ao_loc[1:] - aux_ao_loc[:-1] - task_list = _split_tasks(loads, _num_devices) + task_list = _split_tasks(loads, num_devices) cupy.cuda.get_current_stream().synchronize() - with ThreadPoolExecutor(max_workers=_num_devices) as executor: - for device_id in range(_num_devices): + with ThreadPoolExecutor(max_workers=num_devices) as executor: + for device_id in range(num_devices): future = executor.submit( _int3c2e_ip1_vjk_task, intopt, task_list[device_id], rhoj, rhok, dm0_tag, orbo, with_j=with_j, with_k=with_k, @@ -1033,11 +1033,11 @@ def get_int3c2e_ip2_vjk(intopt, rhoj, rhok, dm0_tag, auxslices, aux_ao_loc = np.array(intopt.aux_ao_loc) loads = aux_ao_loc[1:] - aux_ao_loc[:-1] - task_list = _split_tasks(loads, _num_devices) + task_list = _split_tasks(loads, num_devices) cupy.cuda.get_current_stream().synchronize() - with ThreadPoolExecutor(max_workers=_num_devices) as executor: - for device_id in range(_num_devices): + with ThreadPoolExecutor(max_workers=num_devices) as executor: + for device_id in range(num_devices): future = executor.submit( _int3c2e_ip2_vjk_task, intopt, task_list[device_id], rhoj, rhok, dm0_tag, orbo, with_j=with_j, @@ -1096,7 +1096,7 @@ def get_int3c2e_ip1_wjk(intopt, dm0_tag, with_k=True, omega=None): aux_ao_loc = np.array(intopt.aux_ao_loc) loads = aux_ao_loc[1:] - aux_ao_loc[:-1] - task_list = _split_tasks(loads, _num_devices) + task_list = _split_tasks(loads, num_devices) nao = intopt.mol.nao naux = intopt.auxmol.nao @@ -1107,8 +1107,8 @@ def get_int3c2e_ip1_wjk(intopt, dm0_tag, with_k=True, omega=None): wk = np.ndarray([naux,nao,nocc,3], dtype=np.float64, order='C', buffer=mem) cupy.cuda.get_current_stream().synchronize() - with ThreadPoolExecutor(max_workers=_num_devices) as executor: - for device_id in range(_num_devices): + with ThreadPoolExecutor(max_workers=num_devices) as executor: + for device_id in range(num_devices): future = executor.submit( _int3c2e_ip1_wjk_task, intopt, task_list[device_id], dm0_tag, orbo, wk, with_k=with_k, device_id=device_id, omega=omega) @@ -1156,11 +1156,11 @@ def get_int3c2e_ip2_wjk(intopt, dm0_tag, with_k=True, omega=None): aux_ao_loc = np.array(intopt.aux_ao_loc) loads = aux_ao_loc[1:] - aux_ao_loc[:-1] - task_list = _split_tasks(loads, _num_devices) + task_list = _split_tasks(loads, num_devices) cupy.cuda.get_current_stream().synchronize() - with ThreadPoolExecutor(max_workers=_num_devices) as executor: - for device_id in range(_num_devices): + with ThreadPoolExecutor(max_workers=num_devices) as executor: + for device_id in range(num_devices): future = executor.submit( _int3c2e_ip2_wjk, intopt, task_list[device_id], dm0_tag, orbo, with_k=with_k, device_id=device_id, omega=omega) diff --git a/gpu4pyscf/dft/gen_grid.py b/gpu4pyscf/dft/gen_grid.py index 2908b9a3..9dd1d813 100644 --- a/gpu4pyscf/dft/gen_grid.py +++ b/gpu4pyscf/dft/gen_grid.py @@ -30,9 +30,10 @@ import cupy from pyscf import lib from pyscf import gto +from pyscf.dft import gen_grid as gen_grid_cpu +from gpu4pyscf.lib import utils from pyscf.gto.eval_gto import BLKSIZE, NBINS, CUTOFF, make_screen_index from pyscf import __config__ -from cupyx.scipy.spatial.distance import cdist from gpu4pyscf.lib import logger from gpu4pyscf.dft import radi from gpu4pyscf.lib.cupy_helper import load_library @@ -72,13 +73,17 @@ def sg1_prune(nuc, rads, n_ang, radii=radi.SG1RADII): ''' # In SG1 the ang grids for the five regions # 6 38 86 194 86 - leb_ngrid = cupy.array([6, 38, 86, 194, 86]) - alphas = cupy.array(( + if nuc >= 19: + return 194 * numpy.ones_like(rads, 
dtype=numpy.int64) + + leb_ngrid = numpy.array([6, 38, 86, 194, 86], dtype=numpy.int64) + alphas = numpy.array(( (0.25 , 0.5, 1.0, 4.5), (0.1667, 0.5, 0.9, 3.5), (0.1 , 0.4, 0.8, 2.5))) + r_atom = radii[nuc] + 1e-200 - rads = cupy.asarray(rads) + rads = numpy.asarray(rads) if nuc <= 2: # H, He place = ((rads/r_atom).reshape(-1,1) > alphas[0]).sum(axis=1) elif nuc <= 10: # Li - Ne @@ -463,8 +468,6 @@ def _load_conf(mod, name, default): else: return var -from pyscf.dft import gen_grid -from gpu4pyscf.lib import utils class Grids(lib.StreamObject): from gpu4pyscf.lib.utils import to_gpu, device @@ -481,9 +484,10 @@ class Grids(lib.StreamObject): level = getattr(__config__, 'dft_gen_grid_Grids_level', 3) alignment = ALIGNMENT_UNIT cutoff = CUTOFF - _keys = gen_grid.Grids._keys + _keys = gen_grid_cpu.Grids._keys - __init__ = gen_grid.Grids.__init__ + __init__ = gen_grid_cpu.Grids.__init__ + dump_flags = gen_grid_cpu.Grids.dump_flags def __setattr__(self, key, val): if key in ('atom_grid', 'atomic_radii', 'radii_adjust', 'radi_method', @@ -581,12 +585,12 @@ def prune_by_density_(self, rho, threshold=0): return self def to_cpu(self): - grids = gen_grid.Grids(self.mol) + grids = gen_grid_cpu.Grids(self.mol) utils.to_cpu(self, out=grids) return grids -_default_rad = gen_grid._default_rad -RAD_GRIDS = gen_grid.RAD_GRIDS -_default_ang = gen_grid._default_ang -ANG_ORDER = gen_grid.ANG_ORDER -_padding_size = gen_grid._padding_size +_default_rad = gen_grid_cpu._default_rad +RAD_GRIDS = gen_grid_cpu.RAD_GRIDS +_default_ang = gen_grid_cpu._default_ang +ANG_ORDER = gen_grid_cpu.ANG_ORDER +_padding_size = gen_grid_cpu._padding_size diff --git a/gpu4pyscf/dft/numint.py b/gpu4pyscf/dft/numint.py index bf6c65c9..bb98e857 100644 --- a/gpu4pyscf/dft/numint.py +++ b/gpu4pyscf/dft/numint.py @@ -28,7 +28,7 @@ from gpu4pyscf.dft import xc_deriv, xc_alias, libxc from gpu4pyscf import __config__ from gpu4pyscf.lib import logger -from gpu4pyscf.__config__ import _streams, _num_devices +from gpu4pyscf.__config__ import _streams, num_devices LMAX_ON_GPU = 6 BAS_ALIGNED = 1 @@ -395,7 +395,7 @@ def gen_grid_range(ngrids, device_id, blksize=MIN_BLK_SIZE): ''' Calculate the range of grids assigned the given device ''' - ngrids_per_device = (ngrids + _num_devices - 1) // _num_devices + ngrids_per_device = (ngrids + num_devices - 1) // num_devices ngrids_per_device = (ngrids_per_device + blksize - 1) // blksize * blksize grid_start = min(device_id * ngrids_per_device, ngrids) grid_end = min((device_id + 1) * ngrids_per_device, ngrids) @@ -523,8 +523,8 @@ def nr_rks(ni, mol, grids, xc_code, dms, relativity=0, hermi=1, release_gpu_stack() cupy.cuda.get_current_stream().synchronize() futures = [] - with ThreadPoolExecutor(max_workers=_num_devices) as executor: - for device_id in range(_num_devices): + with ThreadPoolExecutor(max_workers=num_devices) as executor: + for device_id in range(num_devices): future = executor.submit( _nr_rks_task, ni, mol, grids, xc_code, dms, mo_coeff, mo_occ, @@ -914,8 +914,8 @@ def nr_uks(ni, mol, grids, xc_code, dms, relativity=0, hermi=1, release_gpu_stack() cupy.cuda.get_current_stream().synchronize() futures = [] - with ThreadPoolExecutor(max_workers=_num_devices) as executor: - for device_id in range(_num_devices): + with ThreadPoolExecutor(max_workers=num_devices) as executor: + for device_id in range(num_devices): future = executor.submit( _nr_uks_task, ni, mol, grids, xc_code, (dma,dmb), mo_coeff, mo_occ, @@ -1026,7 +1026,7 @@ def _nr_rks_fxc_task(ni, mol, grids, xc_code, fxc, dms, mo1, 
occ_coeff, ao_deriv = 1 ngrids_glob = grids.coords.shape[0] - ngrids_per_device = (ngrids_glob + _num_devices - 1) // _num_devices + ngrids_per_device = (ngrids_glob + num_devices - 1) // num_devices ngrids_per_device = (ngrids_per_device + MIN_BLK_SIZE - 1) // MIN_BLK_SIZE * MIN_BLK_SIZE grid_start = min(device_id * ngrids_per_device, ngrids_glob) grid_end = min((device_id + 1) * ngrids_per_device, ngrids_glob) @@ -1108,8 +1108,8 @@ def nr_rks_fxc(ni, mol, grids, xc_code, dm0=None, dms=None, relativity=0, hermi= futures = [] cupy.cuda.get_current_stream().synchronize() - with ThreadPoolExecutor(max_workers=_num_devices) as executor: - for device_id in range(_num_devices): + with ThreadPoolExecutor(max_workers=num_devices) as executor: + for device_id in range(num_devices): future = executor.submit( _nr_rks_fxc_task, ni, mol, grids, xc_code, fxc, dms, mo1, occ_coeff, @@ -1178,7 +1178,7 @@ def _nr_uks_fxc_task(ni, mol, grids, xc_code, fxc, dms, mo1, occ_coeff, ao_deriv = 1 ngrids_glob = grids.coords.shape[0] - ngrids_per_device = (ngrids_glob + _num_devices - 1) // _num_devices + ngrids_per_device = (ngrids_glob + num_devices - 1) // num_devices ngrids_per_device = (ngrids_per_device + MIN_BLK_SIZE - 1) // MIN_BLK_SIZE * MIN_BLK_SIZE grid_start = min(device_id * ngrids_per_device, ngrids_glob) grid_end = min((device_id + 1) * ngrids_per_device, ngrids_glob) @@ -1277,8 +1277,8 @@ def nr_uks_fxc(ni, mol, grids, xc_code, dm0=None, dms=None, relativity=0, hermi= futures = [] cupy.cuda.get_current_stream().synchronize() - with ThreadPoolExecutor(max_workers=_num_devices) as executor: - for device_id in range(_num_devices): + with ThreadPoolExecutor(max_workers=num_devices) as executor: + for device_id in range(num_devices): future = executor.submit( _nr_uks_fxc_task, ni, mol, grids, xc_code, fxc, (dma, dmb), mo1, occ_coeff, diff --git a/gpu4pyscf/dft/uks.py b/gpu4pyscf/dft/uks.py index 5e11bb81..4d561e62 100644 --- a/gpu4pyscf/dft/uks.py +++ b/gpu4pyscf/dft/uks.py @@ -16,7 +16,7 @@ from pyscf.dft import uks as uks_cpu from pyscf import lib from gpu4pyscf.lib import logger -from gpu4pyscf.dft import numint, gen_grid, rks +from gpu4pyscf.dft import rks from gpu4pyscf.scf import hf, uhf from gpu4pyscf.lib.cupy_helper import tag_array from gpu4pyscf.lib import utils diff --git a/gpu4pyscf/drivers/basis_vDZP_NWCHEM.dat b/gpu4pyscf/drivers/basis_vDZP_NWCHEM.dat new file mode 100644 index 00000000..1fc10e1e --- /dev/null +++ b/gpu4pyscf/drivers/basis_vDZP_NWCHEM.dat @@ -0,0 +1,2310 @@ +BASIS "ao basis" PRINT +#BASIS SET: +H S + 81.886780875039 0.008423954179 + 12.231063861388 0.064861285350 + 2.786815144183 0.311400883616 + 0.775786677408 0.985308081721 + 0.223433692783 1.256819962883 +H S + 0.331097483644 0.052292300794 + 0.107455350812 0.104139302794 + 0.050680508365 0.245115714360 +H P + 1.417043684193 0.759765848611 + 0.290781406697 1.522844626098 + +#BASIS SET: +He S + 248.304359266256 0.005013791761 + 39.257359859983 0.034983701525 + 9.290242872987 0.162973195617 + 2.650678948299 0.489691016373 + 0.811596267579 0.932899350713 +He S + 0.268607161928 0.222877468588 + 0.345025805948 0.062111112842 + 0.102007122911 0.091342429226 +He P + 1.310041712606 0.696987984442 + 0.265008725379 0.178488628760 + +#BASIS SET: +Li S + 261.504397395816 0.007995837268 + 39.435060612595 0.058753345768 + 8.903002628902 0.251012919032 + 2.312128536738 0.601136930396 + 0.673560740522 0.478266124634 +Li S + 0.637937919385 -0.146472924115 + 0.063047215665 0.821661055674 + 0.020134930908 0.034773514803 +Li S + 
0.029187683675 1.062645883498 + 0.014762824299 1.095509715610 +Li P + 1.607685808951 0.088808339206 + 0.261313614186 0.424066699835 + 0.078704441731 0.737998244728 +Li D + 0.248573704371 0.694743515137 + 0.088415849082 0.788715353455 + +#BASIS SET: +Be S + 510.032398065642 0.005805184625 + 76.405556797429 0.043342046125 + 17.255250508893 0.188908143114 + 4.586010018287 0.469633572220 + 1.356430512009 0.395896576117 +Be S + 1.628923371459 -0.134493067135 + 0.158484974791 0.944737331855 + 0.063902892456 0.171032012077 +Be S + 0.056984898840 0.893658825536 + 0.029154653875 1.463000499223 +Be P + 2.658894134528 0.025302164759 + 0.449379800839 0.144909124946 + 0.113066880133 0.200552018717 +Be D + 0.458082080027 0.881395926545 + 0.118291023217 0.993425036644 + +#BASIS SET: +B S + 1.548610750968 -0.375453530717 + 1.230872527218 0.296383194735 + 0.288936160855 0.400281132411 + 0.105945534640 0.252339650517 +B S + 0.149430266146 0.631907262941 + 0.046078378647 0.717819753504 +B P + 6.824625642150 0.032055054798 + 1.786841467886 0.168629590401 + 0.529445685476 0.508222453209 + 0.171288703178 0.449200455124 +B P + 0.137855130395 1.122762891963 + 0.045638926887 1.688087920718 +B D + 0.685952928321 1.202317712080 + 0.217177410256 1.378136829807 + +#BASIS SET: +C S + 2.174987790335 -0.276058910823 + 1.852226946510 0.240154751118 + 0.459098487623 0.191124862153 + 0.169128986465 0.087294467451 +C S + 0.227455022611 0.987938126698 + 0.086918590957 1.016650788977 +C P + 12.870085333684 0.024741438180 + 3.295443583046 0.152192079858 + 0.976645717756 0.448674298135 + 0.348425197118 0.494756938315 +C P + 0.146489481077 1.594415811311 + 0.063733914289 0.807859142831 +C D + 1.025317925428 0.833187865353 + 0.250249064514 0.858263567336 + +#BASIS SET: +N S + 2.735378582931 -0.239593796561 + 2.219800598729 0.224989693361 + 0.592708671545 0.194517761229 + 0.227406999886 0.098125575699 +N S + 0.280435456826 0.853515477480 + 0.105373393767 1.130727240507 +N P + 19.104528476156 0.011792637499 + 4.831013846305 0.072900287221 + 1.454969505940 0.205542762778 + 0.494779046501 0.258358206618 +N P + 0.178530326117 1.242146697946 + 0.068618652214 0.429051028293 +N D + 1.230812954761 1.261894149963 + 0.370851969193 1.181659566926 + +#BASIS SET: +O S + 3.543641820586 -0.263554292988 + 2.712717475497 0.265678831423 + 0.683184525058 0.279087292612 + 0.255976739185 0.112868556239 +O S + 0.286500025367 1.027594577324 + 0.123012019465 1.159166516125 +O P + 27.003841820377 0.012470154545 + 6.762654966183 0.074520776234 + 2.011704598628 0.210079890981 + 0.634428348949 0.291319728722 +O P + 0.199067660399 1.512413659527 + 0.065643622039 0.312634979070 +O D + 1.366109340367 0.858294230859 + 0.461931641275 0.846034666146 + +#BASIS SET: +F S + 7.034861151373 -0.446046417537 + 4.872932295917 0.241120382569 + 1.330878323026 0.666593307204 + 0.525469722669 0.781917161284 +F S + 0.821266047563 0.135544243991 + 0.198341185022 0.993554010066 +F P + 35.526100039768 0.026818249901 + 9.093570542569 0.133596681058 + 2.685413226386 0.376326355144 + 0.833678395080 0.516681452384 +F P + 0.262279770686 0.931221931175 + 0.082396241151 0.200669872602 +F D + 1.454912451083 1.369428727075 + 0.446498292031 1.466564035452 + +#BASIS SET: +Ne S + 8.511025142406 -0.346407731370 + 6.944709652311 0.242428346755 + 1.561041650912 0.456810607744 + 0.599496632788 0.413991739819 +Ne S + 0.609511035171 0.175668695012 + 0.245683034081 0.854892929924 +Ne P + 56.111190785374 0.018617197509 + 14.542058717068 0.119964511456 + 4.468992837375 0.351570092625 + 1.504786372395 
0.552758556945 +Ne P + 0.498808346618 0.959617655183 + 0.156283251576 0.293739815995 +Ne D + 1.678877049768 0.562501368964 + 0.416060290014 1.450686746125 + +#BASIS SET: +Na S + 17.859469979814 -0.094639395365 + 1.741589491951 0.454368094759 + 0.671216662419 0.326110219634 + 0.356748679884 0.099733348288 +Na S + 1.040532238619 1.128928459961 + 0.363586481174 0.971270254416 + 0.030788669980 0.725021277866 +Na S + 0.063388594723 0.461074706312 + 0.027886589957 0.865217967753 +Na P + 94.408900907741 0.022285520404 + 22.833035343911 0.141094814336 + 7.392308589988 0.456452935832 + 2.700481523814 0.811931701139 + 0.980041469729 0.865637159210 +Na P + 2.667291528209 -0.005824466879 + 0.336960843152 0.524769342105 + 0.055297986191 0.011767234747 +Na D + 0.317736281462 0.216305744638 + 0.050399448912 1.084520528356 + +#BASIS SET: +Mg S + 21.746078000863 -0.107712415417 + 2.054693963682 0.605039945129 + 0.775378201881 0.358253218691 + 0.646755053402 0.124456587448 +Mg S + 1.518035874065 0.953325303523 + 0.626648472409 0.885052846709 + 0.061342469056 0.663200597406 +Mg S + 0.114525781742 1.102821153701 + 0.049143798668 1.262827197035 +Mg P + 98.149921577771 0.020030532297 + 22.986901774625 0.130645961601 + 7.073704413314 0.416305903439 + 2.357463360333 0.664621574936 + 0.776737081590 0.460953274264 +Mg P + 3.771301810954 -0.008032934562 + 0.189166874642 0.340369688597 + 0.068419879293 0.260127856330 +Mg D + 0.401806625040 0.846622892966 + 0.102441134001 1.533893458570 + +#BASIS SET: +Al S + 3.307057950862 0.035887679384 + 1.064990666611 -0.301434323135 + 0.217094967607 0.408003674147 + 0.103399050795 0.504944565236 +Al S + 0.091326549223 0.267250123332 + 0.040384684710 0.424093785538 +Al P + 0.966979219663 -0.093593510129 + 0.418602698432 0.186041911562 + 0.179016462009 0.364571859325 + 0.092774532026 0.270083093687 +Al P + 0.101275278069 0.562522659034 + 0.032903691927 0.829515939897 +Al D + 0.653520505846 0.545097705481 + 0.177964130832 1.149228250111 + +#BASIS SET: +Si S + 4.206061763818 0.036046100379 + 1.379029753670 -0.278621659842 + 0.265740193555 0.514032304554 + 0.112502697537 0.365777551979 +Si S + 0.133673616794 0.251860678646 + 0.048297859440 1.001912217231 +Si P + 1.367505459412 -0.045311794170 + 0.391314943212 0.288581385277 + 0.158228930721 0.372264544458 + 0.068937667225 0.100776575609 +Si P + 0.117817448223 0.511784304415 + 0.041067655925 0.790699182240 +Si D + 1.229803285857 0.317490526212 + 0.304062845855 1.160802102368 + +#BASIS SET: +P S + 7.492690912960 0.036211256938 + 1.520979692504 -0.410346524622 + 0.389986469517 0.590431160497 + 0.185944246896 0.700026469133 +P S + 0.426277431342 0.066946209604 + 0.075641155401 0.662868977638 +P P + 1.182449802418 -0.070708334586 + 0.716815122233 0.179798137437 + 0.284356002842 0.347185537499 + 0.119844662929 0.186156103439 +P P + 0.104174736899 0.642173678179 + 0.042996540650 0.592979489332 +P D + 1.695162960423 0.257470657436 + 0.351822528642 1.089094143136 + +#BASIS SET: +S S + 7.601355269472 0.046444664335 + 1.927121218723 -0.464421807155 + 0.521456307465 0.575890670867 + 0.230532240510 0.826584502527 +S S + 0.470005347879 0.136072106019 + 0.083469210422 1.019121415962 +S P + 1.711639257837 -0.172150677616 + 1.157276160172 0.256510381181 + 0.449727077619 0.514122297269 + 0.195290728699 0.449898131647 +S P + 0.121042410856 0.700057493662 + 0.061043915320 0.824465481915 +S D + 1.777751118268 0.301983328801 + 0.404467069554 1.098792945928 + +#BASIS SET: +Cl S + 14.151131413181 0.017603655830 + 2.057697743755 -0.379080686343 + 
0.825720874850 0.374023499125 + 0.302442901804 0.721003299953 +Cl S + 0.514888452515 0.170089201402 + 0.098706322995 0.945113577067 +Cl P + 2.844174271253 -0.135762495160 + 1.377515628579 0.223435753709 + 0.619123933124 0.728151126508 + 0.261922095680 0.819017057903 +Cl P + 0.126421788724 0.990717918733 + 0.073988046008 0.696607999630 +Cl D + 1.922210814739 0.588629611664 + 0.428318736672 1.937851927117 + +#BASIS SET: +Ar S + 14.940101122322 0.011521861228 + 2.243165636285 -0.558968080854 + 1.438970926578 0.473342341500 + 0.406876486290 0.631293265007 +Ar S + 0.440259652763 0.069249308310 + 0.161875873542 1.603023542921 +Ar P + 3.165484511165 -0.077261666511 + 1.111111719803 0.471181733030 + 0.414649437581 0.751059102315 + 0.162616494311 0.223201162944 +Ar P + 0.154347795709 1.472564774171 + 0.071270891501 0.581126940969 +Ar D + 1.467813037801 0.644461354176 + 0.399396403391 1.401420583174 + +#BASIS SET: +K S + 2.712682131864 -0.138120511191 + 0.959527159237 0.183647787140 + 0.415861396286 0.054504151428 + 0.222428317820 0.079993072198 +K S + 0.745830108700 -0.013684884521 + 0.447736522029 0.507569624842 + 0.027242967193 0.912536205404 +K S + 0.032660680703 0.739672700474 + 0.024351810227 1.104288980008 +K P + 10.065499144539 -0.038491537428 + 0.974019481043 0.674332393862 + 0.374596372255 0.612538235871 + 0.156921520903 0.172520943233 +K P + 4.042012088018 -0.007059823100 + 0.087259092608 0.077806374422 + 0.054714257801 0.688537533778 +K D + 0.866638592894 0.502040108967 + 0.157085997854 1.055271878444 + +#BASIS SET: +Ca S + 2.719301351535 -0.078978644821 + 1.883275651368 0.187115863443 + 0.642163903598 0.589578641721 + 0.336487867317 0.370804234632 +Ca S + 3.026324320338 -0.233621817613 + 1.242283226509 0.166011018416 + 0.053488348283 0.035851055675 +Ca S + 0.058373367494 1.094426980400 + 0.029921642326 0.431156605007 +Ca P + 10.411513039603 -0.021761105222 + 1.181201852942 0.307819154635 + 0.472806545197 0.269256985339 + 0.206363671314 0.068406701922 +Ca P + 1.103963525530 -0.009366778560 + 0.092210626014 0.588635174998 + 0.007655059125 0.028655018892 +Ca D + 1.466572349546 0.333010877274 + 0.261547289431 0.740339846733 + +#BASIS SET: +Sc S + 12.894521140222 0.079236996463 + 5.071686274181 -0.458078625675 + 1.167181899577 0.336412876746 + 0.663942013228 0.332736377800 + 0.309654579707 0.211456915306 +Sc S + 1.040885767796 0.463978291008 + 0.568097001649 0.435148577227 +Sc S + 0.070617692779 1.482515914675 + 0.027248406219 0.639348111708 +Sc P + 6.368578496263 -0.093545873727 + 1.949004097442 0.341238050163 + 0.815301694561 0.601180009390 + 0.313815959827 0.269603495756 +Sc P + 0.037710430517 0.936037163888 + 0.026841704778 1.288316160280 +Sc D + 11.687225911271 0.048657783353 + 3.314758379726 0.202225028823 + 1.046336738397 0.396787231764 + 0.313968073144 0.458512682004 +Sc D + 0.096369535224 0.705041436498 + 0.031054056783 0.578360893303 + +#BASIS SET: +Ti S + 16.461347886752 0.082016774935 + 5.371386815523 -0.650480440851 + 1.448985756914 0.538706537935 + 0.638919559348 0.450440953157 + 0.241619151339 0.154995866347 +Ti S + 1.051736442516 0.627756794746 + 0.589928288257 0.487164574898 +Ti S + 0.075250745810 1.663780527189 + 0.026602025230 0.399689063950 +Ti P + 7.122429655502 -0.037873691723 + 2.383153836626 0.114975280753 + 1.028878547766 0.221233308088 + 0.416069961536 0.117542200112 +Ti P + 0.179440719634 1.269893747125 + 0.076704284265 0.585246578780 +Ti D + 16.285523097231 0.057717359677 + 4.903186429987 0.253388428709 + 1.644289014934 0.545862699555 + 0.525281547715 
0.674601799858 +Ti D + 0.170096639197 1.166232841900 + 0.064103525267 0.622642131383 + +#BASIS SET: +V S + 16.255319265212 0.094023708983 + 6.405063127696 -0.537187056948 + 1.457163624962 0.520258926643 + 0.499926424667 0.122520741372 + 0.301185417021 0.072736674208 +V S + 1.012605416044 1.098550324678 + 0.547886952925 0.607303361815 +V S + 0.090933679299 1.595189552198 + 0.036744292616 0.948508696780 +V P + 8.103098572263 -0.081234318416 + 2.629836559131 0.267004275831 + 1.112202390392 0.485982458150 + 0.429812510937 0.238017396511 +V P + 0.134170344208 0.865319211565 + 0.045541731704 1.001512124222 +V D + 20.991852147597 0.047940709014 + 6.482973795152 0.218068368478 + 2.259607977658 0.483008624822 + 0.787388461330 0.596005568614 +V D + 0.266299635377 0.873322009295 + 0.085848775242 0.412455368037 + +#BASIS SET: +Cr S + 19.326807872923 0.115928746519 + 6.895352080094 -0.789981589302 + 1.869640267141 0.563707338990 + 0.843806445546 0.445197267976 + 0.311586877483 0.070648620143 +Cr S + 1.226144167110 1.430412693717 + 0.536784361672 0.817200561042 +Cr S + 0.103687455191 1.278362236512 + 0.040453943570 0.998838137196 +Cr P + 9.274973304564 -0.067626882136 + 2.834803217107 0.248491248438 + 1.188671904758 0.418309250990 + 0.452672452476 0.190643237960 +Cr P + 0.099943441911 0.572805817456 + 0.020690649882 0.829846109272 +Cr D + 25.343518443315 0.043409250189 + 7.837194674080 0.208571751452 + 2.703730990740 0.481280315878 + 0.923329332261 0.600167729446 +Cr D + 0.301729037231 1.148719306493 + 0.095659118228 0.610428134382 + +#BASIS SET: +Mn S + 20.843228079998 0.103402377154 + 7.760419419145 -0.656141637759 + 1.904516413545 0.665798788078 + 1.001320433899 0.396522428503 + 0.481937472515 0.292024154733 +Mn S + 1.268268420404 1.565497038368 + 0.857241213876 1.201323188808 +Mn S + 0.109140028793 1.457741873878 + 0.045318992446 1.357680548185 +Mn P + 10.512436868143 -0.097556999181 + 3.099073135769 0.375169172686 + 1.322948831985 0.597227851646 + 0.527055310067 0.279726392464 +Mn P + 0.344507060257 0.623790512345 + 0.118361361298 1.250852309631 +Mn D + 31.818227456997 0.047047726486 + 10.037421666655 0.238358793520 + 3.534516417893 0.579576958050 + 1.249761773374 0.755472666211 +Mn D + 0.419663812162 1.107456649217 + 0.131489658435 0.625907283697 + +#BASIS SET: +Fe S + 22.358655151183 0.092871540367 + 8.726722256050 -0.543175687899 + 1.883511346045 0.788008191373 + 0.898098829763 0.506118339909 + 0.499141495354 0.151467937894 +Fe S + 1.414672733116 1.048090878533 + 0.838969961703 0.701944708617 +Fe S + 0.126967189900 1.110584663987 + 0.048297686989 0.960183114424 +Fe P + 11.737335796504 -0.087457071524 + 3.491740558354 0.321194930339 + 1.519670314056 0.525914230497 + 0.612230245037 0.263690954761 +Fe P + 0.270740512750 0.802890646298 + 0.110956520505 0.614971889314 +Fe D + 37.602436513551 0.035788062793 + 11.980469021430 0.185256836504 + 4.315862847956 0.455518034427 + 1.583817661861 0.610355600220 +Fe D + 0.548357256279 1.544966076786 + 0.171477360605 0.810476849978 + +#BASIS SET: +Co S + 25.335315694476 0.087766855961 + 9.488355054778 -0.550692716629 + 2.105770866101 0.731208569241 + 1.152741563097 0.435591231225 + 0.600134651908 0.260972326852 +Co S + 1.728593048399 0.743738752336 + 0.965135408786 0.944149972951 +Co S + 0.138217537887 1.152256178711 + 0.048799619525 1.322788514015 +Co P + 13.019846825599 -0.106687768822 + 3.898569128239 0.379287211199 + 1.691912041740 0.646070467691 + 0.663194327000 0.321080022842 +Co P + 0.266233736271 0.516713551942 + 0.087682602286 0.610544822757 +Co D + 
46.467186502091 0.036738936657 + 14.844727962964 0.206400986919 + 5.324781910313 0.545964703962 + 1.946193019964 0.772313951631 +Co D + 0.663697766556 1.764841258136 + 0.197538703092 0.981959446237 + +#BASIS SET: +Ni S + 28.117307328944 0.095054204041 + 10.323114146207 -0.619352378957 + 2.266501334743 0.903046067553 + 1.031698338004 0.572669971934 + 0.429763116477 0.107936112138 +Ni S + 1.585096116449 0.536471821182 + 0.927688425987 0.466470527797 +Ni S + 0.142723981700 1.052935132680 + 0.052288562321 0.852229071228 +Ni P + 14.649263427379 -0.123572234826 + 4.487248555279 0.373049398485 + 1.926522201916 0.666618694496 + 0.755571182364 0.334357692295 +Ni P + 0.320923338030 1.109796900506 + 0.124597438762 1.063986125250 +Ni D + 55.795596873330 0.032016254993 + 17.671713442822 0.206558524336 + 6.458884370599 0.576880736568 + 2.375449580314 0.853632978049 +Ni D + 0.808597906774 1.577004983294 + 0.239256209431 0.904057266556 + +#BASIS SET: +Cu S + 30.095654464774 0.088111002450 + 11.189465415118 -0.566864618289 + 2.541452118390 0.690601613017 + 1.290510764029 0.526642987015 + 0.608961815658 0.132227842362 +Cu S + 2.018430478515 0.946194744344 + 0.849138974882 0.961852815883 +Cu S + 0.155609251028 1.205505591180 + 0.055100263916 1.040275581217 +Cu P + 16.197916896934 -0.105945973220 + 4.487908313373 0.425417913649 + 1.928366077649 0.642632779569 + 0.763943236555 0.292317921669 +Cu P + 0.299840227632 1.292856821394 + 0.116773403393 1.061205571547 +Cu D + 60.487297938910 0.032026327350 + 19.233046405660 0.186974917418 + 6.965090689821 0.519387752985 + 2.568903372512 0.762425819652 +Cu D + 0.875809628134 0.877877663513 + 0.257690509171 0.512991728701 + +#BASIS SET: +Zn S + 33.874473714375 0.071277616937 + 11.972622956367 -0.506758729754 + 2.961714395977 0.493378998699 + 1.829981912490 0.307312140366 + 0.870554735732 0.249108397868 +Zn S + 1.739203860675 1.189126584032 + 0.784958106810 0.340497149689 +Zn S + 0.173777482831 1.056166866705 + 0.058936157612 0.875584102608 +Zn P + 18.428852736536 -0.072783254370 + 4.757828516192 0.320487006186 + 2.051504911103 0.464955123397 + 0.811290090160 0.202072427695 +Zn P + 0.280329527429 0.290680713435 + 0.074967629409 0.340832692654 +Zn D + 69.345783920227 0.038894475768 + 22.344643463037 0.225936760009 + 8.230073746758 0.625199117295 + 3.129607637648 0.942430315944 +Zn D + 1.115445344238 1.300039063947 + 0.350438433853 0.725428578056 + +#BASIS SET: +Ga S + 3.461851099712 0.086596634698 + 1.673374271825 -0.270587362691 + 0.211318368014 0.351819394013 + 0.097828153295 0.285416686844 +Ga S + 0.151627010733 0.297902650076 + 0.055604376270 0.811459212326 +Ga P + 1.224072196918 -0.142334247355 + 0.570340687021 0.156300957280 + 0.176342181917 0.414233473261 + 0.069338127059 0.187202564348 +Ga P + 0.103202148668 0.781680200651 + 0.032521362068 1.139904102344 +Ga D + 0.434136758705 0.630724293037 + 0.136436950793 1.735376440057 + +#BASIS SET: +Ge S + 3.217496101688 0.220957013438 + 1.918166572642 -0.511683753720 + 0.214903864260 0.831622355013 + 0.067589930320 0.205018958674 +Ge S + 0.465696258761 0.242715316187 + 0.060174025120 1.570350456114 +Ge P + 3.914179557047 0.071448877544 + 2.346468630555 -0.196014253790 + 0.273861111944 0.722477976627 + 0.105990071273 0.319809081571 +Ge P + 0.202135065169 0.219317596243 + 0.061214687117 1.973742824912 +Ge D + 0.370009633120 1.071258503403 + 0.132976201398 1.060360506723 + +#BASIS SET: +As S + 3.507693642950 0.158031853980 + 1.888383025988 -0.490805795974 + 0.288111240456 0.729540893159 + 0.143180088068 0.326673579759 +As S + 
0.352438514226 0.168792374132 + 0.073126498505 1.058779710836 +As P + 1.358255409051 -0.165061038563 + 0.934647702814 0.154119568450 + 0.292654437651 0.269752770917 + 0.122927318606 0.174509228238 +As P + 0.118767537140 0.878680997698 + 0.047605103697 1.162342892188 +As D + 0.314487938869 0.831547539129 + 0.218493965228 1.033423867748 + +#BASIS SET: +Se S + 3.650865730594 0.231457444968 + 2.210903609188 -0.563533855155 + 0.335170928150 0.729862073845 + 0.151638836734 0.315657165245 +Se S + 0.599638822173 0.126025157342 + 0.075582134264 1.069252579041 +Se P + 1.507148257439 -0.162605422408 + 0.913367414636 0.183499918283 + 0.341221978698 0.372235321531 + 0.150432587271 0.271959003606 +Se P + 0.157082854490 0.530816305712 + 0.061272426885 1.598391183858 +Se D + 0.367597502630 1.060909735181 + 0.223013187434 0.893014584292 + +#BASIS SET: +Br S + 4.214014635008 0.165692028470 + 2.412188698594 -0.476877193218 + 0.391067027010 0.710299602030 + 0.164674252704 0.362093036361 +Br S + 0.485599664697 0.145825877593 + 0.074598994605 0.768087358690 +Br P + 1.789947253144 -0.207356828201 + 1.292036187413 0.186107969204 + 0.449383252160 0.332524333195 + 0.194590167329 0.303778721054 +Br P + 0.122759759967 0.804134734196 + 0.066738625731 1.319919526700 +Br D + 0.442438146643 1.158672768266 + 0.245849018795 1.021366227819 + +#BASIS SET: +Kr S + 3.986511440636 0.290559957024 + 2.943901744792 -0.516111029924 + 0.429901866727 0.564971586700 + 0.151821838900 0.272351694911 +Kr S + 0.334711812968 0.193054210335 + 0.111745168911 0.897546361771 +Kr P + 2.153957686193 -0.178565423896 + 1.283748685547 0.172715443611 + 0.501685561731 0.400700715929 + 0.237520267977 0.316528198613 +Kr P + 0.140113161172 0.940357138145 + 0.089204908642 0.846922224775 +Kr D + 0.526301285310 1.174418328612 + 0.203100266787 0.975302884833 + +#BASIS SET: +Rb S + 3.869999599846 0.122319888582 + 2.130904636104 -0.666846831080 + 0.738002026783 0.418046906087 + 0.409528742839 0.516121898223 +Rb S + 1.350580726758 0.441228993164 + 0.672123461407 -0.180183191143 + 0.215026040193 0.940988897210 +Rb S + 0.041727573698 0.194823232373 + 0.018173085367 0.240784908108 +Rb P + 2.767410417511 -0.100991461056 + 0.690279272515 0.523102112330 + 0.272134686488 0.509586156257 + 0.099861314223 0.105281356899 +Rb P + 0.483868425918 -0.021967319393 + 0.064990339093 0.192198563244 + 0.026344167036 0.254778803087 +Rb D + 0.482952908592 0.814918945654 + 0.118599680126 1.160272481701 + +#BASIS SET: +Sr S + 1.692706637459 -0.393341748067 + 1.149795785772 0.208407810148 + 0.669131477932 0.410560473766 + 0.248220076725 0.223568940036 +Sr S + 1.886266511407 0.403235193588 + 0.400164390156 -0.077244774899 + 0.087928617203 0.264020515846 +Sr S + 0.059443196810 1.173409533858 + 0.028584686102 1.214114711178 +Sr P + 2.806647699305 -0.059210927424 + 0.819376524702 0.263531483008 + 0.348158267797 0.253839198083 + 0.146003646962 0.054726018823 +Sr P + 0.935150498254 -0.009917943921 + 0.087786356974 0.731454921275 + 0.045420993133 0.227916283507 +Sr D + 0.688406352900 0.490903537712 + 0.192704647015 0.763662684452 + +#BASIS SET: +Y S + 7.296549891946 0.195341789784 + 2.688422185597 -1.838359171848 + 2.177642572388 1.023950898895 + 0.659090683583 0.677612516206 + 0.318000313098 0.286803253760 +Y S + 0.674812957467 0.585665285488 + 0.290353362618 0.292072052074 +Y S + 0.061667823748 1.471139784374 + 0.025673512214 0.434089285930 +Y P + 2.413073905893 -0.896109901350 + 1.991118335445 0.920914883211 + 0.644885028991 0.747249800994 + 0.262158149515 0.358967578245 +Y P + 
0.090251834443 1.587694599781 + 0.038734540875 1.124989169808 +Y D + 2.640667082288 -0.051410329462 + 1.377231693280 0.259691609738 + 0.521170303505 0.647670134977 + 0.191317945677 0.722772032171 +Y D + 0.066561171952 0.635575534061 + 0.021058714013 0.213198852821 + +#BASIS SET: +Zr S + 7.636755459435 0.225594387816 + 3.059460604715 -2.021770934596 + 2.734289469745 1.043987761524 + 0.974030529831 0.649481513646 + 0.321237768185 0.271978542754 +Zr S + 0.666118313328 0.610098372668 + 0.382004929999 0.279366801720 +Zr S + 0.078638985037 1.573000069276 + 0.030977707836 0.591169478729 +Zr P + 2.635304536716 -0.851203628339 + 2.139701672965 0.892120469003 + 0.701270261887 0.710477753317 + 0.294117575826 0.292528213505 +Zr P + 0.153963461038 1.438560009408 + 0.088262101191 0.995725387494 +Zr D + 3.201047577661 -0.038892498436 + 1.485048727987 0.275692468196 + 0.559574253517 0.646436509544 + 0.196618868868 0.666536113423 +Zr D + 0.062451815401 0.649948004351 + 0.020325928756 0.194809845607 + +#BASIS SET: +Nb S + 8.226907672370 0.173869214563 + 5.258853938239 0.400716421611 + 4.024909003000 -1.292442883143 + 1.195129963229 0.474286872382 + 0.359665734464 0.272918985707 +Nb S + 0.780972111614 0.609456624827 + 0.468149789914 0.309910528963 +Nb S + 0.084330299999 1.270848051253 + 0.034918666624 0.591189631131 +Nb P + 2.871351590001 -0.873217473927 + 2.280935596682 0.940543482261 + 0.743551490746 0.760237258354 + 0.298069250267 0.269556485513 +Nb P + 0.152595605504 1.401687661947 + 0.068327454661 1.010582696646 +Nb D + 3.367880306345 -0.051672605872 + 1.756564781302 0.284516251789 + 0.658735017693 0.664232933582 + 0.231059396397 0.658825321794 +Nb D + 0.072216226374 0.696966749733 + 0.024801099920 0.196624147646 + +#BASIS SET: +Mo S + 9.673752512870 0.293639539791 + 3.572282146733 -1.857614871108 + 1.530849754894 1.224864128516 + 0.631996839839 0.660456106153 + 0.288839329473 0.296153352239 +Mo S + 0.778417996936 0.603632414429 + 0.534172652813 0.301871306040 +Mo S + 0.087626525151 1.317087783672 + 0.033562681859 0.580743179363 +Mo P + 3.093707673366 -0.826631577474 + 2.403641873352 0.900674860790 + 0.818240314903 0.711828517694 + 0.336205879987 0.269489929318 +Mo P + 0.141936162003 1.303129589177 + 0.051142057872 1.262881655764 +Mo D + 4.248706637709 -0.056047421983 + 2.043076701133 0.260482078879 + 0.819219599195 0.645891778457 + 0.300623287176 0.680758170713 +Mo D + 0.099924939859 0.934167552489 + 0.033837585352 0.225962855556 + +#BASIS SET: +Tc S + 8.734488231535 0.191829141563 + 3.835899095068 -2.105775116524 + 3.469751744685 1.518466647586 + 0.967124516165 0.732292685600 + 0.424182603993 0.274139633354 +Tc S + 1.003924527229 0.601613332020 + 0.543013116325 0.295145009859 +Tc S + 0.088094236993 1.301256535828 + 0.033682774860 0.537823653934 +Tc P + 3.405023984944 -0.854982996734 + 2.564265074003 0.959319009575 + 0.863581625172 0.828846615206 + 0.337411029995 0.255144526070 +Tc P + 0.091795517667 1.101466987783 + 0.029879282774 1.937115484620 +Tc D + 5.097732110758 -0.054667679941 + 2.187138333162 0.264182410092 + 0.976624475913 0.623831001407 + 0.396051934959 0.663361650807 +Tc D + 0.148110040542 1.029802355721 + 0.059688691595 0.261393560258 + +#BASIS SET: +Ru S + 9.696500481430 0.176368995944 + 3.986490207677 -1.981311830052 + 3.507270554217 1.413770781341 + 1.044398638037 0.657254090853 + 0.448751919987 0.252393374138 +Ru S + 1.064403045557 0.566761648709 + 0.627779982819 0.302811587640 +Ru S + 0.101793183953 1.334893817716 + 0.034776411982 0.605967167434 +Ru P + 3.763404295092 -0.859444471047 
+ 2.770872457961 0.955526231776 + 0.988572431005 0.893940245276 + 0.408597854458 0.334564018189 +Ru P + 0.187687004290 1.566749185235 + 0.051504739627 0.913525241281 +Ru D + 5.550032090322 -0.057846761038 + 2.446355819382 0.253638478653 + 1.073342025516 0.583490878354 + 0.429389404890 0.589592130440 +Ru D + 0.165052645167 1.116296297183 + 0.068844897289 0.265257106223 + +#BASIS SET: +Rh S + 10.612277159968 0.168503102460 + 4.175711357562 -1.847338836606 + 3.540198264706 1.311067350155 + 1.054712457574 0.697993868303 + 0.463476941827 0.185865531158 +Rh S + 1.120987885596 0.630953703067 + 0.505354440197 0.263165934158 +Rh S + 0.107799055751 0.895058309015 + 0.035593708316 0.457714131915 +Rh P + 3.915762760107 -1.155445931450 + 3.214946272686 1.203030672069 + 1.138390014316 0.792074916425 + 0.483900206512 0.361955969996 +Rh P + 0.173801790703 1.467776545715 + 0.058955083779 1.888699833975 +Rh D + 6.396827458805 -0.048409300225 + 2.908324768342 0.267035530162 + 1.367666120621 0.694020583366 + 0.573074572998 0.792961890374 +Rh D + 0.215417984872 1.166151849377 + 0.074694475620 0.264937531216 + +#BASIS SET: +Pd S + 11.260042205230 0.164097577056 + 4.447554407937 -1.866722502527 + 3.798950429913 1.359736862671 + 1.125191239801 0.565530046178 + 0.452556029260 0.188772887643 +Pd S + 1.151505961507 0.944652624868 + 0.758022887254 0.204351049313 +Pd S + 0.114341106908 0.954437219940 + 0.041622071605 0.471208585065 +Pd P + 4.280202330909 -1.177377340534 + 3.258393937931 1.288735738708 + 1.165454990282 1.060685702915 + 0.488221099390 0.409953870721 +Pd P + 0.201275394801 1.775717120960 + 0.057053584256 1.173935561093 +Pd D + 6.939335039037 -0.045626149466 + 3.004130610160 0.311839410976 + 1.365590049615 0.718055557380 + 0.570518822592 0.709739345643 +Pd D + 0.235211306335 0.761544637626 + 0.098548614743 0.252650801917 + +#BASIS SET: +Ag S + 11.137380185754 0.170113394090 + 4.989686293940 -1.887735459567 + 4.475920194052 1.434356812706 + 1.201486343175 0.868528612669 + 0.510392667886 0.192122601443 +Ag S + 1.168865294473 0.981239664457 + 0.703892508518 0.301221440276 +Ag S + 0.109374172855 1.053993979072 + 0.033776584003 0.411521983508 +Ag P + 4.556372684610 -1.202933355935 + 3.513166261338 1.313296588850 + 1.244794863986 1.036312910226 + 0.512047738553 0.383294270449 +Ag P + 0.176416800129 0.973703560128 + 0.052372032836 0.912910487489 +Ag D + 7.446453391764 -0.045685848836 + 3.255927976164 0.334321217612 + 1.458177385755 0.759037145474 + 0.596305718479 0.729668743368 +Ag D + 0.229825513509 1.458004453308 + 0.090151969871 0.269101010696 + +#BASIS SET: +Cd S + 12.017244504805 0.169339966251 + 5.114217825559 -1.898543443118 + 4.446940115430 1.450853794865 + 1.216628349228 0.933523984483 + 0.532052126747 0.193968463984 +Cd S + 1.055105987411 1.272808254083 + 0.604148329306 0.229142637869 +Cd S + 0.146383394326 0.696754211201 + 0.047046479439 0.358960905929 +Cd P + 4.757233246634 -1.076910687488 + 3.867372408050 1.151779729705 + 1.358823077137 0.721432137820 + 0.577098779341 0.278790097257 +Cd P + 0.182360815830 0.804988419982 + 0.062419275984 0.598193345980 +Cd D + 8.110020059042 -0.025222412807 + 3.451726768569 0.211885297280 + 1.581708281795 0.447542350487 + 0.673407975664 0.418668166040 +Cd D + 0.272634468912 1.116448847886 + 0.113339867322 0.175327136751 + +#BASIS SET: +In S + 1.424753204835 0.195216369564 + 0.967776518278 -0.390335918373 + 0.189316580236 0.332066512393 + 0.069682986693 0.481912204467 +In S + 0.262064023539 0.051928774476 + 0.064458770986 0.801906317169 +In P + 1.810449027147 
0.061344922403 + 1.046643496682 -0.193157129825 + 0.184987578096 0.458390096608 + 0.071861619824 0.194158130930 +In P + 0.138098170856 0.104631733864 + 0.041247580889 0.868493244912 +In D + 0.138242005138 0.573395637068 + 0.079777300441 1.016219806155 + +#BASIS SET: +Sn S + 2.401062078850 0.149527197103 + 1.153316115528 -0.502781332360 + 0.226914764388 0.515242159192 + 0.110037948497 0.383620829459 +Sn S + 0.317225487215 0.157353145385 + 0.061053997178 0.982405762325 +Sn P + 2.566898425488 0.055051200864 + 1.499260435985 -0.169738769858 + 0.227034747645 0.493183210725 + 0.092179498934 0.327033786313 +Sn P + 0.173407347066 0.225606877021 + 0.048674549711 2.065300464132 +Sn D + 0.241102386017 1.101090309967 + 0.116551335998 1.474465735400 + +#BASIS SET: +Sb S + 1.901291496434 0.372390986717 + 1.449475120294 -0.589821664528 + 0.255179663388 0.238020525012 + 0.159557301375 0.282602074546 +Sb S + 0.355114126591 0.126701793951 + 0.067393141859 1.046620800528 +Sb P + 2.375689851709 0.051122584238 + 1.348062774398 -0.161075779289 + 0.243744636819 0.405765152732 + 0.107166990009 0.186486532978 +Sb P + 0.205419448722 0.140291569705 + 0.062916291344 1.926934328503 +Sb D + 0.205284248872 1.923448831425 + 0.161194685111 1.084075276779 + +#BASIS SET: +Te S + 2.083514130685 0.802285578371 + 1.714507901604 -1.147432002317 + 0.243721857039 0.709372799498 + 0.130681370450 0.126965084683 +Te S + 0.315794405079 0.190600374708 + 0.088221331240 0.836704224185 +Te P + 2.481012810644 0.073901692051 + 1.399169205477 -0.238783627662 + 0.229946308236 0.340455500866 + 0.127434638298 0.247951299805 +Te P + 0.503816419435 0.919676716095 + 0.056809623374 0.797571264542 +Te D + 0.192018182978 1.571397655653 + 0.064437864854 0.969777509197 + +#BASIS SET: +I S + 2.467637755918 0.270191039383 + 1.696928162357 -0.514853581902 + 0.298836215328 0.446031974526 + 0.139199227031 0.193993291432 +I S + 0.241437612793 0.214060325418 + 0.078209906425 0.853326113661 +I P + 0.954417866049 -0.267890449748 + 0.663589859519 0.294307194920 + 0.236750134212 0.246021979653 + 0.130584739507 0.138773377806 +I P + 0.229168242463 0.063105849732 + 0.067220884629 2.030056520658 +I D + 0.327941623565 0.159626880427 + 0.228514545922 1.064910731508 + +#BASIS SET: +Xe S + 2.755095924827 0.249896927023 + 1.738312366995 -0.537631660732 + 0.358046650467 0.489101738862 + 0.142124603625 0.268006037727 +Xe S + 0.275478850221 0.193322700350 + 0.100907057598 0.887859271842 +Xe P + 1.117839719300 -0.276367079429 + 0.691267910074 0.323178756010 + 0.279267713061 0.298358628779 + 0.155955498860 0.226885229369 +Xe P + 0.265352026982 0.067336765896 + 0.075892744312 1.830243781388 +Xe D + 0.402780208511 0.161378596156 + 0.243632826926 1.043744835741 + +#BASIS SET: +Cs S + 2.172433323343 0.093944897136 + 1.213559429867 -0.574876851884 + 0.913074181038 0.380293350663 + 0.313880498180 0.275539125878 +Cs S + 1.294434390251 -0.246882008315 + 1.088756263514 0.352493537018 + 0.141990201906 1.013884420396 +Cs S + 0.110486818629 0.142948191575 + 0.021036226155 0.289955177474 +Cs P + 1.296612062813 -0.158205270387 + 0.511587649506 0.411480813114 + 0.201550092329 0.414262558281 + 0.065338478075 0.072302175823 +Cs P + 0.299675722332 -0.097355523294 + 0.063470843305 0.070064297558 + 0.029363753532 0.660221576187 +Cs D + 0.288013152964 1.130417708310 + 0.094776153263 0.956697547107 + +#BASIS SET: +Ba S + 2.165224439573 0.172126972669 + 1.563606663015 -0.412498520678 + 0.468388660094 0.403433792501 + 0.182490630329 0.153935914977 +Ba S + 0.969811881199 0.332173932718 + 
0.438335807571 -0.093598712810 + 0.051413927514 0.275829475814 +Ba S + 0.038420685468 0.348249206633 + 0.034827156628 1.049222993274 +Ba P + 1.246867677258 -0.090713352176 + 0.619719104166 0.186065346861 + 0.314722191970 0.053345917338 + 0.218214872641 0.124038886456 +Ba P + 0.888823238004 -0.009266901736 + 0.112339424956 0.520029153739 + 0.048917251542 0.294753654056 +Ba D + 0.376362798375 0.740747300068 + 0.122678391091 0.544085547842 + +#BASIS SET: +La S + 2.961518815289 0.344436595849 + 2.116898887325 -0.624622276100 + 0.521152464968 0.515127237675 + 0.230069398586 0.246218458006 +La S + 0.579546119270 0.937709515416 + 0.290509752938 1.164367331351 +La S + 0.048916762584 1.105995776925 + 0.018604737481 0.464967111858 +La P + 3.245245643206 0.176769094460 + 2.470890435412 -0.332713646676 + 0.545303856973 0.526757519263 + 0.216537966689 0.336610767661 +La P + 0.033299369099 1.044022672845 + 0.006745724156 0.067353901679 +La D + 1.509848815606 -0.091622463451 + 0.743009875436 0.179196344423 + 0.339879135044 0.485427467590 + 0.137512286533 0.463167031971 +La D + 0.060432252241 0.866616108693 + 0.021446771088 0.436873072386 + +#BASIS SET: +Ce S + 2.338708269400 0.130423191930 + 1.321938059900 -0.945407435390 + 0.936653430510 0.578660129970 + 0.433174507830 0.199228432480 +Ce S + 0.787161385190 0.613999779320 + 0.256141028710 0.974944732900 +Ce S + 0.042862440378 1.450044603900 + 0.019521352708 0.347469655730 +Ce P + 2.219774933800 0.196030889890 + 2.018912507100 -0.320257523750 + 0.592087363040 0.406207592670 + 0.277158908340 0.243517746880 +Ce P + 0.156062580290 1.000000000000 +Ce D + 1.084221385500 -0.014352880349 + 0.448540175050 0.588466222730 + 0.151960945110 0.147553217860 +Ce D + 0.139549259200 1.539344007900 + 0.040874025783 0.369009644880 +Ce F + 34.145399262000 0.032170509402 + 12.065428998000 0.140071215210 + 4.450111205600 0.284220463290 + 1.676403229600 0.342402334160 + 0.652084636310 0.123817383030 +Ce F + 0.545199397300 0.899844805050 + 0.161907400480 0.966849207400 + +#BASIS SET: +Pr S + 3.168984651800 0.171631098330 + 1.667890716800 -0.995838233800 + 0.812186125420 0.272458450300 + 0.601529244950 0.152421654920 +Pr S + 0.903875352100 0.657504248670 + 0.301189177150 0.542467435400 +Pr S + 0.062615656705 0.959511574480 + 0.022405546382 0.289850823590 +Pr P + 2.482108136900 0.203400612200 + 2.257381996700 -0.340953226390 + 0.685565850410 0.453133555060 + 0.387144669100 0.087858983753 +Pr P + 0.220381064000 1.000000000000 +Pr D + 1.002533215800 -0.019351122016 + 0.750223540940 0.598029336850 + 0.229329053320 0.138544903510 +Pr D + 0.205673753170 0.684336589290 + 0.046531823684 0.191287740830 +Pr F + 40.232881930000 0.039727695667 + 14.439096901000 0.185973568320 + 5.666824231800 0.364846754290 + 2.437873255300 0.393202983180 + 1.049827395300 0.151852361610 +Pr F + 1.155904606900 0.649433443810 + 0.367184865050 0.738335210940 + +#BASIS SET: +Nd S + 2.399445231100 0.330052519610 + 1.682884324300 -1.087812006300 + 0.988141728820 0.228026434820 + 0.208920206230 0.190568753630 +Nd S + 1.127298797900 0.721120134100 + 0.489284491850 0.803593302000 +Nd S + 0.024303840698 0.504862388910 + 0.013029609227 0.612696241810 +Nd P + 2.324735346000 0.170147916960 + 2.056219145600 -0.355178646810 + 0.697829062940 0.572197291710 + 0.509194679390 0.085761284929 +Nd P + 0.235469139510 1.000000000000 +Nd D + 1.074577298900 -0.017620186217 + 0.472062103620 0.708203155280 + 0.200015863370 0.168647666540 +Nd D + 0.142617264320 0.520890592900 + 0.057631923663 0.180610053610 +Nd F + 46.847185945000 
0.030502424037 + 17.019304283000 0.153104586980 + 6.798621254000 0.316876806310 + 2.887091563100 0.406866529590 + 1.108982204200 0.166396269320 +Nd F + 1.261794730700 0.585966756640 + 0.388764008300 0.611998230940 + +#BASIS SET: +Pm S + 2.930428338900 0.357177154220 + 1.927498493600 -1.170119154300 + 0.772548914370 0.230068336210 + 0.169843099460 0.183828159010 +Pm S + 1.062021113300 0.229843223870 + 0.443061656340 0.640385348860 +Pm S + 0.047080341941 0.566945207290 + 0.020131076829 0.951730188260 +Pm P + 2.526409558100 0.186350621230 + 2.182882378400 -0.359383234040 + 0.678345299410 0.573936192200 + 0.368422273410 0.203240994710 +Pm P + 0.217452026430 1.000000000000 +Pm D + 1.175937624900 -0.019615039553 + 0.437438037980 0.684495543500 + 0.212867517890 0.138802799410 +Pm D + 0.112237432370 0.346138298110 + 0.029513406922 0.263739259970 +Pm F + 49.708022670000 0.030656850946 + 17.966438036000 0.154675899030 + 7.102080585100 0.330654822160 + 2.908142509900 0.444450968650 + 1.215606314300 0.270568901430 +Pm F + 0.879756044810 0.562280426050 + 0.359805598200 0.722340818030 + +#BASIS SET: +Sm S + 3.198468986000 0.388700993160 + 2.054638395700 -1.296672422600 + 0.824222222660 0.227766807290 + 0.160168477290 0.172833079050 +Sm S + 0.946609884890 0.286948469730 + 0.428263406830 0.671620639940 +Sm S + 0.055236922625 0.599595120810 + 0.024617411607 0.912179105210 +Sm P + 2.513124078800 0.191310169700 + 2.237188900400 -0.361399138370 + 0.697827704550 0.575663871770 + 0.380565701450 0.175498933290 +Sm P + 0.225623014280 1.000000000000 +Sm D + 0.926535125110 -0.017463655728 + 0.519362892700 0.585997594070 + 0.221098036510 0.167648746790 +Sm D + 0.154304824910 0.311945638600 + 0.053900115632 0.235546802760 +Sm F + 50.128022078000 0.032967782994 + 18.164742527000 0.158422860610 + 7.183502716200 0.325839145800 + 2.962754106700 0.421637206220 + 1.232140976700 0.251720808500 +Sm F + 0.930577398270 0.531939752500 + 0.372797890110 0.763575162480 + +#BASIS SET: +Eu S + 3.345059905700 0.375970166750 + 2.075574062000 -1.405131776800 + 0.940426981650 0.232505604610 + 0.181674650410 0.192608133490 +Eu S + 1.052166334100 0.327606031730 + 0.442926265500 0.643659880550 +Eu S + 0.061414092824 0.588230789160 + 0.031784177184 0.930609513330 +Eu P + 2.495634008500 0.194164540170 + 2.292506715600 -0.356747943100 + 0.720591307920 0.567428064980 + 0.386142526820 0.170263118340 +Eu P + 0.229374588050 1.000000000000 +Eu D + 0.964946609210 -0.016327079928 + 0.497327762740 0.714355608550 + 0.212503372040 0.164848026920 +Eu D + 0.155452858570 0.317749826660 + 0.058419430722 0.210656287270 +Eu F + 51.095384126000 0.035305949222 + 18.473596326000 0.162335136700 + 7.346815898000 0.324543595140 + 3.059653733000 0.409215096810 + 1.273688657300 0.267755957520 +Eu F + 0.881971831840 0.506986166910 + 0.381292553470 0.853657650950 + +#BASIS SET: +Gd S + 4.000019104300 0.333487234160 + 2.195731389500 -1.484942023100 + 0.965241154350 0.249401906810 + 0.162248187410 0.184150027830 +Gd S + 1.098367709400 0.332219940840 + 0.432548270620 0.524530415210 +Gd S + 0.064209408826 0.603912156030 + 0.033870628205 0.929468533350 +Gd P + 2.427873110800 0.198067677630 + 2.311779823100 -0.357983931930 + 0.739703409910 0.578006472370 + 0.366725221840 0.189778608390 +Gd P + 0.227828614570 1.000000000000 +Gd D + 0.917667303880 -0.016219993919 + 0.529461794640 0.687152060260 + 0.221266400270 0.194933590720 +Gd D + 0.150791182580 0.317109114010 + 0.055406934975 0.219244642480 +Gd F + 52.702891850000 0.037944092036 + 18.968629359000 0.172950486860 + 
7.483761636800 0.339296635260 + 3.124551141300 0.409244092080 + 1.297917258000 0.266432518960 +Gd F + 0.882336787390 0.516585464810 + 0.384484579370 0.873328896390 + +#BASIS SET: +Tb S + 3.201386505200 0.411894267370 + 2.126784415700 -1.506935953000 + 0.999097495030 0.254535056700 + 0.157417449500 0.117990777940 +Tb S + 1.023627053700 0.340110769870 + 0.430697801590 0.454888900280 +Tb S + 0.069974898990 0.698643296210 + 0.032940990975 0.740386009190 +Tb P + 2.073989571400 -0.343595673730 + 1.828471268600 0.215362185340 + 0.775726769020 0.490677935480 + 0.384091371450 0.256087426350 +Tb P + 0.217915710920 1.000000000000 +Tb D + 0.825171758200 -0.015782273999 + 0.600154080990 0.515824182390 + 0.239457542090 0.219930276580 +Tb D + 0.170840294520 0.339344511810 + 0.066393296853 0.179617582290 +Tb F + 56.589558964000 0.038014387935 + 20.155230196000 0.179793848070 + 7.820360552900 0.362111598160 + 3.178882888700 0.431909631700 + 1.289918311100 0.254255670440 +Tb F + 0.942818356420 0.524283327370 + 0.375168165820 0.774641908370 + +#BASIS SET: +Dy S + 3.077638972300 0.474009059980 + 2.144516903200 -1.571543383700 + 1.032524860600 0.251872641160 + 0.149884855500 0.115997598420 +Dy S + 1.187918492500 0.371841314040 + 0.439964465440 0.439801832140 +Dy S + 0.049122676587 0.662889360870 + 0.020927956353 0.758733207730 +Dy P + 2.282049763500 -0.335364965580 + 2.149986794100 0.212987377950 + 0.778513739630 0.469808752900 + 0.375219342670 0.251032791170 +Dy P + 0.199939554420 1.000000000000 +Dy D + 0.832650334010 -0.016672954384 + 0.586505648120 0.572585525810 + 0.249679875370 0.220091894330 +Dy D + 0.180105077260 0.344582383530 + 0.065825970099 0.191502604620 +Dy F + 57.861342453000 0.038972875107 + 20.635393825000 0.181045246390 + 8.004535365600 0.359424664790 + 3.226449556100 0.423360346770 + 1.294175171300 0.256326644590 +Dy F + 0.881165570020 0.496894803850 + 0.339754421590 0.786379098440 + +#BASIS SET: +Ho S + 3.215855287600 0.435915312520 + 2.254633264000 -1.409057919500 + 0.926678464460 0.236812959040 + 0.197978695910 0.113859531790 +Ho S + 1.074584686700 0.349159708930 + 0.433613079530 0.295799100520 +Ho S + 0.053966343732 0.578706693790 + 0.021991634871 0.803713364560 +Ho P + 2.276740546700 -0.309919424620 + 1.875689434100 0.222433385250 + 0.769351845080 0.483617009030 + 0.342467015030 0.282620610930 +Ho P + 0.189559521080 1.000000000000 +Ho D + 0.723086365990 -0.014022935559 + 0.636267204790 0.519060618500 + 0.234931783170 0.244755281630 +Ho D + 0.179485956040 0.338303537140 + 0.073557821351 0.179021787710 +Ho F + 60.830053758000 0.038490576289 + 21.588065934000 0.181910610310 + 8.250479411400 0.365992385350 + 3.275619486500 0.415195997170 + 1.280581763200 0.223432973500 +Ho F + 1.061858771100 0.560929808620 + 0.378053363800 0.866634478600 + +#BASIS SET: +Er S + 3.300008194300 0.449912443110 + 2.261456184800 -1.616929874300 + 0.901142008760 0.243660305590 + 0.185129017200 0.116060607530 +Er S + 1.234831222100 0.310759560470 + 0.435101114100 0.233295092470 +Er S + 0.039648759574 0.783823145590 + 0.036670951258 0.635200383500 +Er P + 2.327643667900 -0.301384780980 + 1.853492883500 0.228041624770 + 0.774338801330 0.459307886770 + 0.361332277440 0.285507707070 +Er P + 0.186448517290 1.000000000000 +Er D + 0.900253313010 -0.017255084799 + 0.645098906620 0.519588132000 + 0.171165586210 0.251556771310 +Er D + 0.177202737280 0.375443551790 + 0.063204717631 0.194404298070 +Er F + 66.650276003000 0.038515010289 + 23.862873532000 0.185215593670 + 9.413269867800 0.371237926920 + 3.903703455900 
0.442139749250 + 1.628218800500 0.297041225020 +Er F + 0.964181588410 0.515260179640 + 0.417374040540 0.679879748240 + +#BASIS SET: +Tm S + 3.529951967700 0.440030003130 + 2.373264752500 -1.676147150600 + 0.859732693780 0.244015605370 + 0.188994272710 0.126181490180 +Tm S + 1.237714700000 0.353524952200 + 0.437883330560 0.257138376250 +Tm S + 0.056601467736 0.751591102950 + 0.034452658435 0.665435412990 +Tm P + 2.389415038600 -0.298798971110 + 1.855600057200 0.231839187010 + 0.806657888830 0.465772792620 + 0.364317357270 0.303562724750 +Tm P + 0.189679696220 1.000000000000 +Tm D + 0.842104081560 -0.016821641435 + 0.642693889420 0.525275761120 + 0.174205588420 0.254940589090 +Tm D + 0.185376205340 0.386133952670 + 0.066999787462 0.185960001190 +Tm F + 69.098639820000 0.039780930603 + 24.622398191000 0.189150061040 + 9.670584964100 0.377486221370 + 3.967675958400 0.447132729830 + 1.602691573900 0.305383090360 +Tm F + 0.862982955250 0.539209910530 + 0.387135399100 0.660330253160 + +#BASIS SET: +Yb S + 4.129766509200 0.314141037340 + 2.367490669700 -1.896766067000 + 0.806650220260 0.225771432160 + 0.202273850080 0.109885398660 +Yb S + 1.438428189100 0.334354680940 + 0.458551811280 0.223260919060 +Yb S + 0.060105436712 0.719579093490 + 0.035482632702 0.643413955490 +Yb P + 2.471659314300 -0.295125193320 + 1.860993668600 0.237399570240 + 0.823485411640 0.457292600590 + 0.367510901360 0.351242665330 +Yb P + 0.171373315040 1.000000000000 +Yb D + 0.805143471510 -0.017580695087 + 0.498184965450 0.507346270890 + 0.183187725480 0.270574390960 +Yb D + 0.048692800777 0.230634878920 + 0.002065313870 0.297864857370 +Yb F + 69.918468598000 0.041342514961 + 24.947950088000 0.192551350850 + 9.765463162000 0.376774380600 + 4.011151681600 0.429599734010 + 1.635982295700 0.293066793210 +Yb F + 0.885391198850 0.546626208600 + 0.427393308340 0.733864066460 + +#BASIS SET: +Lu S + 3.997702916200 0.457441680180 + 2.709261988900 -1.540133835600 + 0.902625980060 0.245518229230 + 0.197099834360 0.103486958220 +Lu S + 1.249815976000 0.312543278220 + 0.465480048010 0.234491973770 +Lu S + 0.080960274419 0.624707137520 + 0.047485302330 0.624526188080 +Lu P + 2.581464009800 -0.297208403370 + 1.976311904200 0.250055874640 + 0.835207096090 0.450659846040 + 0.356413889420 0.305459141120 +Lu P + 0.156421764500 1.000000000000 +Lu D + 0.917062629490 -0.017339228386 + 0.572775203150 0.503610273730 + 0.168191033340 0.248443090760 +Lu D + 0.086838737228 0.311494627600 + 0.038900529418 0.217449926630 +Lu F + 73.095465952000 0.042169444125 + 26.023715946000 0.195911479400 + 10.230613237000 0.380324283710 + 4.182255200200 0.437222388230 + 1.656374469500 0.297996648530 +Lu F + 0.826840213730 0.493037406260 + 0.497649365380 0.842886162920 + +#BASIS SET: +Hf S + 5.745263913265 0.276398505380 + 3.894142553735 -0.839898489463 + 3.121027335851 0.474240430954 + 0.635496702086 0.158777918809 +Hf S + 0.764844351020 1.707468977910 + 0.290339685115 0.519137602207 +Hf S + 0.082091325177 1.958910571765 + 0.025505481057 0.408538141346 +Hf P + 8.047349756152 0.134374727878 + 5.353154905330 -0.304612877776 + 0.989731302776 0.607862566692 + 0.373084955619 0.433934935433 +Hf P + 0.042930780688 1.371306677174 + 0.018204544818 0.809563007668 +Hf D + 3.430686255910 -0.067789549648 + 1.284638756596 0.283064494075 + 0.433718517572 0.619508023969 + 0.150295851218 0.390710591201 +Hf D + 0.072503797491 0.794719269255 + 0.027061821278 0.342285689082 + +#BASIS SET: +Ta S + 5.963803707657 0.279364073058 + 3.961116677799 -0.850260809206 + 3.085766632618 
0.479568840294 + 0.665808701395 0.193394187367 +Ta S + 0.805972319842 1.688246848358 + 0.268930532034 0.379133660479 +Ta S + 0.086553429743 2.595263529220 + 0.027791157335 0.533383099437 +Ta P + 8.182904208394 0.136625206869 + 5.419010727730 -0.317091050150 + 1.080897305362 0.587463210567 + 0.427101858969 0.435988981255 +Ta P + 0.123665166522 1.348117857613 + 0.074631391466 0.779829286766 +Ta D + 3.938780857951 -0.057900768529 + 1.367499138219 0.225894545813 + 0.629935945405 0.379219030762 + 0.273092407860 0.508381938982 +Ta D + 0.099925683041 0.812911312187 + 0.037943852635 0.332399580288 + +#BASIS SET: +W S + 6.106125947092 0.277025340819 + 4.127702570212 -0.844730321999 + 3.274970817330 0.483142244883 + 0.733004472901 0.158006598997 +W S + 0.841444067228 1.644909883363 + 0.271780212684 0.409568828017 +W S + 0.100331812736 1.869589815121 + 0.029632732761 0.528929932062 +W P + 8.285553529800 0.157354211889 + 5.615678187579 -0.352854952607 + 1.158573046813 0.612420934890 + 0.467617380312 0.446722978433 +W P + 0.137270627660 0.964875039887 + 0.082293678701 0.246586455905 +W D + 4.192126847007 -0.054899215601 + 1.448413562415 0.216997892231 + 0.626627825114 0.432982414411 + 0.249124670225 0.423123825902 +W D + 0.097421390343 1.090266290688 + 0.036447054814 0.478789058281 + +#BASIS SET: +Re S + 6.488782003396 0.257736215445 + 4.217568459632 -0.822875620070 + 3.262879428220 0.479067032833 + 0.754784222831 0.203147240894 +Re S + 0.941081734619 1.881764103129 + 0.272712355739 0.429341141770 +Re S + 0.111126860141 2.255322226400 + 0.040009355137 1.259531768378 +Re P + 7.520497681685 0.496927359844 + 6.498258506185 -0.676809195328 + 1.216427394186 0.572364689927 + 0.495158438289 0.396340498872 +Re P + 0.136186441470 4.165341545653 + 0.082441357688 0.908639271543 +Re D + 4.424276090621 -0.075416964612 + 1.537528643872 0.289740692768 + 0.720850710433 0.511674499486 + 0.312377703413 0.543026746807 +Re D + 0.122723095719 0.419130234729 + 0.050344645461 0.124831357094 + +#BASIS SET: +Os S + 6.589301521031 0.262324014895 + 4.483410826406 -0.824151542759 + 3.598784747600 0.488236883338 + 0.830482873892 0.148959289850 +Os S + 0.974401985901 1.704276404135 + 0.337099880208 0.419939365808 +Os S + 0.119371477901 1.190416170712 + 0.040334652774 0.415098407891 +Os P + 8.018868508095 0.366991980684 + 6.516333088213 -0.572512459986 + 1.298661604992 0.634399744500 + 0.537047008060 0.437296123703 +Os P + 0.163425595329 2.846863530709 + 0.046752457019 0.349464207023 +Os D + 4.562022886893 -0.075845399099 + 1.700498170420 0.262012418274 + 0.783963505481 0.489253051009 + 0.333034419339 0.480607051003 +Os D + 0.128320311421 1.946527751776 + 0.051494690010 0.291341732890 + +#BASIS SET: +Ir S + 6.990696480869 0.238565941294 + 4.657628826621 -0.824425741900 + 3.822133240323 0.507613356927 + 0.888825622345 0.119157387695 +Ir S + 1.023793770162 1.457427286127 + 0.309199315635 0.281543512886 +Ir S + 0.127967479426 1.171391362467 + 0.040088560933 0.521901566386 +Ir P + 8.689088185082 0.243119322841 + 6.577366558565 -0.444544360607 + 1.356413354364 0.645655248595 + 0.544928093624 0.402704630432 +Ir P + 0.140794381741 0.983202269789 + 0.048851626916 0.535708402160 +Ir D + 4.665178664686 -0.081060285159 + 1.806050424121 0.274531805989 + 0.787394625251 0.509312962011 + 0.321373987554 0.421907340162 +Ir D + 0.121625299200 1.542032145949 + 0.049105698957 0.215355952279 + +#BASIS SET: +Pt S + 7.679412615881 0.230877908692 + 4.709087759587 -0.861654782814 + 3.645313517688 0.541904557353 + 0.953420950841 0.472553125277 +Pt S + 
1.027769770519 1.543816333192 + 0.895827169520 0.276175320568 +Pt S + 0.139555297410 1.211933971063 + 0.049340522870 0.547971738045 +Pt P + 8.289144538801 0.712848039607 + 7.316311140271 -0.940414367671 + 1.449109980349 0.710309776673 + 0.609243388052 0.456919979943 +Pt P + 0.193683119741 2.292500144336 + 0.044166144409 0.224210089383 +Pt D + 4.910332949872 -0.079625149422 + 1.938691426506 0.267813136394 + 0.864747561557 0.492535757006 + 0.357332283630 0.415036501570 +Pt D + 0.137946336644 2.006060038467 + 0.050242829695 0.262075608301 + +#BASIS SET: +Au S + 8.482247329754 0.229037007166 + 5.008887371391 -0.907405209082 + 3.953463080432 0.532729463041 + 1.016674280753 0.288719412142 +Au S + 1.244532317707 1.102769284388 + 0.309885825329 0.343912022063 +Au S + 0.147596397487 1.099367153402 + 0.047719646781 0.522674919172 +Au P + 8.831554690513 0.509475138022 + 7.566282794345 -0.721074776360 + 1.506700409832 0.698388067412 + 0.612822693070 0.409074097606 +Au P + 0.176221965082 2.730273272472 + 0.061994012169 1.122895118562 +Au D + 5.149888373979 -0.076940684362 + 2.038287817758 0.266148488840 + 0.898017206860 0.479912521191 + 0.362635858492 0.377471368469 +Au D + 0.138458740562 2.462111180979 + 0.046715591266 0.250738248974 + +#BASIS SET: +Hg S + 9.562300229367 0.181250771129 + 5.079395532020 -0.869350443981 + 3.982850471475 0.486376181267 + 0.961127400893 0.258983305153 +Hg S + 1.377803059990 1.722106394360 + 0.381440671172 0.250455726703 +Hg S + 0.172351434399 1.100334941673 + 0.063943840693 0.677823980347 +Hg P + 9.893101932495 0.218067320959 + 7.423812439662 -0.420280763099 + 1.600738605984 0.669305658396 + 0.652677096183 0.386781925502 +Hg P + 0.183498087066 2.466433663901 + 0.064592858638 1.561759630522 +Hg D + 5.315692442902 -0.078574152910 + 2.245554598474 0.243559945187 + 1.001947266394 0.451151800485 + 0.413902886479 0.345923827411 +Hg D + 0.167213642564 2.300779412806 + 0.067098161686 0.210685863765 + +#BASIS SET: +Tl S + 1.505313049428 0.313662634995 + 0.926406638906 -0.722746145099 + 0.195472441877 0.634249265129 + 0.079197571540 0.616447338532 +Tl S + 0.179423367042 0.144781868221 + 0.072061846216 0.890059337687 +Tl P + 1.340445390895 0.090978395301 + 0.866604785093 -0.259543806887 + 0.217123440967 0.362159556277 + 0.091408672799 0.365315709408 +Tl P + 0.591817274684 0.018848049794 + 0.043682496425 1.416103944518 +Tl D + 0.117314837048 0.604178245689 + 0.054459036250 0.886013825833 + +#BASIS SET: +Pb S + 1.341753004241 0.775578610898 + 1.105497689546 -1.202765308935 + 0.211699001507 0.723705670622 + 0.129547855691 0.256761268066 +Pb S + 0.549706034009 0.131886387067 + 0.063284849518 1.510485987615 +Pb P + 1.414803989187 0.152750104080 + 1.024069215145 -0.288877125439 + 0.186029005782 0.437219972839 + 0.082796300988 0.118522273753 +Pb P + 0.331088269580 0.201768239863 + 0.046482916169 3.750488820230 +Pb D + 0.200493671996 0.835023372291 + 0.088262123032 1.095405604994 + +#BASIS SET: +Bi S + 1.598452354960 0.090505383135 + 1.024031934201 -0.505556064927 + 0.211694361874 0.733716856593 + 0.188315325223 0.112026081947 +Bi S + 0.302425752233 0.209510573559 + 0.084994052070 0.597248589405 +Bi P + 1.494708346058 0.395953810643 + 1.254367409879 -0.565931848387 + 0.207175316033 0.638559022440 + 0.075039529988 0.141797873604 +Bi P + 0.199985131181 0.116806624587 + 0.062373215520 3.086007117955 +Bi D + 0.125642967394 1.009438999376 + 0.092809453349 0.739934271101 + +#BASIS SET: +Po S + 1.899315412209 0.398802099258 + 1.330205684309 -0.819810438961 + 0.295449029905 0.530266670623 + 
0.199451922422 0.183432964269 +Po S + 0.315025299523 0.224516411804 + 0.104988411226 0.818764958809 +Po P + 1.611909638782 0.155499395899 + 1.099867763114 -0.399122211489 + 0.206617899935 0.547783019859 + 0.120990132583 0.207180326062 +Po P + 0.471197704174 0.778854060703 + 0.062603532297 1.066397923226 +Po D + 0.165433941928 0.960300261731 + 0.062674049116 0.786318229414 + +#BASIS SET: +At S + 1.924148580749 0.525856661156 + 1.346688705121 -1.067720621131 + 0.359439113226 0.638723774762 + 0.188782116680 0.502953817840 +At S + 0.436733046143 0.086926389554 + 0.086660202651 0.943503262571 +At P + 2.082544353701 0.062574676624 + 1.160295712623 -0.258455276224 + 0.350563463518 0.458257809303 + 0.160220695112 0.515683272949 +At P + 0.191314694381 0.104704037336 + 0.069240934858 0.929284118241 +At D + 0.286195490906 0.174109793764 + 0.168758025833 1.048978368888 + +#BASIS SET: +Rn S + 1.974019782555 0.658758503133 + 1.478356500621 -1.204012936389 + 0.338292719593 0.976772343385 + 0.138050661999 0.215247233371 +Rn S + 0.239238522409 0.078556974413 + 0.115156734440 1.152238798180 +Rn P + 1.866046112655 0.255378767651 + 1.572334840818 -0.377681250068 + 0.317716247096 0.531475806749 + 0.141239873985 0.285072318557 +Rn P + 0.200353951276 0.101799470336 + 0.077190057936 0.881907208554 +Rn D + 0.333065558020 0.245718887909 + 0.214108626479 1.219897201244 + +END diff --git a/gpu4pyscf/drivers/dft_3c_driver.py b/gpu4pyscf/drivers/dft_3c_driver.py new file mode 100644 index 00000000..adf9fe8f --- /dev/null +++ b/gpu4pyscf/drivers/dft_3c_driver.py @@ -0,0 +1,409 @@ +# Copyright 2021-2024 The PySCF Developers. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
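+
+# Example invocation (the --config/--charge/--spin flags are defined in the
+# argparse block at the bottom of this file; sample configs such as
+# dft_b973c_sample.json ship alongside this driver):
+#   python3 dft_3c_driver.py --config dft_b973c_sample.json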
+
+###################################################################
+# This is a customized driver for three composite methods only.
+# It only works for b97-3c, r2scan-3c, and wb97x-3c.
+###################################################################
+
+import os
+import time
+import json
+import pyscf
+import argparse
+import tempfile
+import shutil
+import cupy
+import traceback
+import h5py
+import numpy as np
+from types import MethodType
+from pyscf import lib
+from pyscf import dft
+from pyscf.hessian import thermo
+from pyscf.lib import logger
+from pyscf.dispersion import dftd3, dftd4, gcp
+
+CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
+
+import importlib.metadata
+required_version = "1.3.0"
+installed_version = importlib.metadata.version('pyscf-dispersion')
+# Compare version components numerically; a plain string comparison would
+# reject e.g. "1.10.0" as older than "1.3.0".
+assert tuple(map(int, installed_version.split('.')[:3])) >= \
+    tuple(map(int, required_version.split('.')))
+
+def parse_3c(xc_name):
+    """
+    Return xc, nlc, basis, ecp, (xc_disp, disp), xc_gcp
+    """
+    if xc_name == 'b973c':
+        return 'GGA_XC_B97_3C', 0, 'def2-mtzvp', None, ('b97-3c', 'D3BJ'), 'b973c'
+    elif xc_name == 'r2scan3c':
+        return 'r2scan', 0, 'def2-mtzvpp', None, ('r2scan-3c', 'D4'), 'r2scan3c'
+    elif xc_name == 'wb97x3c':
+        # 'Grimme vDZP' is available in the BSE, but pyscf 2.8 is not able to
+        # parse its ECP properly.
+        # basis = 'Grimme vDZP'
+        # ecp = 'Grimme vDZP'
+        basis = os.path.join(CURRENT_DIR, 'basis_vDZP_NWCHEM.dat')
+        ecp = os.path.join(CURRENT_DIR, 'ecp_vDZP_NWCHEM.dat')
+        return 'wb97x-v', 0, basis, ecp, ('wb97x-3c', 'D4'), None
+    else:
+        raise RuntimeError('Unknown xc functional for parsing 3c')
+
+def get_dispersion(mol, xc, grad=True):
+    if xc == 'b97-3c':
+        d3_model = dftd3.DFTD3Dispersion(mol, xc=xc, atm=True)
+        res = d3_model.get_dispersion(grad=grad)
+    elif xc == 'r2scan-3c':
+        # r2scan-3c uses customized parameters
+        # https://github.com/psi4/psi4/blob/0e54962d629494f4ed142d0499d7faeaf36effdd/psi4/driver/procrouting/dft/mgga_functionals.py#L250
+        d4_model = dftd4.DFTD4Dispersion(mol, xc=xc, atm=True, ga=2.0, gc=1.0)
+        d4_model.set_param(0.0, 0.42, 5.65, s9=2.0)
+        res = d4_model.get_dispersion(grad=grad)
+    elif xc == 'wb97x-3c':
+        d4_model = dftd4.DFTD4Dispersion(mol, xc=xc, atm=True)
+        res = d4_model.get_dispersion(grad=grad)
+    else:
+        raise NotImplementedError
+    return res
+
+def gen_disp_fun(xc_disp, xc_gcp):
+    """
+    Generate a function to calculate the sum of the dispersion and gCP contributions
+    """
+    def get_disp(mf, disp=None, with_3body=None, verbose=None):
+        mol = mf.mol
+        energy = 0.0
+        if xc_disp is not None:
+            res = get_dispersion(mol, xc_disp, grad=False)
+            energy += res.get('energy')
+        mf.scf_summary['dispersion'] = energy
+        if xc_gcp is not None:
+            gcp_model = gcp.GCP(mol, method=xc_gcp)
+            res = gcp_model.get_counterpoise()
+            energy += res['energy']
+        return energy
+    return get_disp
+
+def gen_disp_grad_fun(xc_disp, xc_gcp):
+    """
+    Generate a function to calculate the gradient of the dispersion + gCP contributions
+    """
+    def get_disp_grad(mf_grad, disp=None, with_3body=None, verbose=None):
+        mf = mf_grad.base
+        mol = mf.mol
+        gradient = 0.0
+        if xc_disp is not None:
+            res = get_dispersion(mol, xc_disp, grad=True)
+            gradient += res.get('gradient')
+
+        if xc_gcp is not None:
+            gcp_model = gcp.GCP(mol, method=xc_gcp)
+            res = gcp_model.get_counterpoise(grad=True)
+            gradient += res['gradient']
+        return gradient
+    return get_disp_grad
+
+def gen_disp_hess_fun(xc_disp, xc_gcp):
+    """
+    Generate a function to calculate the Hessian of the dispersion + gCP contributions
+    """
+    def get_disp_hess(mf_hess, disp=None, with_3body=None):
+        mf = mf_hess.base
+        mol = mf.mol
+        natm = mol.natm
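+        # Build the dispersion + gCP Hessian by central finite differences of
+        # the analytical gradient: each Cartesian coordinate is displaced by
+        # +/- eps Bohr and h_disp[i,:,j,:] = (g1 - g2) / (2*eps).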
+        h_disp = np.empty([natm,natm,3,3])
+
+        coords = mf_hess.mol.atom_coords()
+        mol = mol.copy()
+        eps = 1e-5
+        for i in range(natm):
+            for j in range(3):
+                coords[i,j] += eps
+                mol.set_geom_(coords, unit='Bohr')
+                g1 = 0.0
+                if xc_disp is not None:
+                    res = get_dispersion(mol, xc_disp, grad=True)
+                    g1 += res.get('gradient')
+                if xc_gcp is not None:
+                    gcp_model = gcp.GCP(mol, method=xc_gcp)
+                    res = gcp_model.get_counterpoise(grad=True)
+                    g1 += res['gradient']
+
+                coords[i,j] -= 2.0*eps
+                mol.set_geom_(coords, unit='Bohr')
+                g2 = 0.0
+                if xc_disp is not None:
+                    res = get_dispersion(mol, xc_disp, grad=True)
+                    g2 += res.get('gradient')
+                if xc_gcp is not None:
+                    gcp_model = gcp.GCP(mol, method=xc_gcp)
+                    res = gcp_model.get_counterpoise(grad=True)
+                    g2 += res['gradient']
+
+                coords[i,j] += eps
+                h_disp[i,:,j,:] = (g1 - g2)/(2.0*eps)
+        return h_disp
+    return get_disp_hess
+
+def run_dft(mol_name, config, charge=None, spin=0):
+    ''' Perform DFT calculations based on the configuration file.
+    Save the results, timing, and log to an HDF5 file.
+    '''
+    xc = config.get('xc', 'b3lyp')
+    grids = config.get('grids', {'atom_grid': (99,590)})
+    nlcgrids = config.get('nlcgrids', {'atom_grid': (50,194)})
+    verbose = config.get('verbose', 4)
+    scf_conv_tol = config.get('scf_conv_tol', 1e-10)
+    direct_scf_tol = config.get('direct_scf_tol', 1e-14)
+    with_df = config.get('with_df', True)
+    auxbasis = config.get('auxbasis', 'def2-universal-jkfit')
+    with_gpu = config.get('with_gpu', True)
+
+    with_grad = config.get('with_grad', True)
+    with_hess = config.get('with_hess', True)
+    with_thermo = config.get('with_thermo', False)
+    save_density = config.get('save_density', False)
+    input_dir = config.get('input_dir', './')
+
+    default_solvent = {'method': 'iefpcm', 'eps': 78.3553, 'solvent': 'water'}
+    with_solvent = config.get('with_solvent', False)
+    solvent = config.get('solvent', default_solvent)
+
+    pyscf_xc, nlc, basis, ecp, (xc_disp, disp), xc_gcp = parse_3c(xc)
+
+    # I/O
+    fp = tempfile.TemporaryDirectory()
+    local_dir = f'{fp.name}/'
+    logfile = f'{mol_name[:-4]}_pyscf.log'
+    shutil.copyfile(f'{input_dir}/{mol_name}', local_dir+mol_name)
+    cupy.get_default_memory_pool().free_all_blocks()
+    lib.num_threads(8)
+    start_time = time.time()
+    mol = pyscf.M(
+        atom=local_dir+mol_name,
+        basis=basis,
+        ecp=ecp,
+        max_memory=32000,
+        verbose=verbose,
+        charge=charge,
+        spin=spin,
+        output=f'{local_dir}/{logfile}')
+    mol.build()
+
+    mf = dft.KS(mol, xc=pyscf_xc)
+    if 'atom_grid' in grids: mf.grids.atom_grid = grids['atom_grid']
+    if 'level' in grids: mf.grids.level = grids['level']
+    if mf._numint.libxc.is_nlc(mf.xc):
+        if 'atom_grid' in nlcgrids: mf.nlcgrids.atom_grid = nlcgrids['atom_grid']
+        if 'level' in nlcgrids: mf.nlcgrids.level = nlcgrids['level']
+
+    if with_df:
+        mf = mf.density_fit(auxbasis=auxbasis)
+    if with_gpu:
+        mf = mf.to_gpu()
+
+    #### Changes for 3C methods #####
+    # Set up the dispersion correction and gCP
+    mf.nlc = nlc
+    mf.get_dispersion = MethodType(gen_disp_fun(xc_disp, xc_gcp), mf)
+    mf.do_disp = lambda: True
+    #################################
+
+    mf.chkfile = None
+    if with_solvent:
+        if solvent['method'].endswith(('PCM', 'pcm')):
+            mf = mf.PCM()
+            mf.with_solvent.lebedev_order = 29
+            mf.with_solvent.method = solvent['method'].replace('PCM','-PCM')
+            mf.with_solvent.eps = solvent['eps']
+        elif solvent['method'].endswith(('smd', 'SMD')):
+            mf = mf.SMD()
+            mf.with_solvent.lebedev_order = 29
+            mf.with_solvent.method = 'SMD'
+            mf.with_solvent.solvent = solvent['solvent']
+        else:
+            raise NotImplementedError
+
+    mf.direct_scf_tol = direct_scf_tol
+    mf.chkfile = None
+    mf.conv_tol = scf_conv_tol
+    e_tot = mf.kernel()
+
+    if not mf.converged:
+        logger.warn(mf, 'SCF failed to converge')
+
+    scf_time = time.time() - start_time
+    print(f'compute time for energy: {scf_time:.3f} s')
+
+    e1 = mf.scf_summary.get('e1', 0.0)
+    e_coul = mf.scf_summary.get('coul', 0.0)
+    e_xc = mf.scf_summary.get('exc', 0.0)
+    e_disp = mf.scf_summary.get('dispersion', 0.0)
+    e_solvent = mf.scf_summary.get('e_solvent', 0.0)
+
+    data_file = mol_name[:-4] + '_pyscf.h5'
+
+    with h5py.File(f'{local_dir}/{data_file}', 'w') as h5f:
+        h5f.create_dataset('e_tot', data=e_tot)
+        h5f.create_dataset('e1', data=e1)
+        h5f.create_dataset('e_coul', data=e_coul)
+        h5f.create_dataset('e_xc', data=e_xc)
+        h5f.create_dataset('e_disp', data=e_disp)
+        h5f.create_dataset('e_solvent', data=e_solvent)
+        h5f.create_dataset('scf_time', data=scf_time)
+
+        dm = mf.make_rdm1()
+        if isinstance(dm, cupy.ndarray): dm = dm.get()
+        h5f.create_dataset('dm', data=dm)
+
+        if save_density and xc.lower() != 'hf':
+            weights = mf.grids.weights
+            coords = mf.grids.coords
+            dm0 = dm[0] + dm[1] if dm.ndim == 3 else dm
+            rho = mf._numint.get_rho(mf.mol, dm0, mf.grids)
+
+            if isinstance(weights, cupy.ndarray): weights = weights.get()
+            if isinstance(coords, cupy.ndarray): coords = coords.get()
+            if isinstance(rho, cupy.ndarray): rho = rho.get()
+
+            h5f.create_dataset('grids_weights', data=weights)
+            h5f.create_dataset('grids_coords', data=coords)
+            h5f.create_dataset('grids_rho', data=rho)
+
+        if dm.ndim == 3:
+            # open-shell case
+            mo_energy = mf.mo_energy
+            if isinstance(mo_energy, cupy.ndarray): mo_energy = mo_energy.get()
+            mo_energy[0].sort()
+            mo_energy[1].sort()
+            na, nb = mf.nelec
+            h5f.create_dataset('e_lumo_alpha', data=mo_energy[0][na])
+            h5f.create_dataset('e_lumo_beta', data=mo_energy[1][nb])
+            h5f.create_dataset('e_homo_alpha', data=mo_energy[0][na-1])
+            h5f.create_dataset('e_homo_beta', data=mo_energy[1][nb-1])
+        else:
+            # closed-shell case
+            mo_energy = mf.mo_energy
+            if isinstance(mo_energy, cupy.ndarray): mo_energy = mo_energy.get()
+            mo_energy.sort()
+            nocc = mf.mol.nelectron // 2
+            h5f.create_dataset('e_lumo', data=mo_energy[nocc])
+            h5f.create_dataset('e_homo', data=mo_energy[nocc-1])
+
+    ##################### Gradient Calculation ###############################
+    g = None
+    if with_grad:
+        try:
+            start_time = time.time()
+            g = mf.nuc_grad_method()
+            # Override get_dispersion for the 3c methods
+            g.get_dispersion = MethodType(gen_disp_grad_fun(xc_disp, xc_gcp), g)
+            if with_df:
+                g.auxbasis_response = True
+            f = g.kernel()
+            g = None
+            grad_time = time.time() - start_time
+            print(f'compute time for gradient: {grad_time:.3f} s')
+        except Exception as exc:
+            print(traceback.format_exc())
+            print(exc)
+            f = -1
+            grad_time = -1
+
+        with h5py.File(f'{local_dir}/{data_file}', 'a') as h5f:
+            h5f.create_dataset('grad', data=f)
+            h5f.create_dataset('grad_time', data=grad_time)
+
+    #################### Hessian Calculation ###############################
+    h = None
+    if with_hess:
+        try:
+            natm = mol.natm
+            start_time = time.time()
+            h = mf.Hessian()
+            # Override get_dispersion for the 3c methods
+            h.get_dispersion = MethodType(gen_disp_hess_fun(xc_disp, xc_gcp), h)
+            h.auxbasis_response = 2
+            _h_dft = h.kernel()
+            h_dft = _h_dft.transpose([0,2,1,3]).reshape([3*natm, 3*natm])
+            hess_time = time.time() - start_time
+            print(f'compute time for hessian: {hess_time:.3f} s')
+
+            if with_thermo:
+                # harmonic analysis
+                start_time = time.time()
+                normal_mode = thermo.harmonic_analysis(mol, _h_dft)
+
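+                # thermo.thermo() evaluates ideal-gas rigid-rotor/harmonic-
+                # oscillator thermochemistry from the harmonic frequencies at
+                # the given temperature (K) and pressure (Pa).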
+                thermo_dat = thermo.thermo(
+                    mf,                      # GPU4PySCF object
+                    normal_mode['freq_au'],
+                    298.15,                  # room temperature
+                    101325)                  # standard atmosphere
+                thermo_time = time.time() - start_time
+                print(f'compute time for harmonic analysis: {thermo_time:.3f} s')
+
+        except Exception as exc:
+            print(traceback.format_exc())
+            print(exc)
+            h_dft = -1
+            hess_time = -1
+
+        with h5py.File(f'{local_dir}/{data_file}', 'a') as h5f:
+            h5f.create_dataset('hess', data=h_dft)
+            h5f.create_dataset('hess_time', data=hess_time)
+
+            if with_thermo:
+                h5f.create_dataset('freq_au', data=normal_mode['freq_au'])
+                h5f.create_dataset('freq_wavenumber', data=normal_mode['freq_wavenumber'])
+                h5f.create_dataset('E_tot', data=thermo_dat['E_tot'][0])
+                h5f.create_dataset('H_tot', data=thermo_dat['H_tot'][0])
+                h5f.create_dataset('G_tot', data=thermo_dat['G_tot'][0])
+                h5f.create_dataset('E_elec', data=thermo_dat['E_elec'][0])
+                h5f.create_dataset('E_trans', data=thermo_dat['E_trans'][0])
+                h5f.create_dataset('E_rot', data=thermo_dat['E_rot'][0])
+                h5f.create_dataset('E_vib', data=thermo_dat['E_vib'][0])
+                h5f.create_dataset('E_0K', data=thermo_dat['E_0K'][0])
+                h5f.create_dataset('H_elec', data=thermo_dat['H_elec'][0])
+                h5f.create_dataset('H_trans', data=thermo_dat['H_trans'][0])
+                h5f.create_dataset('H_rot', data=thermo_dat['H_rot'][0])
+                h5f.create_dataset('H_vib', data=thermo_dat['H_vib'][0])
+                h5f.create_dataset('G_elec', data=thermo_dat['G_elec'][0])
+                h5f.create_dataset('G_trans', data=thermo_dat['G_trans'][0])
+                h5f.create_dataset('G_rot', data=thermo_dat['G_rot'][0])
+                h5f.create_dataset('G_vib', data=thermo_dat['G_vib'][0])
+
+    # copy the files to the destination folder
+    output_dir = config['output_dir']
+    os.makedirs(output_dir, exist_ok=True)
+
+    shutil.copyfile(f'{local_dir}/{data_file}', f'{output_dir}/{data_file}')
+    shutil.copyfile(f'{local_dir}/{logfile}', f'{output_dir}/{logfile}')
+
+    return mf
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description='Run DFT with GPU4PySCF for molecules')
+    parser.add_argument("--config", type=str, default='example.json')
+    parser.add_argument("--charge", type=int, default=None)
+    parser.add_argument("--spin", type=int, default=0)
+    args = parser.parse_args()
+
+    with open(args.config) as f:
+        config = json.load(f)
+    if isinstance(config, list):
+        config = config[0]
+    for mol_name in config['molecules']:
+        run_dft(mol_name, config, charge=args.charge, spin=args.spin)
diff --git a/gpu4pyscf/drivers/dft_b973c_sample.json b/gpu4pyscf/drivers/dft_b973c_sample.json
new file mode 100644
index 00000000..9085744c
--- /dev/null
+++ b/gpu4pyscf/drivers/dft_b973c_sample.json
@@ -0,0 +1,26 @@
+[{
+    "input_dir": "./",
+    "output_dir": "./",
+    "molecules": [
+        "h2o.xyz",
+        "h2o.xyz",
+        "h2o.xyz",
+        "h2o.xyz",
+        "h2o.xyz"
+    ],
+    "xc": "b973c",
+    "auxbasis": "def2-universal-JFIT",
+    "verbose": 6,
+    "with_solvent": false,
+    "with_thermo": false,
+    "solvent": {
+        "eps": 78.3553,
+        "solvent": "water",
+        "method": "SMD"
+    },
+    "with_gpu": true,
+    "with_df": true,
+    "with_grad": true,
+    "with_hess": true,
+    "save_density": true
+}]
diff --git a/gpu4pyscf/drivers/dft_r2scan3c_sample.json b/gpu4pyscf/drivers/dft_r2scan3c_sample.json
new file mode 100644
index 00000000..3f0eb6aa
--- /dev/null
+++ b/gpu4pyscf/drivers/dft_r2scan3c_sample.json
@@ -0,0 +1,26 @@
+[{
+    "input_dir": "./",
+    "output_dir": "./",
+    "molecules": [
+        "h2o.xyz",
+        "h2o.xyz",
+        "h2o.xyz",
+        "h2o.xyz",
+        "h2o.xyz"
+    ],
+    "xc": 
"r2scan3c", + "auxbasis": "def2-universal-JFIT", + "verbose": 4, + "with_solvent": false, + "with_thermo": false, + "solvent": { + "eps": 78.3553, + "solvent": "water", + "method": "SMD" + }, + "with_gpu": true, + "with_df": true, + "with_grad": true, + "with_hess": true, + "save_density": false +}] diff --git a/gpu4pyscf/drivers/dft_wb97x3c_sample.json b/gpu4pyscf/drivers/dft_wb97x3c_sample.json new file mode 100644 index 00000000..c32e0cc0 --- /dev/null +++ b/gpu4pyscf/drivers/dft_wb97x3c_sample.json @@ -0,0 +1,25 @@ +[{ + "input_dir": "./", + "output_dir": "./", + "molecules": [ + "h2o.xyz", + "h2o.xyz", + "h2o.xyz", + "h2o.xyz", + "h2o.xyz" + ], + "xc": "wb97x3c", + "verbose": 4, + "with_solvent": false, + "with_thermo": false, + "solvent": { + "eps": 78.3553, + "solvent": "water", + "method": "SMD" + }, + "with_gpu": true, + "with_df": true, + "with_grad": true, + "with_hess": false, + "save_density": false +}] diff --git a/gpu4pyscf/drivers/ecp_vDZP_NWCHEM.dat b/gpu4pyscf/drivers/ecp_vDZP_NWCHEM.dat new file mode 100644 index 00000000..228ad57d --- /dev/null +++ b/gpu4pyscf/drivers/ecp_vDZP_NWCHEM.dat @@ -0,0 +1,1214 @@ + +ECP +B nelec 2 +B S +2 4.50610000 23.99296000 +B P +2 5.60000000 -1.30000000 +B D +2 0.08000000 -0.00300000 +B ul +2 1.00000000 0.00000000 +C nelec 2 +C S +2 6.40105200 33.12163800 +C P +2 7.30774700 -1.98625700 +C D +2 5.96179600 -9.45431800 +C ul +2 1.00000000 0.00000000 +N nelec 2 +N S +2 7.97723200 38.53383100 +N P +2 10.18385400 -2.55081000 +N D +2 11.55994700 -2.99554500 +N ul +2 1.00000000 0.00000000 +O nelec 2 +O S +2 10.44567000 50.77106900 +O P +2 18.04517400 -4.90355100 +O D +2 8.16479800 -3.31212400 +O ul +2 1.00000000 0.00000000 +F nelec 2 +F S +2 22.35040000 102.59795200 +2 11.17520000 19.04966300 +F P +2 26.47680000 -15.14396000 +2 13.23840000 2.80292100 +F D +F ul +2 1.00000000 0.00000000 +Ne nelec 2 +Ne S +2 31.86016200 112.52543566 +2 12.36221900 28.30083454 +Ne P +2 21.50803400 -11.12658543 +2 12.91044700 3.38754919 +Ne D +2 0.85038500 -0.18408921 +Ne ul +2 1.00000000 0.00000000 +Na nelec 2 +Na S +2 2.47830001 -14.53866100 +2 3.09900001 31.91120791 +2 3.94710001 -32.32224607 +1 8.21659994 3.14094701 +0 1.50080000 1.87765500 +Na ul +2 1.60680000 -0.00010200 +2 22.52309990 -1.71544300 +1 76.23649979 -1.36191100 +Mg nelec 2 +Mg S +2 2.95159999 -15.62671006 +2 3.73249999 33.53119421 +2 4.81180000 -36.07307196 +1 9.91949999 3.12963399 +0 1.81850000 1.93843301 +Mg ul +2 1.13200000 -0.00005300 +2 27.30570006 -1.93249400 +1 93.30560017 -1.39384700 +Al nelec 10 +Al S +2 2.19822500 20.40981300 +Al P +2 1.60139500 8.98049500 +Al D +2 1.49902600 -1.97041100 +Al ul +2 1.00000000 0.00000000 +Si nelec 10 +Si S +2 2.71362197 26.62331865 +Si P +2 1.96687987 10.92995391 +Si D +2 2.71001600 -4.66941200 +Si ul +2 1.00000000 0.00000000 +P nelec 10 +P S +2 2.94055986 26.53226131 +P P +2 2.22771255 11.49721021 +P D +2 5.66170600 -16.77278000 +P ul +2 1.00000000 0.00000000 +S nelec 10 +S S +2 3.74389164 37.97481900 +S P +2 3.08608744 18.79052931 +S D +2 4.86241400 -7.83796400 +S ul +2 1.00000000 0.00000000 +Cl nelec 10 +Cl S +2 6.39430000 33.13663196 +2 3.19710000 16.27072783 +Cl P +2 5.62070000 24.41699269 +2 2.81030000 7.68304978 +Cl D +2 5.33810000 -8.58764865 +Cl ul +2 1.00000000 0.00000000 +Ar nelec 10 +Ar S +2 10.26172100 68.66778801 +2 3.95272500 24.04276629 +Ar P +2 5.39271400 27.73076331 +2 2.69996700 4.04545904 +Ar D +2 8.08623500 -8.13747696 +2 4.01663200 -1.66452808 +Ar F +2 5.20845900 -3.40009845 +Ar ul +2 1.00000000 0.00000000 +K nelec 10 +K S +2 
2.41030002 33.92499542 +2 2.77449989 -117.52191162 +2 3.33690000 108.25654602 +1 2.22169995 5.80375385 +0 12.19449997 3.21023202 +K P +2 3.15459990 -69.97570801 +2 4.03189993 178.15260315 +2 5.16720009 -136.11946106 +1 11.33360004 4.43228197 +0 1.74329996 4.74835920 +K ul +2 4.99889994 -1.96356797 +2 14.58220005 -15.53031635 +2 44.89559937 -38.57669830 +1 141.48350525 -7.30101395 +Ca nelec 10 +Ca S +2 2.68330002 29.85718918 +2 3.08299994 -117.43736267 +2 3.70659995 102.65641785 +1 2.47650003 10.56573391 +0 13.28890038 3.14651394 +Ca P +2 3.47429991 -77.17020416 +2 4.38199997 180.64172363 +2 5.66209984 -125.80883789 +1 2.20009995 5.88094378 +0 6.12169981 4.76948595 +Ca ul +2 5.92140007 -2.26729298 +2 17.20369911 -17.17546654 +2 53.37039948 -42.60916519 +1 169.98489380 -7.46766806 +Sc nelec 10 +Sc S +2 11.50000000 138.53815200 +2 5.18400000 14.83404210 +Sc P +2 10.93000000 82.45861400 +2 4.58100000 8.56520569 +Sc D +2 13.47000000 -16.12986210 +2 4.37500000 -0.53469012 +Sc ul +2 1.00000000 0.00000000 +Ti nelec 10 +Ti S +2 13.01000000 158.24159300 +2 5.86200000 17.51182390 +Ti P +2 12.46000000 95.23512680 +2 5.21700000 10.04785600 +Ti D +2 15.35000000 -17.56886120 +2 4.98000000 -0.58725612 +Ti ul +2 1.00000000 0.00000000 +V nelec 10 +V S +2 14.49000000 178.44797100 +2 6.52400000 19.83137520 +V P +2 14.30000000 109.52976300 +2 6.02100000 12.57030950 +V D +2 17.48000000 -19.21965700 +2 5.70900000 -0.64277474 +V ul +2 1.00000000 0.00000000 +Cr nelec 10 +Cr S +2 16.39000000 201.57888700 +2 7.40200000 24.20574090 +Cr P +2 16.45000000 125.02277400 +2 6.96200000 16.47906550 +Cr D +2 19.93000000 -20.82742110 +2 6.59800000 -0.83436781 +Cr ul +2 1.00000000 0.00000000 +Mn nelec 10 +Mn S +2 18.52000000 226.43090200 +2 8.37300000 30.35907230 +Mn P +2 18.92000000 142.15470500 +2 8.01700000 21.53650930 +Mn D +2 22.72000000 -22.56811870 +2 7.64000000 -1.20581020 +Mn ul +2 1.00000000 0.00000000 +Fe nelec 10 +Fe S +2 20.93000000 253.74958800 +2 9.44500000 37.92284500 +Fe P +2 21.76000000 161.03681200 +2 9.17800000 27.65129800 +Fe D +2 25.90000000 -24.43127600 +2 8.83500000 -1.43425100 +Fe ul +2 1.00000000 0.00000000 +Co nelec 10 +Co S +2 23.66000000 283.96056600 +2 10.61000000 47.15684590 +Co P +2 25.04000000 182.21223600 +2 10.44000000 35.23335150 +Co D +2 29.54000000 -26.47533270 +2 10.18000000 -1.82578723 +Co ul +2 1.00000000 0.00000000 +Ni nelec 10 +Ni S +2 26.74000000 317.68227200 +2 11.86000000 58.25539100 +Ni P +2 28.80000000 252.47436600 +2 11.79000000 36.08150310 +Ni D +2 33.70000000 -18.52295510 +2 11.66000000 -4.55766810 +Ni ul +2 1.00000000 0.00000000 +Cu nelec 10 +Cu S +2 30.11054300 355.75051200 +2 13.07631000 70.93090600 +Cu P +2 32.69261400 77.96993100 +2 32.77033900 155.92744800 +2 13.75106700 18.02113200 +2 13.32216600 36.09437200 +Cu D +2 38.99651100 -12.34341000 +2 39.53978800 -18.27336200 +2 12.28751100 -0.98470500 +2 11.45930000 -1.31874700 +Cu F +2 6.19010200 -0.22726400 +2 8.11878000 -0.46877300 +Cu ul +2 1.00000000 0.00000000 +Zn nelec 10 +Zn S +2 34.17400100 399.98639900 +2 14.45637100 85.48975000 +Zn P +2 39.88868300 92.38107700 +2 39.65501700 184.77117600 +2 15.29054600 23.00254100 +2 14.90352400 46.05742700 +Zn D +2 43.70829600 -13.69073400 +2 43.69853600 -20.54398000 +2 15.15071800 -1.31615400 +2 15.28244100 -1.83871500 +Zn F +2 8.16001400 -0.37036000 +2 12.22842200 -1.06294300 +Zn ul +2 1.00000000 0.00000000 +Ga nelec 28 +Ga S +2 5.21596000 203.85397200 +Ga P +2 4.30890400 156.10339000 +Ga D +2 0.49635700 1.03164700 +Ga F +2 1.71517000 -10.67373500 +Ga ul +2 1.00000000 0.00000000 
+Ge nelec 28 +Ge S +2 4.81540900 149.24657900 +Ge P +2 4.16951500 132.84433500 +Ge D +2 0.59195800 1.34615400 +Ge F +2 1.79177000 -7.04422300 +Ge ul +2 1.00000000 0.00000000 +As nelec 28 +As S +2 3.61262500 53.96562000 +As P +2 3.90792600 88.94908800 +As D +2 1.92646700 22.42028800 +As F +2 1.77343400 -4.70481500 +As ul +2 1.00000000 0.00000000 +Se nelec 28 +Se S +2 4.23705700 79.66334500 +Se P +2 2.91033400 31.56099300 +Se D +2 2.33570100 30.80461000 +Se F +2 2.25463900 -6.54687500 +Se ul +2 1.00000000 0.00000000 +Br nelec 28 +Br S +2 5.02180000 61.51372100 +2 2.51090000 9.02149300 +Br P +2 4.28140000 53.87586400 +2 2.14070000 4.62940200 +Br D +2 2.88000000 20.84967700 +2 1.44000000 2.96544400 +Br F +2 2.72070000 -8.16149300 +Br ul +2 1.00000000 0.00000000 +Kr nelec 28 +Kr S +2 5.87771800 73.91569390 +2 3.08462200 16.16825080 +Kr P +2 5.16411000 58.51769101 +2 2.35830200 8.25910073 +Kr D +2 3.21536200 33.45822776 +2 1.28500800 0.67725331 +Kr F +2 4.08286900 -15.15869859 +2 1.19396000 -0.17408825 +Kr G +2 3.18077500 -6.83315877 +Kr ul +2 1.00000000 0.00000000 +Rb nelec 28 +Rb S +2 2.29809999 50.81394196 +2 2.66269994 -162.04731750 +2 3.50929999 313.81082153 +2 4.96980000 -309.75451660 +2 6.94840002 216.07606506 +1 17.70389938 20.86063194 +0 25.66029930 3.36120105 +Rb P +2 2.02160001 45.41232300 +2 2.33979988 -145.47238159 +2 3.07839990 283.18420410 +2 4.37570000 -305.10214233 +2 6.15859985 207.65396118 +1 16.77890015 12.15985012 +0 16.61680031 5.39989424 +Rb D +2 1.23380005 31.68275070 +2 1.41939998 -100.62529755 +2 1.83389997 186.52160645 +2 2.54550004 -239.76072693 +2 3.47009993 170.19052124 +1 10.62069988 9.91743755 +0 9.28610039 7.41062880 +Rb ul +2 1.96459997 -1.04400003 +2 5.02349997 -12.26854706 +2 12.31190014 -40.49360657 +2 39.43920136 -92.10794830 +1 116.43070221 -20.25083160 +Sr nelec 28 +Sr S +2 2.44670010 53.58986664 +2 2.86780000 -172.08218384 +2 3.86610007 345.58593750 +2 5.66069984 -351.22171021 +2 8.30790043 257.34286499 +1 23.49519920 12.91709232 +0 21.03380013 6.33449411 +Sr P +2 2.20950007 49.15122604 +2 2.58439994 -158.82582092 +2 3.46840000 320.08462524 +2 5.06430006 -349.10769653 +2 7.37960005 239.32991028 +1 22.22710037 11.86419868 +0 19.91810036 5.33859777 +Sr D +2 1.40730000 32.67572403 +2 1.62419999 -103.22133636 +2 2.11750007 193.27650452 +2 2.97239995 -248.63302612 +2 4.11800003 183.39566040 +1 12.52390003 10.14631939 +0 10.88269997 7.38135815 +Sr ul +2 2.23399997 -1.16187501 +2 5.67100000 -13.37399960 +2 13.88179970 -43.23659134 +2 44.81060028 -100.09903717 +1 132.90260315 -20.51813126 +Y nelec 28 +Y S +2 7.48804900 135.15384400 +2 3.74402500 15.55244100 +Y P +2 6.44537700 87.78499200 +2 3.22268900 11.56406600 +Y D +2 4.65844700 29.70100100 +2 2.32922400 5.53996800 +Y F +2 6.58421200 -19.12219800 +2 3.29210600 -2.43637500 +Y ul +2 1.00000000 0.00000000 +Zr nelec 28 +Zr S +2 8.20000000 150.26759100 +2 4.08972800 18.97621600 +Zr P +2 7.11000000 99.62212400 +2 3.59679800 14.16873300 +Zr D +2 5.35000000 35.04512400 +2 2.49182100 6.11125900 +Zr F +2 7.54000000 -21.09377600 +2 3.77000000 -3.08069400 +Zr ul +2 1.00000000 0.00000000 +Nb nelec 28 +Nb S +2 8.90000000 165.17914300 +2 4.43000000 21.99297400 +Nb P +2 7.77000000 111.79441400 +2 3.96000000 16.63348300 +Nb D +2 6.05000000 38.11224900 +2 2.84000000 8.03916700 +Nb F +2 8.49000000 -22.92955000 +2 4.25000000 -3.66631000 +Nb ul +2 1.00000000 0.00000000 +Mo nelec 28 +Mo S +2 9.71459400 180.10310800 +2 4.68050000 24.99722800 +Mo P +2 8.14213700 123.77275200 +2 4.62598600 19.53022800 +Mo D +2 6.61841500 48.37502200 
+2 3.24875200 8.89205300 +Mo F +2 9.45000000 -24.80517700 +2 4.72000000 -4.15378200 +Mo ul +2 1.00000000 0.00000000 +Tc nelec 28 +Tc S +2 10.42234600 195.15916600 +2 5.03651600 28.09260300 +Tc P +2 8.95044900 135.28456600 +2 4.85443900 21.80650400 +Tc D +2 6.94569700 54.32972900 +2 3.97058500 11.15506800 +Tc F +2 10.40000000 -26.56244700 +2 5.20000000 -4.58568100 +Tc ul +2 1.00000000 0.00000000 +Ru nelec 28 +Ru S +2 11.10526900 209.82297100 +2 5.41474500 30.65472600 +Ru P +2 9.77127100 146.33618200 +2 5.07399100 24.12787700 +Ru D +2 7.67142300 67.51589700 +2 4.13656500 9.87010400 +Ru F +2 11.36000000 -28.34061600 +2 5.68000000 -4.94462900 +Ru ul +2 1.00000000 0.00000000 +Rh nelec 28 +Rh S +2 11.72000000 225.34775400 +2 5.82000000 32.82318900 +Rh P +2 10.42000000 158.70941200 +2 5.45000000 26.44410000 +Rh D +2 8.82000000 62.75862600 +2 3.87000000 10.97871900 +Rh F +2 12.31000000 -30.09345600 +2 6.16000000 -5.21848200 +Rh ul +2 1.00000000 0.00000000 +Pd nelec 28 +Pd S +2 12.43000000 240.22904000 +2 6.17075900 35.17194300 +Pd P +2 11.08000000 170.41727600 +2 5.82955400 28.47213300 +Pd D +2 9.51000000 69.01384500 +2 4.13978100 11.75086200 +Pd F +2 13.27000000 -31.92955400 +2 6.63000000 -5.39821700 +Pd ul +2 1.00000000 0.00000000 +Ag nelec 28 +Ag S +2 13.13000000 255.13936500 +2 6.51000000 36.86612200 +Ag P +2 11.74000000 182.18186900 +2 6.20000000 30.35775100 +Ag D +2 10.21000000 73.71926100 +2 4.38000000 12.50211700 +Ag F +2 14.22000000 -33.68992000 +2 7.11000000 -5.53112000 +Ag ul +2 1.00000000 0.00000000 +Cd nelec 28 +Cd S +2 13.83586900 270.00948300 +2 6.85727000 38.76730800 +Cd P +2 12.40497100 193.82962900 +2 6.56779900 31.89652500 +Cd D +2 10.89692500 79.19364700 +2 4.64116500 13.23082700 +Cd F +2 15.18479600 -35.47662600 +2 7.59239800 -5.61767700 +Cd ul +2 1.00000000 0.00000000 +In nelec 46 +In S +2 1.43509100 29.16521900 +2 0.69580500 -4.19080600 +In P +2 1.44083200 36.99054200 +2 0.70139200 -3.36582000 +In D +2 0.96123600 20.00053100 +In F +2 0.88436900 -6.01909200 +In ul +2 1.00000000 0.00000000 +Sn nelec 46 +Sn S +2 1.96972500 67.92534700 +2 0.97237500 -7.47854600 +Sn P +2 1.99921000 56.60288000 +2 0.99904200 -2.16177600 +Sn D +2 0.50036100 2.57633600 +Sn F +2 1.23088000 -10.10925300 +Sn ul +2 1.00000000 0.00000000 +Sb nelec 46 +Sb S +2 2.49109100 68.42793800 +2 1.34157500 -4.39863100 +Sb P +2 2.14386400 63.96546900 +2 0.58550300 -0.57872600 +Sb D +2 0.79540100 7.80366100 +Sb F +2 1.60925100 -14.51768700 +Sb ul +2 1.00000000 0.00000000 +Te nelec 46 +Te S +2 2.92379400 50.08380500 +2 1.15275400 1.96814000 +Te P +2 2.60308600 119.82070200 +2 0.98544800 -2.03904800 +Te D +2 1.43501900 37.75721400 +Te F +2 1.93927000 -17.86464100 +Te ul +2 1.00000000 0.00000000 +I nelec 46 +I S +2 3.51120000 83.11386300 +2 1.75560000 5.20187600 +I P +2 2.96880000 82.81110900 +2 1.48440000 3.37968200 +I D +2 1.90660000 10.30427700 +2 0.95330000 7.58803200 +I F +2 2.30750000 -21.47793600 +I ul +2 1.00000000 0.00000000 +Xe nelec 46 +Xe S +2 3.94026300 122.76382934 +2 2.27726400 8.30885115 +Xe P +2 3.02837300 68.82300437 +2 1.39431900 3.64674223 +Xe D +2 2.12260500 23.65207854 +2 0.79866900 3.25844113 +Xe F +2 6.16436000 -47.70319876 +2 1.54237400 -6.54113991 +Xe G +2 1.84789200 -7.10585060 +Xe ul +2 1.00000000 0.00000000 +Cs nelec 46 +Cs S +2 1.38530004 42.85466766 +2 1.63240004 -138.00901794 +2 2.20580006 275.99960327 +2 3.22149992 -280.45663452 +2 4.64960003 199.82038879 +1 15.15250015 27.73096657 +0 19.00049973 3.76870608 +Cs P +2 1.25950003 48.66250992 +2 1.44169998 -145.70526123 +2 1.87639999 
264.46368408 +2 2.65750003 -279.85159302 +2 3.63870001 184.35585022 +1 10.65320015 23.30001831 +0 14.68060017 5.76792908 +Cs D +2 0.76810002 34.86072540 +2 0.87459999 -106.79302979 +2 1.10769999 188.23532104 +2 1.48230004 -217.63992310 +2 1.92019999 137.74559021 +1 6.21829987 34.42418671 +0 17.38809967 7.19875193 +Cs ul +2 0.93849999 -0.78916699 +2 2.31629992 -8.42115784 +2 6.00729990 -30.98544312 +2 20.37969971 -95.03477478 +1 59.32889938 -30.07960320 +Ba nelec 46 +Ba S +2 1.51549995 52.18550110 +2 1.80079997 -166.64633179 +2 2.46210003 336.98910522 +2 3.64910007 -346.60510254 +2 5.34859991 229.66429138 +1 17.15789986 20.49417496 +0 16.02389908 6.64949989 +Ba P +2 1.35119998 61.51347351 +2 1.55149996 -171.17402649 +2 2.06870008 303.61636353 +2 3.00740004 -324.12673950 +2 4.23050022 210.71342468 +1 14.21850014 19.11876488 +0 13.10439968 5.84502220 +Ba D +2 0.89740002 34.92659378 +2 1.02090001 -109.23178864 +2 1.28859997 196.23254395 +2 1.73850000 -224.63766479 +2 2.28509998 146.97143555 +1 7.33769989 37.01747894 +0 20.39410019 7.09744883 +Ba ul +2 0.97610003 -0.88013703 +2 2.66910005 -10.01861763 +2 7.10550022 -35.70346451 +2 24.84989929 -114.57715607 +1 75.09850311 -30.99500656 +La nelec 46 +La S +2 3.30990000 91.93217700 +2 1.65500000 -3.78876400 +La P +2 2.83680000 63.75948600 +2 1.41840000 -0.64795800 +La D +2 2.02130000 36.11617300 +2 1.01070000 0.21911400 +La F +2 4.02860000 -36.01001600 +La ul +2 1.00000000 0.00000000 +Ce nelec 46 +Ce S +2 1.89370130 -255.56238300 +2 1.97914860 307.31392800 +0 10.74296970 10.66990170 +Ce P +0 7.75592980 12.22921090 +2 1.81564130 124.94246600 +2 1.67164720 -84.59998680 +Ce D +2 1.70642050 24.94467550 +0 6.48933740 10.28614400 +Ce ul +1 9.20747930 -15.34875610 +1 1.86730120 -5.84323950 +Pr nelec 46 +Pr S +2 2.12955580 -223.64398200 +2 2.22746550 278.13451500 +0 7.28371960 12.62107180 +Pr P +0 7.80928040 12.52563900 +2 1.92977860 121.95278200 +2 1.76478920 -79.40475120 +Pr D +2 1.79797430 26.19266520 +0 6.54805150 9.87391210 +Pr ul +1 9.82972860 -15.44352190 +1 1.98070080 -5.89611280 +Nd nelec 46 +Nd S +2 2.22478560 -219.47084000 +2 2.34459590 280.52892800 +0 11.85798300 11.76232500 +Nd P +0 9.44306450 11.43612170 +2 1.98949080 120.33535600 +2 1.79805450 -75.78196430 +Nd D +2 1.88855050 27.38307130 +0 6.64366550 9.61741050 +Nd ul +1 10.42978840 -15.48300440 +1 2.09229130 -5.94833370 +Pm nelec 46 +Pm S +2 2.27365310 -215.58178700 +2 2.40502950 277.53404200 +0 10.76473100 11.55980120 +Pm P +0 9.85397700 10.33666460 +2 2.07626890 121.46769300 +2 1.86493490 -74.87664400 +Pm D +2 1.97798220 28.48611040 +0 6.67329420 9.28090580 +Pm ul +1 11.10670840 -15.58675570 +1 2.21044790 -6.01203460 +Sm nelec 46 +Sm S +2 2.37776610 -206.06726600 +2 2.52471590 270.34259800 +0 10.26438980 11.44490440 +Sm P +0 10.00542360 10.69510070 +2 2.20124120 121.87722800 +2 1.96809280 -72.78839030 +Sm D +2 2.06720440 29.52394000 +0 6.71288900 9.03717700 +Sm ul +1 11.75812390 -15.65862010 +1 2.32808300 -6.05376500 +Eu nelec 46 +Eu S +2 2.50382840 -196.63773800 +2 2.66926410 264.10344900 +0 10.36738240 11.80698020 +Eu P +0 10.13418310 10.93806010 +2 2.32439940 128.16026900 +2 2.08337490 -76.51532760 +Eu D +2 2.15438940 30.46732640 +0 6.69379730 8.74122690 +Eu ul +1 12.41835390 -15.72447990 +1 2.44755850 -6.09107370 +Gd nelec 46 +Gd S +2 2.70226520 -137.90212500 +2 2.93347740 208.96454000 +0 9.85080960 16.85869360 +Gd P +0 10.25620120 11.13083430 +2 2.45273210 131.65132100 +2 2.19693430 -77.44430080 +Gd D +2 2.24716920 31.46641180 +0 6.43064370 8.34507290 +Gd ul +1 13.09509080 
-15.78672600 +1 2.56973860 -6.12777810 +Tb nelec 46 +Tb S +2 2.61817350 -140.10322400 +2 2.89041800 215.28993500 +0 17.04897130 11.68123370 +Tb P +0 10.12533140 11.12162130 +2 2.56570380 139.65713100 +2 2.30834840 -83.30982210 +Tb D +2 2.32603330 32.12176790 +0 6.59311800 8.22336260 +Tb ul +1 13.78498800 -15.84080810 +1 2.69386690 -6.16678470 +Dy nelec 46 +Dy S +2 2.73908100 -130.00434800 +2 3.04934710 208.63602400 +0 16.57473300 11.48546660 +Dy P +0 10.69436010 11.62238130 +2 2.74055830 125.41578200 +2 2.41123460 -65.82038280 +Dy D +2 2.40961370 32.79772690 +0 6.49588250 7.98023140 +Dy ul +1 14.49058170 -15.89723700 +1 2.82105130 -6.19939400 +Ho nelec 46 +Ho S +2 2.84745680 -110.58301100 +2 3.22421600 192.70173300 +0 16.15618810 11.35468650 +Ho P +0 9.92608330 10.84274330 +2 2.86230720 116.52006000 +2 2.47244920 -55.66109800 +Ho D +2 2.49115510 33.34249060 +0 6.35100910 7.73004920 +Ho ul +1 15.21414080 -15.95165880 +1 2.95124920 -6.23162570 +Er nelec 46 +Er S +2 3.03618530 -150.36700200 +2 3.35253480 236.19775600 +0 16.03924000 11.41646110 +Er P +0 11.04634630 11.89253580 +2 3.01799110 130.54350100 +2 2.64727180 -65.69229880 +Er D +2 2.57008760 33.73180760 +0 6.16564070 7.48868910 +Er ul +1 15.95965310 -16.00435820 +1 3.08505820 -6.26509470 +Tm nelec 46 +Tm S +2 3.18338300 -143.24595300 +2 3.53531470 233.19553800 +0 16.16013920 11.61094440 +Tm P +0 2.46378240 7.39619030 +2 2.72194360 119.57242900 +2 2.49552880 -84.01744530 +Tm D +2 2.64361640 33.84116700 +0 5.89426350 7.23350090 +Tm ul +1 16.72435540 -16.05536170 +1 3.22216200 -6.29832940 +Yb nelec 46 +Yb S +2 3.29988440 -106.50639900 +2 3.77177090 201.14955200 +0 16.63449170 12.01001670 +Yb P +0 2.65544200 7.59659140 +2 2.83854580 119.64934700 +2 2.59880230 -82.70469640 +Yb D +2 2.71026500 34.17132690 +0 6.26424190 7.27486230 +Yb ul +1 17.51231900 -16.10497510 +1 3.36313060 -6.33277960 +Lu nelec 46 +Lu S +2 3.34224800 -71.31717050 +2 4.00509410 169.40139300 +0 16.18095210 11.88611540 +Lu P +0 2.72110670 7.71934060 +2 2.98200450 116.10736300 +2 2.71480080 -78.17862840 +Lu D +2 2.77571170 33.91192520 +0 5.89380130 7.01871010 +Lu ul +1 18.32043650 -16.15200380 +1 3.50797850 -6.36766820 +Hf nelec 60 +Hf S +2 14.76995900 1499.28471100 +2 7.38497900 40.28210100 +Hf P +2 9.84949000 397.73300500 +2 4.92474500 19.31640600 +Hf D +2 6.09675600 101.32980500 +2 3.04837800 5.87343800 +Hf F +2 1.78577000 10.04672300 +Hf G +2 2.63240000 -9.55824400 +Hf ul +2 1.00000000 0.00000000 +Ta nelec 60 +Ta S +2 14.54640800 1345.88064700 +2 7.27320400 36.76680600 +Ta P +2 9.93556500 378.42530100 +2 4.96778200 22.29309100 +Ta D +2 6.34737700 104.88395600 +2 3.17368800 8.75584800 +Ta F +2 2.01788100 12.01796100 +Ta G +2 3.04033000 -11.72893300 +Ta ul +2 1.00000000 0.00000000 +W nelec 60 +W S +2 14.32285600 1192.39588200 +2 7.16142800 32.52293300 +W P +2 10.02164100 359.03196700 +2 5.01082000 24.03038000 +W D +2 6.59799700 108.30134900 +2 3.29899900 10.98252800 +W F +2 2.25888800 14.15257900 +W G +2 3.46411000 -14.05643500 +W ul +2 1.00000000 0.00000000 +Re nelec 60 +Re S +2 14.09930500 1038.95157200 +2 7.04965300 29.56173800 +Re P +2 10.10771700 339.54351000 +2 5.05385800 24.91369600 +Re D +2 6.84861800 111.69965300 +2 3.42430900 12.62432900 +Re F +2 2.50865100 16.44985200 +Re G +2 3.90124500 -16.50112000 +Re ul +2 1.00000000 0.00000000 +Os nelec 60 +Os S +2 13.87575400 885.40571900 +2 6.93787700 25.96704000 +Os P +2 10.19379300 320.08390200 +2 5.09689600 26.14876500 +Os D +2 7.09923800 115.04484300 +2 3.54961900 13.62257500 +Os F +2 2.76707500 18.90945700 +Os G +2 
4.34990500 -19.02759500 +Os ul +2 1.00000000 0.00000000 +Ir nelec 60 +Ir S +2 13.65220300 732.26920000 +2 6.82610100 26.48472100 +Ir P +2 10.27986800 299.48947400 +2 5.13993400 26.46623400 +Ir D +2 7.34985900 124.45759500 +2 3.67492900 14.03599500 +Ir F +2 3.03407200 21.53103100 +Ir G +2 4.80885700 -21.60759700 +Ir ul +2 1.00000000 0.00000000 +Pt nelec 60 +Pt S +2 13.42865100 579.22386100 +2 6.71432600 29.66949100 +Pt P +2 10.36594400 280.86077400 +2 5.18297200 26.74538200 +Pt D +2 7.60047900 120.39644400 +2 3.80024000 15.81092100 +Pt F +2 3.30956900 24.31437600 +Pt G +2 5.27728900 -24.21867500 +Pt ul +2 1.00000000 0.00000000 +Au nelec 60 +Au S +2 13.20510000 426.84667900 +2 6.60255000 37.00708300 +Au P +2 10.45202000 261.19958000 +2 5.22601000 26.96249600 +Au D +2 7.85110000 124.79066600 +2 3.92555000 16.30072600 +Au F +2 4.78982000 30.49008900 +2 2.39491000 5.17107400 +Au ul +2 1.00000000 0.00000000 +Hg nelec 60 +Hg S +2 12.98154900 275.73721200 +2 6.49077400 49.08921200 +Hg P +2 10.53809600 241.54007400 +2 5.26904800 27.39659100 +Hg D +2 8.10172100 127.86700800 +2 4.05086000 16.60831200 +Hg F +2 3.88579100 30.36499600 +Hg G +2 6.24095500 -29.47311800 +Hg ul +2 1.00000000 0.00000000 +Tl nelec 78 +Tl S +2 0.32623800 -1.01649800 +2 1.97754100 51.70795900 +2 10.00000000 73.18668300 +Tl P +2 0.54306300 -2.96267300 +2 1.03214000 19.73043100 +Tl D +2 0.35481700 2.77269000 +2 0.70963300 -3.97943900 +Tl F +2 0.68915600 -4.42678600 +Tl G +2 0.82061700 -12.27054000 +Tl ul +2 1.00000000 0.00000000 +Pb nelec 78 +Pb S +2 0.52916100 -1.87334200 +2 1.45672700 20.86079700 +2 9.99991100 97.58795500 +Pb P +2 0.67811900 -7.76820900 +2 1.24901300 51.71925400 +Pb D +2 0.30744600 1.30076000 +2 0.74493000 2.64082200 +Pb F +2 0.84869900 -5.70605600 +Pb G +2 0.99994100 -7.48418400 +Pb ul +2 1.00000000 0.00000000 +Bi nelec 78 +Bi S +2 0.16115200 -0.16198800 +2 1.50983500 14.03169000 +2 10.00000000 122.04740100 +Bi P +2 0.76049000 -6.18852600 +2 1.42641500 51.04586800 +Bi D +2 0.78022600 20.53580400 +2 0.26007500 -0.13619600 +Bi F +2 0.97360800 -6.41422600 +Bi G +2 1.08819500 -6.65606400 +Bi ul +2 1.00000000 0.00000000 +Po nelec 78 +Po S +2 0.92238600 -4.15930400 +2 1.78191500 33.83035400 +2 10.00000000 146.33910100 +Po P +2 0.72429100 -4.12531100 +2 1.36386000 35.00707800 +Po D +2 0.47697900 1.20651800 +2 0.95395700 13.35612500 +Po F +2 1.07545400 -6.77517400 +Po G +2 1.12209600 -5.51544100 +Po ul +2 1.00000000 0.00000000 +At nelec 78 +At S +2 0.92238600 -5.52846100 +2 1.78191500 39.56886900 +2 10.00000000 170.71138600 +At P +2 0.72429100 -2.29538700 +2 1.36386000 25.49292000 +At D +2 0.63597200 4.86510700 +2 1.27194300 14.57941300 +At F +2 1.15410800 -6.85786700 +At G +2 1.34511500 -7.61303900 +At ul +2 1.00000000 0.00000000 +Rn nelec 78 +Rn S +2 0.92238600 -5.01900500 +2 1.78191500 37.03679000 +2 10.80460100 195.10330800 +Rn P +2 0.72429100 -1.96648100 +2 1.36386000 23.46405900 +Rn D +2 0.76940000 7.48345700 +2 1.53880000 9.36190000 +Rn F +2 1.21389700 -6.76315000 +Rn G +2 1.57646900 -9.91566200 +Rn ul +2 1.00000000 0.00000000 +END diff --git a/gpu4pyscf/drivers/h2o.xyz b/gpu4pyscf/drivers/h2o.xyz index 8c50538d..6072e217 100644 --- a/gpu4pyscf/drivers/h2o.xyz +++ b/gpu4pyscf/drivers/h2o.xyz @@ -1,5 +1,5 @@ 3 -O 99.814000000 100.835000000 101.232000000 -H 99.329200000 99.976800000 101.063000000 -H 99.151600000 101.561000000 101.414000000 +O 0.0000000000 -0.0000000000 0.1174000000 +H -0.7570000000 -0.0000000000 -0.4696000000 +H 0.7570000000 0.0000000000 -0.4696000000 diff --git 
a/gpu4pyscf/drivers/opt_3c_driver.py b/gpu4pyscf/drivers/opt_3c_driver.py new file mode 100644 index 00000000..20a97aa7 --- /dev/null +++ b/gpu4pyscf/drivers/opt_3c_driver.py @@ -0,0 +1,182 @@ +# Copyright 2021-2024 The PySCF Developers. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +################################################################### +# This is a customized driver for three composite methods only +# It only works for b97-3c, r2scan-3c, and wb97x-3c +################################################################### + +import os +import json +import pyscf +import argparse +import tempfile +import shutil +import h5py +from types import MethodType +from pyscf import lib, gto +from pyscf import dft, scf +from pyscf.geomopt.geometric_solver import kernel + +from gpu4pyscf.drivers.dft_3c_driver import ( + parse_3c, gen_disp_fun, gen_disp_grad_fun) + +def opt_mol(mol_name, config, constraints, charge=None, spin=0): + xc = config.get('xc', 'b3lyp') + verbose = config.get('verbose', 4) + scf_conv_tol = config.get('scf_conv_tol', 1e-10) + with_df = config.get('with_df', True) + auxbasis = config.get('auxbasis', None) + with_gpu = config.get('with_gpu', True) + with_solvent = config.get('with_solvent', False) + maxsteps = config.get('maxsteps', 50) + convergence_set = config.get('convergence_set', 'GAU') + + default_solvent = {'method': 'iefpcm', 'eps': 78.3553, 'solvent': 'water'} + solvent = config.get('solvent', default_solvent) + + # I/O + fp = tempfile.TemporaryDirectory() + local_dir = f'{fp.name}/' + logfile = f'{mol_name[:-4]}_pyscf.log' + + shutil.copyfile(config['input_dir']+mol_name, local_dir+mol_name) + if constraints is not None: + shutil.copyfile(config['input_dir']+constraints, local_dir+constraints) + + pyscf_xc, nlc, basis, ecp, (xc_disp, disp), xc_gcp = parse_3c(xc) + + lib.num_threads(8) + mol = pyscf.M( + atom=local_dir+mol_name, + basis=basis, + ecp=ecp, + max_memory=32000, + verbose=verbose, + charge=charge, + spin=spin, + output=f'{local_dir}/{logfile}') + mol.build() + + mf = dft.KS(mol, xc=pyscf_xc) + mf.grids.atom_grid = (99,590) + if mf._numint.libxc.is_nlc(mf.xc): + mf.nlcgrids.atom_grid = (50,194) + mf.disp = disp + if with_df: + pyscf_auxbasis = auxbasis + if auxbasis == "RIJK-def2-tzvp": + pyscf_auxbasis = 'def2-tzvp-jkfit' + mf = mf.density_fit(auxbasis=pyscf_auxbasis) + if with_gpu: + mf = mf.to_gpu() + + #### Changes for 3C methods ##### + # Set up dispersion correction and GCP + mf.nlc = nlc + mf.get_dispersion = MethodType(gen_disp_fun(xc_disp, xc_gcp), mf) + mf.do_disp = lambda: True + ################################# + + mf.chkfile = None + + if with_solvent: + if solvent['method'].endswith(('PCM', 'pcm')): + mf = mf.PCM() + mf.with_solvent.lebedev_order = 29 + mf.with_solvent.method = solvent['method'].replace('PCM','-PCM') + mf.with_solvent.eps = solvent['eps'] + elif solvent['method'].endswith(('smd', 'SMD')): + mf = mf.SMD() +
mf.with_solvent.lebedev_order = 29 + mf.with_solvent.method = 'SMD' + mf.with_solvent.solvent = solvent['solvent'] + else: + raise NotImplementedError + + mf.direct_scf_tol = 1e-14 + mf.chkfile = None + mf.conv_tol = scf_conv_tol + + history = [] + def callback(envs): + result = { + 'energy': envs['energy'], + 'gradients': envs['gradients'], + 'coords': envs['coords'].tolist(), + 'e1': mf.scf_summary.get('e1', 0.0), + 'e_coul': mf.scf_summary.get('coul', 0.0), + 'e_xc': mf.scf_summary.get('exc', 0.0), + 'e_disp': mf.scf_summary.get('dispersion', 0.0) + } + history.append(result) + + grad_scanner = mf.nuc_grad_method().as_scanner() + get_disp = gen_disp_grad_fun(xc_disp, xc_gcp) + grad_scanner.get_dispersion = MethodType(get_disp, grad_scanner) + + geometric_log = f'{mol_name[:-4]}_geometric.log' + import sys + # PySCF forwards geometric log to sys.stderr + with open(f'{local_dir}/{geometric_log}', 'w') as log_file: + sys.stderr = log_file + conv, mol_eq = kernel( + grad_scanner, + maxsteps=maxsteps, + callback=callback, + convergence_set=convergence_set, + constraints=constraints) + sys.stderr = sys.__stderr__ + + # copy the files to destination folder + output_dir = config['output_dir'] + isExist = os.path.exists(output_dir) + if not isExist: + os.makedirs(output_dir) + optimized_xyz = f'{mol_name[:-4]}_opt.xyz' + hist_file = f'{mol_name[:-4]}_hist.h5' + mol_eq.tofile(f'{local_dir}/{optimized_xyz}', format='xyz') + + with h5py.File(f'{local_dir}/{hist_file}', 'w') as h5f: + #json.dump(history, f) + for step, info in enumerate(history): + group = h5f.create_group(f'step_{step}') + for key, array in info.items(): + group.create_dataset(key, data=array) + + shutil.copyfile(f'{local_dir}/{optimized_xyz}', f'{output_dir}/{optimized_xyz}') + shutil.copyfile(f'{local_dir}/{hist_file}', f'{output_dir}/{hist_file}') + shutil.copyfile(f'{local_dir}/{logfile}', f'{output_dir}/{logfile}') + shutil.copyfile(f'{local_dir}/{geometric_log}', f'{output_dir}/{geometric_log}') + if conv: + with open(f'{output_dir}/{mol_name[:-4]}_success.txt', 'w') as file: + file.write("Geometry optimization converged\n") + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='Run DFT with GPU4PySCF for molecules') + parser.add_argument("--config", type=str, default='example.json') + parser.add_argument("--charge", type=int, default=None) + parser.add_argument("--spin", type=int, default=0) + args = parser.parse_args() + + with open(args.config) as f: + config = json.load(f) + if isinstance(config, list): + config = config[0] + for i, mol_name in enumerate(config['molecules']): + constraints = None + if 'constraints' in config and config['constraints']: + assert len(config['constraints']) == len(config['molecules']) + constraints = config['constraints'][i] + opt_mol(mol_name, config, constraints, charge=args.charge, spin=args.spin) diff --git a/gpu4pyscf/drivers/opt_b973c_sample.json b/gpu4pyscf/drivers/opt_b973c_sample.json new file mode 100644 index 00000000..c1e12ef3 --- /dev/null +++ b/gpu4pyscf/drivers/opt_b973c_sample.json @@ -0,0 +1,22 @@ +[{ + "input_dir": "./", + "output_dir": "./", + "molecules": [ + "ethane.xyz", + "ethane.xyz" + ], + "constraints": [ + "constraints.txt", + "constraints.txt" + ], + "xc": "b973c", + "auxbasis": "def2-universal-JFIT", + "verbose": 4, + "with_solvent": false, + "solvent": { + "eps": 78.3553, + "method": "CPCM" + }, + "with_gpu": true, + "with_df": true +}] diff --git a/gpu4pyscf/drivers/opt_r2scan3c_sample.json b/gpu4pyscf/drivers/opt_r2scan3c_sample.json new 
file mode 100644 index 00000000..d793f65a --- /dev/null +++ b/gpu4pyscf/drivers/opt_r2scan3c_sample.json @@ -0,0 +1,22 @@ +[{ + "input_dir": "./", + "output_dir": "./", + "molecules": [ + "ethane.xyz", + "ethane.xyz" + ], + "constraints": [ + "constraints.txt", + "constraints.txt" + ], + "xc": "r2scan3c", + "auxbasis": "def2-universal-JFIT", + "verbose": 4, + "with_solvent": false, + "solvent": { + "eps": 78.3553, + "method": "CPCM" + }, + "with_gpu": true, + "with_df": true +}] diff --git a/gpu4pyscf/drivers/opt_wb97x3c_sample.json b/gpu4pyscf/drivers/opt_wb97x3c_sample.json new file mode 100644 index 00000000..4f4ecb5b --- /dev/null +++ b/gpu4pyscf/drivers/opt_wb97x3c_sample.json @@ -0,0 +1,21 @@ +[{ + "input_dir": "./", + "output_dir": "./", + "molecules": [ + "ethane.xyz", + "ethane.xyz" + ], + "constraints": [ + "constraints.txt", + "constraints.txt" + ], + "xc": "wb97x3c", + "verbose": 4, + "with_solvent": false, + "solvent": { + "eps": 78.3553, + "method": "CPCM" + }, + "with_gpu": true, + "with_df": true +}] diff --git a/gpu4pyscf/grad/rhf.py b/gpu4pyscf/grad/rhf.py index dd374cc3..94399043 100644 --- a/gpu4pyscf/grad/rhf.py +++ b/gpu4pyscf/grad/rhf.py @@ -27,7 +27,7 @@ from gpu4pyscf.scf.hf import KohnShamDFT from gpu4pyscf.lib.cupy_helper import tag_array, contract, condense, sandwich_dot, reduce_to_device from gpu4pyscf.__config__ import props as gpu_specs -from gpu4pyscf.__config__ import _streams, _num_devices +from gpu4pyscf.__config__ import _streams, num_devices from gpu4pyscf.df import int3c2e #TODO: move int3c2e to out of df from gpu4pyscf.lib import logger from gpu4pyscf.scf import jk @@ -127,7 +127,7 @@ def _jk_energy_per_atom(mol, dm, vhfopt=None, if vhfopt is None: # Small group size for load balance group_size = None - if _num_devices > 1: + if num_devices > 1: group_size = jk.GROUP_SIZE vhfopt = _VHFOpt(mol).build(group_size=group_size) @@ -156,13 +156,13 @@ def _jk_energy_per_atom(mol, dm, vhfopt=None, tasks.append((i,j,k,l)) tasks = np.array(tasks) task_list = [] - for device_id in range(_num_devices): - task_list.append(tasks[device_id::_num_devices]) + for device_id in range(num_devices): + task_list.append(tasks[device_id::num_devices]) cp.cuda.get_current_stream().synchronize() futures = [] - with ThreadPoolExecutor(max_workers=_num_devices) as executor: - for device_id in range(_num_devices): + with ThreadPoolExecutor(max_workers=num_devices) as executor: + for device_id in range(num_devices): future = executor.submit( _ejk_ip1_task, mol, dms, vhfopt, task_list[device_id], diff --git a/gpu4pyscf/grad/rks.py b/gpu4pyscf/grad/rks.py index e0d535a4..25f43ef1 100644 --- a/gpu4pyscf/grad/rks.py +++ b/gpu4pyscf/grad/rks.py @@ -28,7 +28,7 @@ from gpu4pyscf.lib.cupy_helper import ( contract, get_avail_mem, add_sparse, tag_array, sandwich_dot, reduce_to_device) from gpu4pyscf.lib import logger -from gpu4pyscf.__config__ import _streams, _num_devices +from gpu4pyscf.__config__ import _streams, num_devices from pyscf import __config__ MIN_BLK_SIZE = getattr(__config__, 'min_grid_blksize', 128*128) @@ -223,8 +223,8 @@ def get_vxc(ni, mol, grids, xc_code, dms, relativity=0, hermi=1, futures = [] cupy.cuda.get_current_stream().synchronize() - with ThreadPoolExecutor(max_workers=_num_devices) as executor: - for device_id in range(_num_devices): + with ThreadPoolExecutor(max_workers=num_devices) as executor: + for device_id in range(num_devices): future = executor.submit( _get_vxc_task, ni, mol, grids, xc_code, dms, mo_coeff, mo_occ, diff --git a/gpu4pyscf/grad/uks.py 
b/gpu4pyscf/grad/uks.py index 90582d73..50e8fd05 100644 --- a/gpu4pyscf/grad/uks.py +++ b/gpu4pyscf/grad/uks.py @@ -29,7 +29,7 @@ from gpu4pyscf.lib.cupy_helper import ( contract, get_avail_mem, add_sparse, tag_array, reduce_to_device) from gpu4pyscf.lib import logger -from gpu4pyscf.__config__ import _streams, _num_devices +from gpu4pyscf.__config__ import _streams, num_devices from gpu4pyscf import __config__ MIN_BLK_SIZE = getattr(__config__, 'min_grid_blksize', 128*128) @@ -230,8 +230,8 @@ def get_vxc(ni, mol, grids, xc_code, dms, relativity=0, hermi=1, futures = [] cupy.cuda.get_current_stream().synchronize() - with ThreadPoolExecutor(max_workers=_num_devices) as executor: - for device_id in range(_num_devices): + with ThreadPoolExecutor(max_workers=num_devices) as executor: + for device_id in range(num_devices): future = executor.submit( _get_vxc_task, ni, mol, grids, xc_code, dms, mo_coeff, mo_occ, diff --git a/gpu4pyscf/gto/int3c1e.py b/gpu4pyscf/gto/int3c1e.py index 8e6ce88c..e445c458 100644 --- a/gpu4pyscf/gto/int3c1e.py +++ b/gpu4pyscf/gto/int3c1e.py @@ -24,7 +24,7 @@ from gpu4pyscf.scf.int4c2e import BasisProdCache from gpu4pyscf.df.int3c2e import sort_mol, _split_l_ctr_groups, get_pairing from gpu4pyscf.gto.mole import basis_seg_contraction -from gpu4pyscf.__config__ import _num_devices, _streams +from gpu4pyscf.__config__ import num_devices, _streams BLKSIZE = 128 @@ -132,7 +132,7 @@ def get_n_hermite_density_of_angular_pair(l): self.density_offset = np.append(0, np.cumsum(n_density_per_angular_pair)).astype(np.int32) self._bpcache = {} - for n in range(_num_devices): + for n in range(num_devices): with cp.cuda.Device(n), _streams[n]: bpcache = ctypes.POINTER(BasisProdCache)() scale_shellpair_diag = 1.0 diff --git a/gpu4pyscf/gto/int3c1e_ipip.py b/gpu4pyscf/gto/int3c1e_ipip.py new file mode 100644 index 00000000..b86abf46 --- /dev/null +++ b/gpu4pyscf/gto/int3c1e_ipip.py @@ -0,0 +1,410 @@ +# Copyright 2021-2024 The PySCF Developers. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import ctypes +import cupy as cp +import numpy as np +from pyscf import lib +from pyscf.gto import ATOM_OF +from pyscf.lib import c_null_ptr +from gpu4pyscf.lib.cupy_helper import load_library, cart2sph, get_avail_mem +from gpu4pyscf.gto.int3c1e import VHFOpt + +libgint = load_library('libgint') + +def get_int3c1e_ipip1_charge_contracted(mol, grids, charge_exponents, charges, intopt): + omega = mol.omega + assert omega >= 0.0, "Short-range one-electron integrals with GPU acceleration are not implemented."
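+ # Both nabla operators act on the bra AO center (the dAdA integrals in the tests). + # The result is a (3, 3, nao, nao) tensor contracted with the grid-point charges; + # the kernel fills only the upper triangle of the 3x3 Cartesian block, and the + # symmetric lower triangle is copied in after the kernel call.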
+ + grids = cp.asarray(grids, order='C') + if charge_exponents is not None: + charge_exponents = cp.asarray(charge_exponents, order='C') + + assert charges.ndim == 1 and charges.shape[0] == grids.shape[0] + charges = cp.asarray(charges).astype(np.float64) + + charges = charges.reshape([-1, 1], order='C') + grids = cp.concatenate([grids, charges], axis=1) + + int1e_charge_contracted = cp.empty([3, 3, mol.nao, mol.nao], order='C') + for cp_ij_id, _ in enumerate(intopt.log_qs): + cpi = intopt.cp_idx[cp_ij_id] + cpj = intopt.cp_jdx[cp_ij_id] + li = intopt.angular[cpi] + lj = intopt.angular[cpj] + + stream = cp.cuda.get_current_stream() + + log_q_ij = intopt.log_qs[cp_ij_id] + + nbins = 1 + bins_locs_ij = np.array([0, len(log_q_ij)], dtype=np.int32) + + i0, i1 = intopt.cart_ao_loc[cpi], intopt.cart_ao_loc[cpi+1] + j0, j1 = intopt.cart_ao_loc[cpj], intopt.cart_ao_loc[cpj+1] + ni = i1 - i0 + nj = j1 - j0 + + ao_offsets = np.array([i0, j0], dtype=np.int32) + strides = np.array([ni, ni*nj], dtype=np.int32) + + charge_exponents_pointer = c_null_ptr() + if charge_exponents is not None: + charge_exponents_pointer = charge_exponents.data.ptr + + ngrids = grids.shape[0] + # n_charge_sum_per_thread = 1 # means every thread processes one pair and one grid + # n_charge_sum_per_thread = ngrids # or a larger number guarantees one thread processes one pair and all grid points + n_charge_sum_per_thread = 100 # This number roughly optimizes kernel performance on a large system + + int1e_angular_slice = cp.zeros([3, 3, j1-j0, i1-i0], order='C') + + err = libgint.GINTfill_int3c1e_ipip1_charge_contracted( + ctypes.cast(stream.ptr, ctypes.c_void_p), + intopt.bpcache, + ctypes.cast(grids.data.ptr, ctypes.c_void_p), + ctypes.cast(charge_exponents_pointer, ctypes.c_void_p), + ctypes.c_int(ngrids), + ctypes.cast(int1e_angular_slice.data.ptr, ctypes.c_void_p), + strides.ctypes.data_as(ctypes.c_void_p), + ao_offsets.ctypes.data_as(ctypes.c_void_p), + bins_locs_ij.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(nbins), + ctypes.c_int(cp_ij_id), + ctypes.c_double(omega), + ctypes.c_int(n_charge_sum_per_thread)) + + if err != 0: + raise RuntimeError('GINTfill_int3c1e_ipip1_charge_contracted failed') + + int1e_angular_slice[1,0] = int1e_angular_slice[0,1] + int1e_angular_slice[2,0] = int1e_angular_slice[0,2] + int1e_angular_slice[2,1] = int1e_angular_slice[1,2] + + i0, i1 = intopt.ao_loc[cpi], intopt.ao_loc[cpi+1] + j0, j1 = intopt.ao_loc[cpj], intopt.ao_loc[cpj+1] + if not mol.cart: + int1e_angular_slice = cart2sph(int1e_angular_slice, axis=2, ang=lj) + int1e_angular_slice = cart2sph(int1e_angular_slice, axis=3, ang=li) + + int1e_charge_contracted[:, :, i0:i1, j0:j1] = int1e_angular_slice.transpose(0,1,3,2) + + return intopt.unsort_orbitals(int1e_charge_contracted, axis=[2,3]) + +def get_int3c1e_ipvip1_charge_contracted(mol, grids, charge_exponents, charges, intopt): + omega = mol.omega + assert omega >= 0.0, "Short-range one-electron integrals with GPU acceleration are not implemented."
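+ # One nabla on the bra center and one on the ket center (the dAdB integrals in the + # tests). Unlike ipip1 above, the 3x3 block couples two different centers and is + # not symmetric, so no post-kernel symmetrization is performed.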
+ + grids = cp.asarray(grids, order='C') + if charge_exponents is not None: + charge_exponents = cp.asarray(charge_exponents, order='C') + + assert charges.ndim == 1 and charges.shape[0] == grids.shape[0] + charges = cp.asarray(charges).astype(np.float64) + + charges = charges.reshape([-1, 1], order='C') + grids = cp.concatenate([grids, charges], axis=1) + + int1e_charge_contracted = cp.empty([3, 3, mol.nao, mol.nao], order='C') + for cp_ij_id, _ in enumerate(intopt.log_qs): + cpi = intopt.cp_idx[cp_ij_id] + cpj = intopt.cp_jdx[cp_ij_id] + li = intopt.angular[cpi] + lj = intopt.angular[cpj] + + stream = cp.cuda.get_current_stream() + + log_q_ij = intopt.log_qs[cp_ij_id] + + nbins = 1 + bins_locs_ij = np.array([0, len(log_q_ij)], dtype=np.int32) + + i0, i1 = intopt.cart_ao_loc[cpi], intopt.cart_ao_loc[cpi+1] + j0, j1 = intopt.cart_ao_loc[cpj], intopt.cart_ao_loc[cpj+1] + ni = i1 - i0 + nj = j1 - j0 + + ao_offsets = np.array([i0, j0], dtype=np.int32) + strides = np.array([ni, ni*nj], dtype=np.int32) + + charge_exponents_pointer = c_null_ptr() + if charge_exponents is not None: + charge_exponents_pointer = charge_exponents.data.ptr + + ngrids = grids.shape[0] + # n_charge_sum_per_thread = 1 # means every thread processes one pair and one grid + # n_charge_sum_per_thread = ngrids # or a larger number guarantees one thread processes one pair and all grid points + n_charge_sum_per_thread = 100 # This number roughly optimizes kernel performance on a large system + + int1e_angular_slice = cp.zeros([3, 3, j1-j0, i1-i0], order='C') + + err = libgint.GINTfill_int3c1e_ipvip1_charge_contracted( + ctypes.cast(stream.ptr, ctypes.c_void_p), + intopt.bpcache, + ctypes.cast(grids.data.ptr, ctypes.c_void_p), + ctypes.cast(charge_exponents_pointer, ctypes.c_void_p), + ctypes.c_int(ngrids), + ctypes.cast(int1e_angular_slice.data.ptr, ctypes.c_void_p), + strides.ctypes.data_as(ctypes.c_void_p), + ao_offsets.ctypes.data_as(ctypes.c_void_p), + bins_locs_ij.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(nbins), + ctypes.c_int(cp_ij_id), + ctypes.c_double(omega), + ctypes.c_int(n_charge_sum_per_thread)) + + if err != 0: + raise RuntimeError('GINTfill_int3c1e_ipvip1_charge_contracted failed') + + i0, i1 = intopt.ao_loc[cpi], intopt.ao_loc[cpi+1] + j0, j1 = intopt.ao_loc[cpj], intopt.ao_loc[cpj+1] + if not mol.cart: + int1e_angular_slice = cart2sph(int1e_angular_slice, axis=2, ang=lj) + int1e_angular_slice = cart2sph(int1e_angular_slice, axis=3, ang=li) + + int1e_charge_contracted[:, :, i0:i1, j0:j1] = int1e_angular_slice.transpose(0,1,3,2) + + return intopt.unsort_orbitals(int1e_charge_contracted, axis=[2,3]) + +def get_int3c1e_ip1ip2_charge_contracted(mol, grids, charge_exponents, charges, intopt): + omega = mol.omega + assert omega >= 0.0, "Short-range one-electron integrals with GPU acceleration are not implemented."
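+ # One nabla on the bra AO center and one on the grid-point (charge) center (the + # dAdC integrals in the tests). As with ipvip1, the 3x3 block is not symmetrized.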
+ + grids = cp.asarray(grids, order='C') + if charge_exponents is not None: + charge_exponents = cp.asarray(charge_exponents, order='C') + + assert charges.ndim == 1 and charges.shape[0] == grids.shape[0] + charges = cp.asarray(charges).astype(np.float64) + + charges = charges.reshape([-1, 1], order='C') + grids = cp.concatenate([grids, charges], axis=1) + + int1e_charge_contracted = cp.empty([3, 3, mol.nao, mol.nao], order='C') + for cp_ij_id, _ in enumerate(intopt.log_qs): + cpi = intopt.cp_idx[cp_ij_id] + cpj = intopt.cp_jdx[cp_ij_id] + li = intopt.angular[cpi] + lj = intopt.angular[cpj] + + stream = cp.cuda.get_current_stream() + + log_q_ij = intopt.log_qs[cp_ij_id] + + nbins = 1 + bins_locs_ij = np.array([0, len(log_q_ij)], dtype=np.int32) + + i0, i1 = intopt.cart_ao_loc[cpi], intopt.cart_ao_loc[cpi+1] + j0, j1 = intopt.cart_ao_loc[cpj], intopt.cart_ao_loc[cpj+1] + ni = i1 - i0 + nj = j1 - j0 + + ao_offsets = np.array([i0, j0], dtype=np.int32) + strides = np.array([ni, ni*nj], dtype=np.int32) + + charge_exponents_pointer = c_null_ptr() + if charge_exponents is not None: + charge_exponents_pointer = charge_exponents.data.ptr + + ngrids = grids.shape[0] + # n_charge_sum_per_thread = 1 # means every thread processes one pair and one grid + # n_charge_sum_per_thread = ngrids # or a larger number guarantees one thread processes one pair and all grid points + n_charge_sum_per_thread = 100 # This number roughly optimizes kernel performance on a large system + + int1e_angular_slice = cp.zeros([3, 3, j1-j0, i1-i0], order='C') + + err = libgint.GINTfill_int3c1e_ip1ip2_charge_contracted( + ctypes.cast(stream.ptr, ctypes.c_void_p), + intopt.bpcache, + ctypes.cast(grids.data.ptr, ctypes.c_void_p), + ctypes.cast(charge_exponents_pointer, ctypes.c_void_p), + ctypes.c_int(ngrids), + ctypes.cast(int1e_angular_slice.data.ptr, ctypes.c_void_p), + strides.ctypes.data_as(ctypes.c_void_p), + ao_offsets.ctypes.data_as(ctypes.c_void_p), + bins_locs_ij.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(nbins), + ctypes.c_int(cp_ij_id), + ctypes.c_double(omega), + ctypes.c_int(n_charge_sum_per_thread)) + + if err != 0: + raise RuntimeError('GINTfill_int3c1e_ip1ip2_charge_contracted failed') + + i0, i1 = intopt.ao_loc[cpi], intopt.ao_loc[cpi+1] + j0, j1 = intopt.ao_loc[cpj], intopt.ao_loc[cpj+1] + if not mol.cart: + int1e_angular_slice = cart2sph(int1e_angular_slice, axis=2, ang=lj) + int1e_angular_slice = cart2sph(int1e_angular_slice, axis=3, ang=li) + + int1e_charge_contracted[:, :, i0:i1, j0:j1] = int1e_angular_slice.transpose(0,1,3,2) + + return intopt.unsort_orbitals(int1e_charge_contracted, axis=[2,3]) + +def get_int3c1e_ipip2_density_contracted(mol, grids, charge_exponents, dm, intopt): + omega = mol.omega + assert omega >= 0.0, "Short-range one-electron integrals with GPU acceleration are not implemented."
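+ # Both nabla operators act on the grid-point center (the dCdC integrals in the + # tests). In contrast to the charge-contracted routines above, the integrals are + # contracted with an AO density matrix, giving one 3x3 block per grid point with + # result shape (3, 3, ngrids).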
+ + nao_cart = intopt._sorted_mol.nao + ngrids = grids.shape[0] + + grids = cp.asarray(grids, order='C') + if charge_exponents is not None: + charge_exponents = cp.asarray(charge_exponents, order='C') + + dm = cp.asarray(dm) + assert dm.ndim == 2 + assert dm.shape[0] == dm.shape[1] and dm.shape[0] == mol.nao + + dm = intopt.sort_orbitals(dm, [0,1]) + if not mol.cart: + cart2sph_transformation_matrix = intopt.cart2sph + # TODO: This part is inefficient (O(N^3)); it should be changed to an O(N^2) algorithm + dm = cart2sph_transformation_matrix @ dm @ cart2sph_transformation_matrix.T + dm = dm.flatten(order='F') # Column major order matches (i + j * n_ao) access pattern in the C function + + dm = cp.asnumpy(dm) + + ao_loc_sorted_order = intopt._sorted_mol.ao_loc_nr(cart = True) + l_ij = intopt.l_ij.T.flatten() + bas_coords = intopt._sorted_mol.atom_coords()[intopt._sorted_mol._bas[:, ATOM_OF]].flatten() + + n_total_hermite_density = intopt.density_offset[-1] + dm_pair_ordered = np.empty(n_total_hermite_density) + libgint.GINTinit_J_density_rys_preprocess(dm.ctypes.data_as(ctypes.c_void_p), + dm_pair_ordered.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(1), ctypes.c_int(nao_cart), + ctypes.c_int(len(intopt.bas_pairs_locs) - 1), + intopt.bas_pair2shls.ctypes.data_as(ctypes.c_void_p), + intopt.bas_pairs_locs.ctypes.data_as(ctypes.c_void_p), + l_ij.ctypes.data_as(ctypes.c_void_p), + intopt.density_offset.ctypes.data_as(ctypes.c_void_p), + ao_loc_sorted_order.ctypes.data_as(ctypes.c_void_p), + bas_coords.ctypes.data_as(ctypes.c_void_p), + ctypes.c_bool(False)) + + dm_pair_ordered = cp.asarray(dm_pair_ordered) + + n_threads_per_block_1d = 16 + n_max_blocks_per_grid_1d = 65535 + n_max_threads_1d = n_threads_per_block_1d * n_max_blocks_per_grid_1d + n_grid_split = int(np.ceil(ngrids / n_max_threads_1d)) + if (n_grid_split > 100): + print(f"Grid dimension = {ngrids} is too large, more than 100 kernels for one-electron integrals will be launched.") + ngrids_per_split = (ngrids + n_grid_split - 1) // n_grid_split + + int3c_density_contracted = cp.zeros([3, 3, ngrids], order='C') + + for p0, p1 in lib.prange(0, ngrids, ngrids_per_split): + for cp_ij_id, _ in enumerate(intopt.log_qs): + stream = cp.cuda.get_current_stream() + + log_q_ij = intopt.log_qs[cp_ij_id] + + nbins = 1 + bins_locs_ij = np.array([0, len(log_q_ij)], dtype=np.int32) + + charge_exponents_pointer = c_null_ptr() + if charge_exponents is not None: + exponents_slice = charge_exponents[p0:p1] + charge_exponents_pointer = exponents_slice.data.ptr + grids_slice = grids[p0:p1] + + # n_pair_sum_per_thread = 1 # means every thread processes one pair and one grid + # n_pair_sum_per_thread = nao_cart # or a larger number guarantees one thread processes one grid and all pairs of the same type + n_pair_sum_per_thread = nao_cart + + err = libgint.GINTfill_int3c1e_ipip2_density_contracted( + ctypes.cast(stream.ptr, ctypes.c_void_p), + intopt.bpcache, + ctypes.cast(grids_slice.data.ptr, ctypes.c_void_p), + ctypes.cast(charge_exponents_pointer, ctypes.c_void_p), + ctypes.c_int(p1-p0), + ctypes.cast(dm_pair_ordered.data.ptr, ctypes.c_void_p), + intopt.density_offset.ctypes.data_as(ctypes.c_void_p), + ctypes.cast(int3c_density_contracted[:, p0:p1].data.ptr, ctypes.c_void_p), + bins_locs_ij.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(nbins), + ctypes.c_int(cp_ij_id), + ctypes.c_double(omega), + ctypes.c_int(n_pair_sum_per_thread)) + + if err != 0: + raise RuntimeError('GINTfill_int3c1e_ipip2_density_contracted failed') + + int3c_density_contracted[1,0] =
int3c_density_contracted[0,1] + int3c_density_contracted[2,0] = int3c_density_contracted[0,2] + int3c_density_contracted[2,1] = int3c_density_contracted[1,2] + + return int3c_density_contracted + +def int1e_grids_ipip1(mol, grids, charge_exponents=None, charges=None, direct_scf_tol=1e-13, intopt=None): + assert grids is not None + assert charges is not None + + if intopt is None: + intopt = VHFOpt(mol) + intopt.build(direct_scf_tol, aosym=False) + else: + assert isinstance(intopt, VHFOpt), \ + f"Please make sure intopt is a {VHFOpt.__module__}.{VHFOpt.__name__} object." + assert hasattr(intopt, "density_offset"), "Please call build() function for VHFOpt object first." + assert not intopt.aosym + + return get_int3c1e_ipip1_charge_contracted(mol, grids, charge_exponents, charges, intopt) + +def int1e_grids_ipvip1(mol, grids, charge_exponents=None, charges=None, direct_scf_tol=1e-13, intopt=None): + assert grids is not None + assert charges is not None + + if intopt is None: + intopt = VHFOpt(mol) + intopt.build(direct_scf_tol, aosym=False) + else: + assert isinstance(intopt, VHFOpt), \ + f"Please make sure intopt is a {VHFOpt.__module__}.{VHFOpt.__name__} object." + assert hasattr(intopt, "density_offset"), "Please call build() function for VHFOpt object first." + assert not intopt.aosym + + return get_int3c1e_ipvip1_charge_contracted(mol, grids, charge_exponents, charges, intopt) + +def int1e_grids_ip1ip2(mol, grids, charge_exponents=None, charges=None, direct_scf_tol=1e-13, intopt=None): + assert grids is not None + assert charges is not None + + if intopt is None: + intopt = VHFOpt(mol) + intopt.build(direct_scf_tol, aosym=False) + else: + assert isinstance(intopt, VHFOpt), \ + f"Please make sure intopt is a {VHFOpt.__module__}.{VHFOpt.__name__} object." + assert hasattr(intopt, "density_offset"), "Please call build() function for VHFOpt object first." + assert not intopt.aosym + + return get_int3c1e_ip1ip2_charge_contracted(mol, grids, charge_exponents, charges, intopt) + +def int1e_grids_ipip2(mol, grids, charge_exponents=None, dm=None, direct_scf_tol=1e-13, intopt=None): + assert grids is not None + assert dm is not None + + if intopt is None: + intopt = VHFOpt(mol) + intopt.build(direct_scf_tol, aosym=False) + else: + assert isinstance(intopt, VHFOpt), \ + f"Please make sure intopt is a {VHFOpt.__module__}.{VHFOpt.__name__} object." + assert hasattr(intopt, "density_offset"), "Please call build() function for VHFOpt object first." + assert not intopt.aosym + + return get_int3c1e_ipip2_density_contracted(mol, grids, charge_exponents, dm, intopt) diff --git a/gpu4pyscf/gto/tests/test_int1e_grids_ip.py b/gpu4pyscf/gto/tests/test_int1e_grids_ip.py index 56f87e4b..de68266b 100644 --- a/gpu4pyscf/gto/tests/test_int1e_grids_ip.py +++ b/gpu4pyscf/gto/tests/test_int1e_grids_ip.py @@ -364,5 +364,5 @@ def test_int1e_grids_ip1_density_contracted(self): cp.testing.assert_allclose(ref_int1e_dA, test_int1e_dA, atol = integral_threshold) if __name__ == "__main__": - print("Full Tests for One Electron Coulomb Integrals") + print("Full Tests for One Electron Coulomb Integrals 1st Derivative") unittest.main() diff --git a/gpu4pyscf/gto/tests/test_int1e_grids_ipip.py b/gpu4pyscf/gto/tests/test_int1e_grids_ipip.py new file mode 100644 index 00000000..18f9fed9 --- /dev/null +++ b/gpu4pyscf/gto/tests/test_int1e_grids_ipip.py @@ -0,0 +1,480 @@ +# Copyright 2024 The GPU4PySCF Authors. All Rights Reserved. 
+# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. + +import unittest +import numpy as np +import cupy as cp +import pyscf +from pyscf import lib, gto, df +from gpu4pyscf.gto.int3c1e_ipip import int1e_grids_ipip1, int1e_grids_ipvip1, int1e_grids_ip1ip2, int1e_grids_ipip2 + +def setUpModule(): + global mol_sph, mol_cart, grid_points, integral_threshold, density_contraction_threshold, charge_contraction_threshold + atom = ''' +O 0.0000 0.7375 -0.0528 +O 0.0000 -0.7375 -0.1528 +H 0.8190 0.8170 0.4220 +H -0.8190 -0.8170 0.4220 +''' + bas='def2-qzvpp' + + mol_sph = pyscf.M(atom=atom, basis=bas, max_memory=32000) + mol_sph.output = '/dev/null' + mol_sph.verbose = 0 + mol_sph.build() + + mol_cart = pyscf.M(atom=atom, basis=bas, max_memory=32000, cart=True) + mol_cart.output = '/dev/null' + mol_cart.verbose = 0 + mol_cart.build() + + xs = np.arange(-2.01, 2.0, 0.5) + ys = np.arange(-2.02, 2.0, 0.5) + zs = np.arange(-2.03, 2.0, 0.5) + grid_points = lib.cartesian_prod([xs, ys, zs]) + + # All of the following thresholds bound the max value of the corresponding matrix / tensor. + integral_threshold = 1e-12 + density_contraction_threshold = 1e-10 + charge_contraction_threshold = 1e-12 + +def tearDownModule(): + global mol_sph, mol_cart, grid_points + mol_sph.stdout.close() + mol_cart.stdout.close() + del mol_sph, mol_cart, grid_points + +class KnownValues(unittest.TestCase): + ''' + Values are compared to PySCF CPU intor() function + ''' + def test_int1e_grids_ipip1_charge_contracted_cart(self): + np.random.seed(12345) + charges = np.random.uniform(-2.0, 2.0, grid_points.shape[0]) + + mol = mol_cart + fakemol = gto.fakemol_for_charges(grid_points) + + int3c2e_ipip1 = mol._add_suffix('int3c2e_ipip1') + cintopt = gto.moleintor.make_cintopt(mol._atm, mol._bas, mol._env, int3c2e_ipip1) + v_nj = df.incore.aux_e2(mol, fakemol, intor=int3c2e_ipip1, aosym='s1', cintopt=cintopt) + ref_int1e_dAdA = np.einsum('dijq,q->dij', v_nj, charges) + ref_int1e_dAdA = ref_int1e_dAdA.reshape(3, 3, mol.nao, mol.nao) + + test_int1e_dAdA = int1e_grids_ipip1(mol, grid_points, charges = charges) + + assert isinstance(test_int1e_dAdA, cp.ndarray) + cp.testing.assert_allclose(ref_int1e_dAdA, test_int1e_dAdA, atol = integral_threshold) + + def test_int1e_grids_ipip1_charge_contracted_sph(self): + np.random.seed(12345) + charges = np.random.uniform(-2.0, 2.0, grid_points.shape[0]) + + mol = mol_sph + fakemol = gto.fakemol_for_charges(grid_points) + + int3c2e_ipip1 = mol._add_suffix('int3c2e_ipip1') + cintopt = gto.moleintor.make_cintopt(mol._atm, mol._bas, mol._env, int3c2e_ipip1) + v_nj = df.incore.aux_e2(mol, fakemol, intor=int3c2e_ipip1, aosym='s1', cintopt=cintopt) + ref_int1e_dAdA = np.einsum('dijq,q->dij', v_nj, charges) + ref_int1e_dAdA = ref_int1e_dAdA.reshape(3, 3, mol.nao, mol.nao) + + test_int1e_dAdA = int1e_grids_ipip1(mol, grid_points, charges = charges) + + assert isinstance(test_int1e_dAdA, cp.ndarray) + cp.testing.assert_allclose(ref_int1e_dAdA, test_int1e_dAdA,
atol = integral_threshold) + + def test_int1e_grids_ipip1_charge_contracted_gaussian_charge(self): + np.random.seed(12345) + charges = np.random.uniform(-2.0, 2.0, grid_points.shape[0]) + charge_exponents = np.random.uniform(0.5, 1.0, grid_points.shape[0]) + + mol = mol_sph + fakemol = gto.fakemol_for_charges(grid_points, expnt=charge_exponents) + + int3c2e_ipip1 = mol._add_suffix('int3c2e_ipip1') + cintopt = gto.moleintor.make_cintopt(mol._atm, mol._bas, mol._env, int3c2e_ipip1) + v_nj = df.incore.aux_e2(mol, fakemol, intor=int3c2e_ipip1, aosym='s1', cintopt=cintopt) + ref_int1e_dAdA = np.einsum('dijq,q->dij', v_nj, charges) + ref_int1e_dAdA = ref_int1e_dAdA.reshape(3, 3, mol.nao, mol.nao) + + test_int1e_dAdA = int1e_grids_ipip1(mol, grid_points, charges = charges, charge_exponents = charge_exponents) + + assert isinstance(test_int1e_dAdA, cp.ndarray) + cp.testing.assert_allclose(ref_int1e_dAdA, test_int1e_dAdA, atol = integral_threshold) + + def test_int1e_grids_ipip1_charge_contracted_omega(self): + np.random.seed(12345) + charges = np.random.uniform(-2.0, 2.0, grid_points.shape[0]) + + omega = 0.8 + mol_sph_omega = mol_sph.copy() + mol_sph_omega.set_range_coulomb(omega) + + mol = mol_sph_omega + fakemol = gto.fakemol_for_charges(grid_points) + + int3c2e_ipip1 = mol._add_suffix('int3c2e_ipip1') + cintopt = gto.moleintor.make_cintopt(mol._atm, mol._bas, mol._env, int3c2e_ipip1) + v_nj = df.incore.aux_e2(mol, fakemol, intor=int3c2e_ipip1, aosym='s1', cintopt=cintopt) + ref_int1e_dAdA = np.einsum('dijq,q->dij', v_nj, charges) + ref_int1e_dAdA = ref_int1e_dAdA.reshape(3, 3, mol.nao, mol.nao) + + test_int1e_dAdA = int1e_grids_ipip1(mol, grid_points, charges = charges) + + assert isinstance(test_int1e_dAdA, cp.ndarray) + cp.testing.assert_allclose(ref_int1e_dAdA, test_int1e_dAdA, atol = integral_threshold) + + def test_int1e_grids_ipip1_charge_contracted_gaussian_charge_omega(self): + np.random.seed(12345) + charges = np.random.uniform(-2.0, 2.0, grid_points.shape[0]) + charge_exponents = np.random.uniform(0.5, 1.0, grid_points.shape[0]) + + omega = 0.8 + mol_sph_omega = mol_sph.copy() + mol_sph_omega.set_range_coulomb(omega) + + mol = mol_sph_omega + fakemol = gto.fakemol_for_charges(grid_points, expnt=charge_exponents) + + int3c2e_ipip1 = mol._add_suffix('int3c2e_ipip1') + cintopt = gto.moleintor.make_cintopt(mol._atm, mol._bas, mol._env, int3c2e_ipip1) + v_nj = df.incore.aux_e2(mol, fakemol, intor=int3c2e_ipip1, aosym='s1', cintopt=cintopt) + ref_int1e_dAdA = np.einsum('dijq,q->dij', v_nj, charges) + ref_int1e_dAdA = ref_int1e_dAdA.reshape(3, 3, mol.nao, mol.nao) + + test_int1e_dAdA = int1e_grids_ipip1(mol, grid_points, charges = charges, charge_exponents = charge_exponents) + + assert isinstance(test_int1e_dAdA, cp.ndarray) + cp.testing.assert_allclose(ref_int1e_dAdA, test_int1e_dAdA, atol = integral_threshold) + + # ^ ipip1 v ipvip1 + + def test_int1e_grids_ipvip1_charge_contracted_cart(self): + np.random.seed(12345) + charges = np.random.uniform(-2.0, 2.0, grid_points.shape[0]) + + mol = mol_cart + fakemol = gto.fakemol_for_charges(grid_points) + + int3c2e_ipvip1 = mol._add_suffix('int3c2e_ipvip1') + cintopt = gto.moleintor.make_cintopt(mol._atm, mol._bas, mol._env, int3c2e_ipvip1) + v_nj = df.incore.aux_e2(mol, fakemol, intor=int3c2e_ipvip1, aosym='s1', cintopt=cintopt) + ref_int1e_dAdB = np.einsum('dijq,q->dij', v_nj, charges) + ref_int1e_dAdB = ref_int1e_dAdB.reshape(3, 3, mol.nao, mol.nao) + + test_int1e_dAdB = int1e_grids_ipvip1(mol, grid_points, charges = charges) + + assert 
isinstance(test_int1e_dAdB, cp.ndarray) + cp.testing.assert_allclose(ref_int1e_dAdB, test_int1e_dAdB, atol = integral_threshold) + + def test_int1e_grids_ipvip1_charge_contracted_sph(self): + np.random.seed(12345) + charges = np.random.uniform(-2.0, 2.0, grid_points.shape[0]) + + mol = mol_sph + fakemol = gto.fakemol_for_charges(grid_points) + + int3c2e_ipvip1 = mol._add_suffix('int3c2e_ipvip1') + cintopt = gto.moleintor.make_cintopt(mol._atm, mol._bas, mol._env, int3c2e_ipvip1) + v_nj = df.incore.aux_e2(mol, fakemol, intor=int3c2e_ipvip1, aosym='s1', cintopt=cintopt) + ref_int1e_dAdB = np.einsum('dijq,q->dij', v_nj, charges) + ref_int1e_dAdB = ref_int1e_dAdB.reshape(3, 3, mol.nao, mol.nao) + + test_int1e_dAdB = int1e_grids_ipvip1(mol, grid_points, charges = charges) + + assert isinstance(test_int1e_dAdB, cp.ndarray) + cp.testing.assert_allclose(ref_int1e_dAdB, test_int1e_dAdB, atol = integral_threshold) + + def test_int1e_grids_ipvip1_charge_contracted_gaussian_charge(self): + np.random.seed(12345) + charges = np.random.uniform(-2.0, 2.0, grid_points.shape[0]) + charge_exponents = np.random.uniform(0.5, 1.0, grid_points.shape[0]) + + mol = mol_sph + fakemol = gto.fakemol_for_charges(grid_points, expnt=charge_exponents) + + int3c2e_ipvip1 = mol._add_suffix('int3c2e_ipvip1') + cintopt = gto.moleintor.make_cintopt(mol._atm, mol._bas, mol._env, int3c2e_ipvip1) + v_nj = df.incore.aux_e2(mol, fakemol, intor=int3c2e_ipvip1, aosym='s1', cintopt=cintopt) + ref_int1e_dAdB = np.einsum('dijq,q->dij', v_nj, charges) + ref_int1e_dAdB = ref_int1e_dAdB.reshape(3, 3, mol.nao, mol.nao) + + test_int1e_dAdB = int1e_grids_ipvip1(mol, grid_points, charges = charges, charge_exponents = charge_exponents) + + assert isinstance(test_int1e_dAdB, cp.ndarray) + cp.testing.assert_allclose(ref_int1e_dAdB, test_int1e_dAdB, atol = integral_threshold) + + def test_int1e_grids_ipvip1_charge_contracted_omega(self): + np.random.seed(12345) + charges = np.random.uniform(-2.0, 2.0, grid_points.shape[0]) + + omega = 0.8 + mol_sph_omega = mol_sph.copy() + mol_sph_omega.set_range_coulomb(omega) + + mol = mol_sph_omega + fakemol = gto.fakemol_for_charges(grid_points) + + int3c2e_ipvip1 = mol._add_suffix('int3c2e_ipvip1') + cintopt = gto.moleintor.make_cintopt(mol._atm, mol._bas, mol._env, int3c2e_ipvip1) + v_nj = df.incore.aux_e2(mol, fakemol, intor=int3c2e_ipvip1, aosym='s1', cintopt=cintopt) + ref_int1e_dAdB = np.einsum('dijq,q->dij', v_nj, charges) + ref_int1e_dAdB = ref_int1e_dAdB.reshape(3, 3, mol.nao, mol.nao) + + test_int1e_dAdB = int1e_grids_ipvip1(mol, grid_points, charges = charges) + + assert isinstance(test_int1e_dAdB, cp.ndarray) + cp.testing.assert_allclose(ref_int1e_dAdB, test_int1e_dAdB, atol = integral_threshold) + + def test_int1e_grids_ipvip1_charge_contracted_gaussian_charge_omega(self): + np.random.seed(12345) + charges = np.random.uniform(-2.0, 2.0, grid_points.shape[0]) + charge_exponents = np.random.uniform(0.5, 1.0, grid_points.shape[0]) + + omega = 0.8 + mol_sph_omega = mol_sph.copy() + mol_sph_omega.set_range_coulomb(omega) + + mol = mol_sph_omega + fakemol = gto.fakemol_for_charges(grid_points, expnt=charge_exponents) + + int3c2e_ipvip1 = mol._add_suffix('int3c2e_ipvip1') + cintopt = gto.moleintor.make_cintopt(mol._atm, mol._bas, mol._env, int3c2e_ipvip1) + v_nj = df.incore.aux_e2(mol, fakemol, intor=int3c2e_ipvip1, aosym='s1', cintopt=cintopt) + ref_int1e_dAdB = np.einsum('dijq,q->dij', v_nj, charges) + ref_int1e_dAdB = ref_int1e_dAdB.reshape(3, 3, mol.nao, mol.nao) + + test_int1e_dAdB = 
int1e_grids_ipvip1(mol, grid_points, charges = charges, charge_exponents = charge_exponents) + + assert isinstance(test_int1e_dAdB, cp.ndarray) + cp.testing.assert_allclose(ref_int1e_dAdB, test_int1e_dAdB, atol = integral_threshold) + + # ^ ipvip1 v ip1ip2 + + def test_int1e_grids_ip1ip2_charge_contracted_cart(self): + np.random.seed(12345) + charges = np.random.uniform(-2.0, 2.0, grid_points.shape[0]) + + mol = mol_cart + fakemol = gto.fakemol_for_charges(grid_points) + + int3c2e_ip1ip2 = mol._add_suffix('int3c2e_ip1ip2') + cintopt = gto.moleintor.make_cintopt(mol._atm, mol._bas, mol._env, int3c2e_ip1ip2) + v_nj = df.incore.aux_e2(mol, fakemol, intor=int3c2e_ip1ip2, aosym='s1', cintopt=cintopt) + ref_int1e_dAdC = np.einsum('dijq,q->dij', v_nj, charges) + ref_int1e_dAdC = ref_int1e_dAdC.reshape(3, 3, mol.nao, mol.nao) + + test_int1e_dAdC = int1e_grids_ip1ip2(mol, grid_points, charges = charges) + + assert isinstance(test_int1e_dAdC, cp.ndarray) + cp.testing.assert_allclose(ref_int1e_dAdC, test_int1e_dAdC, atol = integral_threshold) + + def test_int1e_grids_ip1ip2_charge_contracted_sph(self): + np.random.seed(12345) + charges = np.random.uniform(-2.0, 2.0, grid_points.shape[0]) + + mol = mol_sph + fakemol = gto.fakemol_for_charges(grid_points) + + int3c2e_ip1ip2 = mol._add_suffix('int3c2e_ip1ip2') + cintopt = gto.moleintor.make_cintopt(mol._atm, mol._bas, mol._env, int3c2e_ip1ip2) + v_nj = df.incore.aux_e2(mol, fakemol, intor=int3c2e_ip1ip2, aosym='s1', cintopt=cintopt) + ref_int1e_dAdC = np.einsum('dijq,q->dij', v_nj, charges) + ref_int1e_dAdC = ref_int1e_dAdC.reshape(3, 3, mol.nao, mol.nao) + + test_int1e_dAdC = int1e_grids_ip1ip2(mol, grid_points, charges = charges) + + assert isinstance(test_int1e_dAdC, cp.ndarray) + cp.testing.assert_allclose(ref_int1e_dAdC, test_int1e_dAdC, atol = integral_threshold) + + def test_int1e_grids_ip1ip2_charge_contracted_gaussian_charge(self): + np.random.seed(12345) + charges = np.random.uniform(-2.0, 2.0, grid_points.shape[0]) + charge_exponents = np.random.uniform(0.5, 1.0, grid_points.shape[0]) + + mol = mol_sph + fakemol = gto.fakemol_for_charges(grid_points, expnt=charge_exponents) + + int3c2e_ip1ip2 = mol._add_suffix('int3c2e_ip1ip2') + cintopt = gto.moleintor.make_cintopt(mol._atm, mol._bas, mol._env, int3c2e_ip1ip2) + v_nj = df.incore.aux_e2(mol, fakemol, intor=int3c2e_ip1ip2, aosym='s1', cintopt=cintopt) + ref_int1e_dAdC = np.einsum('dijq,q->dij', v_nj, charges) + ref_int1e_dAdC = ref_int1e_dAdC.reshape(3, 3, mol.nao, mol.nao) + + test_int1e_dAdC = int1e_grids_ip1ip2(mol, grid_points, charges = charges, charge_exponents = charge_exponents) + + assert isinstance(test_int1e_dAdC, cp.ndarray) + cp.testing.assert_allclose(ref_int1e_dAdC, test_int1e_dAdC, atol = integral_threshold) + + def test_int1e_grids_ip1ip2_charge_contracted_omega(self): + np.random.seed(12345) + charges = np.random.uniform(-2.0, 2.0, grid_points.shape[0]) + + omega = 0.8 + mol_sph_omega = mol_sph.copy() + mol_sph_omega.set_range_coulomb(omega) + + mol = mol_sph_omega + fakemol = gto.fakemol_for_charges(grid_points) + + int3c2e_ip1ip2 = mol._add_suffix('int3c2e_ip1ip2') + cintopt = gto.moleintor.make_cintopt(mol._atm, mol._bas, mol._env, int3c2e_ip1ip2) + v_nj = df.incore.aux_e2(mol, fakemol, intor=int3c2e_ip1ip2, aosym='s1', cintopt=cintopt) + ref_int1e_dAdC = np.einsum('dijq,q->dij', v_nj, charges) + ref_int1e_dAdC = ref_int1e_dAdC.reshape(3, 3, mol.nao, mol.nao) + + test_int1e_dAdC = int1e_grids_ip1ip2(mol, grid_points, charges = charges) + + assert 
isinstance(test_int1e_dAdC, cp.ndarray) + cp.testing.assert_allclose(ref_int1e_dAdC, test_int1e_dAdC, atol = integral_threshold) + + def test_int1e_grids_ip1ip2_charge_contracted_gaussian_charge_omega(self): + np.random.seed(12345) + charges = np.random.uniform(-2.0, 2.0, grid_points.shape[0]) + charge_exponents = np.random.uniform(0.5, 1.0, grid_points.shape[0]) + + omega = 0.8 + mol_sph_omega = mol_sph.copy() + mol_sph_omega.set_range_coulomb(omega) + + mol = mol_sph_omega + fakemol = gto.fakemol_for_charges(grid_points, expnt=charge_exponents) + + int3c2e_ip1ip2 = mol._add_suffix('int3c2e_ip1ip2') + cintopt = gto.moleintor.make_cintopt(mol._atm, mol._bas, mol._env, int3c2e_ip1ip2) + v_nj = df.incore.aux_e2(mol, fakemol, intor=int3c2e_ip1ip2, aosym='s1', cintopt=cintopt) + ref_int1e_dAdC = np.einsum('dijq,q->dij', v_nj, charges) + ref_int1e_dAdC = ref_int1e_dAdC.reshape(3, 3, mol.nao, mol.nao) + + test_int1e_dAdC = int1e_grids_ip1ip2(mol, grid_points, charges = charges, charge_exponents = charge_exponents) + + assert isinstance(test_int1e_dAdC, cp.ndarray) + cp.testing.assert_allclose(ref_int1e_dAdC, test_int1e_dAdC, atol = integral_threshold) + + # ^ ip1ip2 v ipip2 + + def test_int1e_grids_ipip2_charge_contracted_cart(self): + np.random.seed(12345) + dm = np.random.uniform(-2.0, 2.0, (mol_cart.nao, mol_cart.nao)) + + mol = mol_cart + fakemol = gto.fakemol_for_charges(grid_points) + + # Note: we cannot compute ipip2 (dCdC) directly due to numerical problems: + # PySCF treats a point charge as a sharp Gaussian, and we cannot take the 2nd derivative of it. + int3c2e_ip1ip2 = mol._add_suffix('int3c2e_ip1ip2') + cintopt = gto.moleintor.make_cintopt(mol._atm, mol._bas, mol._env, int3c2e_ip1ip2) + v_nj = df.incore.aux_e2(mol, fakemol, intor=int3c2e_ip1ip2, aosym='s1', cintopt=cintopt) + v_nj = -v_nj - v_nj.transpose(0, 2, 1, 3) # dCdC = -dAdC - dBdC + ref_int1e_dCdC = np.einsum('dijq,ij->dq', v_nj, dm) + ref_int1e_dCdC = ref_int1e_dCdC.reshape(3, 3, grid_points.shape[0]) + + test_int1e_dCdC = int1e_grids_ipip2(mol, grid_points, dm = dm) + + assert isinstance(test_int1e_dCdC, cp.ndarray) + cp.testing.assert_allclose(ref_int1e_dCdC, test_int1e_dCdC, atol = integral_threshold) + + def test_int1e_grids_ipip2_charge_contracted_sph(self): + np.random.seed(12345) + dm = np.random.uniform(-2.0, 2.0, (mol_sph.nao, mol_sph.nao)) + + mol = mol_sph + fakemol = gto.fakemol_for_charges(grid_points) + + # Note: we cannot compute ipip2 (dCdC) directly due to numerical problems: + # PySCF treats a point charge as a sharp Gaussian, and we cannot take the 2nd derivative of it.
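+ # Translational invariance of <i(r-A)| 1/|r-C| |j(r-B)> gives (dA + dB + dC) f = 0, + # hence dCdC = -dAdC - dBdC, where dBdC follows from dAdC by transposing the two + # AO indices (the transpose below).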
+ int3c2e_ip1ip2 = mol._add_suffix('int3c2e_ip1ip2') + cintopt = gto.moleintor.make_cintopt(mol._atm, mol._bas, mol._env, int3c2e_ip1ip2) + v_nj = df.incore.aux_e2(mol, fakemol, intor=int3c2e_ip1ip2, aosym='s1', cintopt=cintopt) + v_nj = -v_nj - v_nj.transpose(0, 2, 1, 3) # dCdC = -dAdC - dBdC + ref_int1e_dCdC = np.einsum('dijq,ij->dq', v_nj, dm) + ref_int1e_dCdC = ref_int1e_dCdC.reshape(3, 3, grid_points.shape[0]) + + test_int1e_dCdC = int1e_grids_ipip2(mol, grid_points, dm = dm) + + assert isinstance(test_int1e_dCdC, cp.ndarray) + cp.testing.assert_allclose(ref_int1e_dCdC, test_int1e_dCdC, atol = integral_threshold) + + def test_int1e_grids_ipip2_charge_contracted_gaussian_charge(self): + np.random.seed(12345) + dm = np.random.uniform(-2.0, 2.0, (mol_sph.nao, mol_sph.nao)) + charge_exponents = np.random.uniform(0.5, 1.0, grid_points.shape[0]) + + mol = mol_sph + fakemol = gto.fakemol_for_charges(grid_points, expnt=charge_exponents) + + int3c2e_ipip2 = mol._add_suffix('int3c2e_ipip2') + cintopt = gto.moleintor.make_cintopt(mol._atm, mol._bas, mol._env, int3c2e_ipip2) + v_nj = df.incore.aux_e2(mol, fakemol, intor=int3c2e_ipip2, aosym='s1', cintopt=cintopt) + ref_int1e_dCdC = np.einsum('dijq,ij->dq', v_nj, dm) + ref_int1e_dCdC = ref_int1e_dCdC.reshape(3, 3, grid_points.shape[0]) + + test_int1e_dCdC = int1e_grids_ipip2(mol, grid_points, dm = dm, charge_exponents = charge_exponents) + + assert isinstance(test_int1e_dCdC, cp.ndarray) + cp.testing.assert_allclose(ref_int1e_dCdC, test_int1e_dCdC, atol = integral_threshold) + + def test_int1e_grids_ipip2_charge_contracted_omega(self): + np.random.seed(12345) + dm = np.random.uniform(-2.0, 2.0, (mol_sph.nao, mol_sph.nao)) + + omega = 0.8 + mol_sph_omega = mol_sph.copy() + mol_sph_omega.set_range_coulomb(omega) + + mol = mol_sph_omega + fakemol = gto.fakemol_for_charges(grid_points) + + # Note: we cannot compute ipip2 (dCdC) directly due to numerical problems: + # PySCF treats a point charge as a sharp Gaussian, and we cannot take the 2nd derivative of it.
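+ # Same translational-invariance workaround as in the cart/sph tests above; only the + # Gaussian-charge test can call int3c2e_ipip2 directly, since a finite-width Gaussian + # charge is smooth enough to differentiate twice.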
+ int3c2e_ip1ip2 = mol._add_suffix('int3c2e_ip1ip2') + cintopt = gto.moleintor.make_cintopt(mol._atm, mol._bas, mol._env, int3c2e_ip1ip2) + v_nj = df.incore.aux_e2(mol, fakemol, intor=int3c2e_ip1ip2, aosym='s1', cintopt=cintopt) + v_nj = -v_nj - v_nj.transpose(0, 2, 1, 3) # dCdC = -dAdC - dBdC + ref_int1e_dCdC = np.einsum('dijq,ij->dq', v_nj, dm) + ref_int1e_dCdC = ref_int1e_dCdC.reshape(3, 3, grid_points.shape[0]) + + test_int1e_dCdC = int1e_grids_ipip2(mol, grid_points, dm = dm) + + assert isinstance(test_int1e_dCdC, cp.ndarray) + cp.testing.assert_allclose(ref_int1e_dCdC, test_int1e_dCdC, atol = integral_threshold) + + def test_int1e_grids_ipip2_charge_contracted_gaussian_charge_omega(self): + np.random.seed(12345) + dm = np.random.uniform(-2.0, 2.0, (mol_sph.nao, mol_sph.nao)) + charge_exponents = np.random.uniform(0.5, 1.0, grid_points.shape[0]) + + omega = 0.8 + mol_sph_omega = mol_sph.copy() + mol_sph_omega.set_range_coulomb(omega) + + mol = mol_sph_omega + fakemol = gto.fakemol_for_charges(grid_points, expnt=charge_exponents) + + int3c2e_ipip2 = mol._add_suffix('int3c2e_ipip2') + cintopt = gto.moleintor.make_cintopt(mol._atm, mol._bas, mol._env, int3c2e_ipip2) + v_nj = df.incore.aux_e2(mol, fakemol, intor=int3c2e_ipip2, aosym='s1', cintopt=cintopt) + ref_int1e_dCdC = np.einsum('dijq,ij->dq', v_nj, dm) + ref_int1e_dCdC = ref_int1e_dCdC.reshape(3, 3, grid_points.shape[0]) + + test_int1e_dCdC = int1e_grids_ipip2(mol, grid_points, dm = dm, charge_exponents = charge_exponents) + + assert isinstance(test_int1e_dCdC, cp.ndarray) + cp.testing.assert_allclose(ref_int1e_dCdC, test_int1e_dCdC, atol = integral_threshold) + +if __name__ == "__main__": + print("Full Tests for One Electron Coulomb Integrals 2nd Derivative") + unittest.main() diff --git a/gpu4pyscf/hessian/jk.py b/gpu4pyscf/hessian/jk.py index 65edff6b..5ed768c2 100644 --- a/gpu4pyscf/hessian/jk.py +++ b/gpu4pyscf/hessian/jk.py @@ -33,7 +33,7 @@ reduce_to_device, contract) from gpu4pyscf.__config__ import props as gpu_specs -from gpu4pyscf.__config__ import _streams, _num_devices +from gpu4pyscf.__config__ import _streams, num_devices from gpu4pyscf.lib import logger @@ -174,7 +174,7 @@ def get_jk(mol, dm, mo_coeff, mo_occ, hermi=0, vhfopt=None, if vhfopt is None: # Small group size for load balance group_size = None - if _num_devices > 1: + if num_devices > 1: group_size = jk.GROUP_SIZE vhfopt = _VHFOpt(mol).build(group_size=group_size) @@ -202,13 +202,13 @@ def get_jk(mol, dm, mo_coeff, mo_occ, hermi=0, vhfopt=None, tasks.append((i,j,k,l)) tasks = np.array(tasks) task_list = [] - for device_id in range(_num_devices): - task_list.append(tasks[device_id::_num_devices]) + for device_id in range(num_devices): + task_list.append(tasks[device_id::num_devices]) cp.cuda.get_current_stream().synchronize() futures = [] - with ThreadPoolExecutor(max_workers=_num_devices) as executor: - for device_id in range(_num_devices): + with ThreadPoolExecutor(max_workers=num_devices) as executor: + for device_id in range(num_devices): future = executor.submit( _jk_task, mol, dms, mo_coeff, mo_occ, vhfopt, task_list[device_id], hermi=hermi, diff --git a/gpu4pyscf/hessian/rhf.py b/gpu4pyscf/hessian/rhf.py index 775a6e98..b39aab8e 100644 --- a/gpu4pyscf/hessian/rhf.py +++ b/gpu4pyscf/hessian/rhf.py @@ -32,7 +32,7 @@ contract, tag_array, sandwich_dot, transpose_sum, get_avail_mem, condense, krylov) from gpu4pyscf.__config__ import props as gpu_specs -from gpu4pyscf.__config__ import _streams, _num_devices +from gpu4pyscf.__config__ import _streams, 
num_devices from gpu4pyscf.lib import logger from gpu4pyscf.scf.jk import ( LMAX, QUEUE_DEPTH, SHM_SIZE, THREADS, GROUP_SIZE, libvhf_rys, _VHFOpt, @@ -271,7 +271,7 @@ def _partial_ejk_ip2(mol, dm, vhfopt=None, j_factor=1., k_factor=1., verbose=Non if vhfopt is None: # Small group size for load balance group_size = None - if _num_devices > 1: + if num_devices > 1: group_size = GROUP_SIZE vhfopt = _VHFOpt(mol).build(group_size=group_size) @@ -296,13 +296,13 @@ def _partial_ejk_ip2(mol, dm, vhfopt=None, j_factor=1., k_factor=1., verbose=Non tasks.append((i,j,k,l)) tasks = np.array(tasks) task_list = [] - for device_id in range(_num_devices): - task_list.append(tasks[device_id::_num_devices]) + for device_id in range(num_devices): + task_list.append(tasks[device_id::num_devices]) cp.cuda.get_current_stream().synchronize() futures = [] - with ThreadPoolExecutor(max_workers=_num_devices) as executor: - for device_id in range(_num_devices): + with ThreadPoolExecutor(max_workers=num_devices) as executor: + for device_id in range(num_devices): future = executor.submit( _ejk_ip2_task, mol, dms, vhfopt, task_list[device_id], @@ -494,7 +494,7 @@ def _get_jk_ip1(mol, dm, with_j=True, with_k=True, atoms_slice=None, verbose=Non vhfopt.tile = 1 # Small group size for load balance group_size = None - if _num_devices > 1: + if num_devices > 1: group_size = GROUP_SIZE vhfopt.build(group_size=group_size) @@ -532,13 +532,13 @@ def _get_jk_ip1(mol, dm, with_j=True, with_k=True, atoms_slice=None, verbose=Non tasks.append((i,j,k,l)) tasks = np.array(tasks) task_list = [] - for device_id in range(_num_devices): - task_list.append(tasks[device_id::_num_devices]) + for device_id in range(num_devices): + task_list.append(tasks[device_id::num_devices]) cp.cuda.get_current_stream().synchronize() futures = [] - with ThreadPoolExecutor(max_workers=_num_devices) as executor: - for device_id in range(_num_devices): + with ThreadPoolExecutor(max_workers=num_devices) as executor: + for device_id in range(num_devices): future = executor.submit( _build_jk_ip1_task, mol, dms, vhfopt, task_list[device_id], atoms_slice, @@ -908,7 +908,7 @@ def _get_jk_mo(hessobj, mol, dms, mo_coeff, mo_occ, with mol.with_range_coulomb(omega): # Small group size for load balance group_size = None - if _num_devices > 1: + if num_devices > 1: group_size = GROUP_SIZE vhfopt = _VHFOpt(mol, mf.direct_scf_tol).build(group_size=group_size) mf._opt_gpu[omega] = vhfopt diff --git a/gpu4pyscf/hessian/rks.py b/gpu4pyscf/hessian/rks.py index a1c01079..c12ef0e2 100644 --- a/gpu4pyscf/hessian/rks.py +++ b/gpu4pyscf/hessian/rks.py @@ -30,7 +30,7 @@ from gpu4pyscf.lib.cupy_helper import (contract, add_sparse, get_avail_mem, reduce_to_device, transpose_sum) from gpu4pyscf.lib import logger -from gpu4pyscf.__config__ import _streams, _num_devices +from gpu4pyscf.__config__ import _streams, num_devices from gpu4pyscf.hessian import jk def partial_hess_elec(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, @@ -49,7 +49,7 @@ def partial_hess_elec(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, dm0 = cupy.dot(mocc, mocc.T) * 2 if mf.do_nlc(): - raise NotImplementedError + raise NotImplementedError("2nd derivative of NLC is not implemented.") omega, alpha, hyb = ni.rsh_and_hybrid_coeff(mf.xc, spin=mol.spin) with_k = ni.libxc.is_hybrid_xc(mf.xc) @@ -524,8 +524,8 @@ def _get_vxc_deriv2(hessobj, mo_coeff, mo_occ, max_memory): futures = [] cupy.cuda.get_current_stream().synchronize() - with ThreadPoolExecutor(max_workers=_num_devices) as executor: - for device_id in 
range(_num_devices): + with ThreadPoolExecutor(max_workers=num_devices) as executor: + for device_id in range(num_devices): future = executor.submit( _get_vxc_deriv2_task, hessobj, grids, mo_coeff, mo_occ, max_memory, @@ -550,7 +550,6 @@ def _get_vxc_deriv1_task(hessobj, grids, mo_coeff, mo_occ, max_memory, device_id ngrids_glob = grids.coords.shape[0] grid_start, grid_end = numint.gen_grid_range(ngrids_glob, device_id) - with cupy.cuda.Device(device_id), _streams[device_id]: mo_occ = cupy.asarray(mo_occ) mo_coeff = cupy.asarray(mo_coeff) @@ -688,8 +687,8 @@ def _get_vxc_deriv1(hessobj, mo_coeff, mo_occ, max_memory): futures = [] cupy.cuda.get_current_stream().synchronize() - with ThreadPoolExecutor(max_workers=_num_devices) as executor: - for device_id in range(_num_devices): + with ThreadPoolExecutor(max_workers=num_devices) as executor: + for device_id in range(num_devices): future = executor.submit( _get_vxc_deriv1_task, hessobj, grids, mo_coeff, mo_occ, max_memory, @@ -796,8 +795,8 @@ def nr_rks_fxc_mo(ni, mol, grids, xc_code, dm0=None, dms=None, mo_coeff=None, re futures = [] cupy.cuda.get_current_stream().synchronize() - with ThreadPoolExecutor(max_workers=_num_devices) as executor: - for device_id in range(_num_devices): + with ThreadPoolExecutor(max_workers=num_devices) as executor: + for device_id in range(num_devices): future = executor.submit( _nr_rks_fxc_mo_task, ni, mol, grids, xc_code, fxc, mo_coeff, mo1, mocc, diff --git a/gpu4pyscf/hessian/uks.py b/gpu4pyscf/hessian/uks.py index 0c2995f8..620331e0 100644 --- a/gpu4pyscf/hessian/uks.py +++ b/gpu4pyscf/hessian/uks.py @@ -47,8 +47,8 @@ def partial_hess_elec(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, dm0b = moccb.dot(moccb.T) dm0 = cp.asarray((dm0a, dm0b)) - if mf.nlc != '': - raise NotImplementedError + if mf.do_nlc(): + raise NotImplementedError("2nd derivative of NLC is not implemented.") omega, alpha, hyb = ni.rsh_and_hybrid_coeff(mf.xc, spin=mol.spin) with_k = ni.libxc.is_hybrid_xc(mf.xc) diff --git a/gpu4pyscf/lib/cupy_helper.py b/gpu4pyscf/lib/cupy_helper.py index fe197c71..4c62d8db 100644 --- a/gpu4pyscf/lib/cupy_helper.py +++ b/gpu4pyscf/lib/cupy_helper.py @@ -23,8 +23,8 @@ from gpu4pyscf.gto import mole from gpu4pyscf.lib.cutensor import contract from gpu4pyscf.lib.cusolver import eigh, cholesky #NOQA -from gpu4pyscf.lib.memcpy import copy_array #NOQA -from gpu4pyscf.__config__ import _streams, _num_devices, _p2p_access +from gpu4pyscf.lib.memcpy import copy_array, p2p_transfer #NOQA +from gpu4pyscf.__config__ import _streams, num_devices, _p2p_access LMAX_ON_GPU = 7 DSOLVE_LINDEP = 1e-13 @@ -81,23 +81,6 @@ def get_avail_mem(): mem_avail = cupy.cuda.runtime.memGetInfo()[0] return mem_avail + total_mem - used_mem -def p2p_transfer(a, b): - ''' If the direct P2P data transfer is not available, transfer data via CPU memory - ''' - if a.device == b.device: - a[:] = b - elif _p2p_access: - a[:] = b - ''' - elif a.strides == b.strides and a.flags.c_contiguous and a.dtype == b.dtype: - # cupy supports a direct copy from different devices without p2p. 
See also - # https://github.com/cupy/cupy/blob/v13.3.0/cupy/_core/_routines_indexing.pyx#L48 - # https://github.com/cupy/cupy/blob/v13.3.0/cupy/_core/_routines_indexing.pyx#L1015 - a[:] = b - ''' - else: - copy_array(b, a) - def concatenate(array_list): ''' Concatenate axis=0 only ''' @@ -126,8 +109,8 @@ def reduce_to_device(array_list, inplace=False): ''' Reduce a list of ndarray in different devices to device 0 TODO: reduce memory footprint, improve throughput ''' - assert len(array_list) == _num_devices - if _num_devices == 1: + assert len(array_list) == num_devices + if num_devices == 1: return array_list[0] out_shape = array_list[0].shape diff --git a/gpu4pyscf/lib/cusolver.py b/gpu4pyscf/lib/cusolver.py index 5c8d2dd6..393d7d96 100644 --- a/gpu4pyscf/lib/cusolver.py +++ b/gpu4pyscf/lib/cusolver.py @@ -16,11 +16,13 @@ import numpy as np import cupy import ctypes +from ctypes.util import find_library from cupy_backends.cuda.libs import cusolver from cupy_backends.cuda.libs import cublas from cupy.cuda import device -libcusolver = ctypes.CDLL('libcusolver.so') +libcusolver = find_library('cusolver') +libcusolver = ctypes.CDLL(libcusolver) CUSOLVER_EIG_TYPE_1 = 1 CUSOLVER_EIG_TYPE_2 = 2 diff --git a/gpu4pyscf/lib/cutensor.py b/gpu4pyscf/lib/cutensor.py index 0599e39a..034076ab 100644 --- a/gpu4pyscf/lib/cutensor.py +++ b/gpu4pyscf/lib/cutensor.py @@ -103,10 +103,11 @@ def contraction( ws = cupy.empty(ws_size, dtype=np.int8) out = c - alpha = np.asarray(alpha) - beta = np.asarray(beta) + alpha = np.asarray(alpha, dtype=dtype) + beta = np.asarray(beta, dtype=dtype) - cutensor_backend.contract(cutensor._get_handle().ptr, plan.ptr, + handler = cutensor._get_handle() + cutensor_backend.contract(handler.ptr, plan.ptr, alpha.ctypes.data, a.data.ptr, b.data.ptr, beta.ctypes.data, c.data.ptr, out.data.ptr, ws.data.ptr, ws_size) @@ -114,13 +115,10 @@ def contraction( return out import os -if 'CONTRACT_ENGINE' in os.environ: - contract_engine = os.environ['CONTRACT_ENGINE'] -else: - contract_engine = None - +contract_engine = None if cutensor is None: contract_engine = 'cupy' # default contraction engine +contract_engine = os.environ.get('CONTRACT_ENGINE', contract_engine) # override the 'contract' function if einsum is customized or cutensor is not found if contract_engine is not None: @@ -139,10 +137,15 @@ def contraction( warnings.warn(f'using {contract_engine} as the tensor contraction engine.') def contract(pattern, a, b, alpha=1.0, beta=0.0, out=None): if out is None: - return cupy.asarray(einsum(pattern, a, b), order='C') + out = einsum(pattern, a, b) + out *= alpha + elif beta == 0.: + out[:] = einsum(pattern, a, b) + out *= alpha else: - out[:] = alpha*einsum(pattern, a, b) + beta*out - return cupy.asarray(out, order='C') + out *= beta + out += alpha*einsum(pattern, a, b) + return cupy.asarray(out, order='C') else: def contract(pattern, a, b, alpha=1.0, beta=0.0, out=None): ''' diff --git a/gpu4pyscf/lib/gint/CMakeLists.txt b/gpu4pyscf/lib/gint/CMakeLists.txt index 464efed6..7647c2c3 100644 --- a/gpu4pyscf/lib/gint/CMakeLists.txt +++ b/gpu4pyscf/lib/gint/CMakeLists.txt @@ -26,6 +26,7 @@ add_library(gint SHARED nr_fill_ao_ints.cu nr_fill_ao_int3c1e.cu nr_fill_ao_int3c1e_ip.cu + nr_fill_ao_int3c1e_ipip.cu nr_fill_ao_int3c2e.cu nr_fill_ao_int3c2e_ip1.cu nr_fill_ao_int3c2e_ip2.cu diff --git a/gpu4pyscf/lib/gint/g3c1e_ipip.cu b/gpu4pyscf/lib/gint/g3c1e_ipip.cu new file mode 100644 index 00000000..87ebb270 --- /dev/null +++ b/gpu4pyscf/lib/gint/g3c1e_ipip.cu @@ -0,0 +1,635 @@ +/* + * Copyright 
2021-2024 The PySCF Developers. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "gint.h" + +template <int NROOTS> +__device__ +static void GINTwrite_int3c1e_ipip1_charge_contracted(const double* g, double* local_output, const double minus_two_a, const double prefactor, const int i_l, const int j_l) +{ + const int *idx = c_idx; + const int *idy = c_idx + TOT_NF; + const int *idz = c_idx + TOT_NF * 2; + + const int g_size = NROOTS * (i_l + 2 + 1) * (j_l + 1); + const double* __restrict__ gx = g; + const double* __restrict__ gy = g + g_size; + const double* __restrict__ gz = g + g_size * 2; + + const int n_density_elements_i = (i_l + 1) * (i_l + 2) / 2; + const int n_density_elements_j = (j_l + 1) * (j_l + 2) / 2; + const int n_density_elements_ij = n_density_elements_i * n_density_elements_j; + for (int j = 0; j < n_density_elements_j; j++) { + for (int i = 0; i < n_density_elements_i; i++) { + const int loc_j = c_l_locs[j_l] + j; + const int loc_i = c_l_locs[i_l] + i; + const int ix = idx[loc_i]; + const int iy = idy[loc_i]; + const int iz = idz[loc_i]; + const int jx = idx[loc_j]; + const int jy = idy[loc_j]; + const int jz = idz[loc_j]; + const int gx_offset = ix + jx * (i_l + 2 + 1); + const int gy_offset = iy + jy * (i_l + 2 + 1); + const int gz_offset = iz + jz * (i_l + 2 + 1); + + double d2eri_dAxdAx = 0; + double d2eri_dAxdAy = 0; + double d2eri_dAxdAz = 0; + double d2eri_dAydAy = 0; + double d2eri_dAydAz = 0; + double d2eri_dAzdAz = 0; +#pragma unroll + for (int i_root = 0; i_root < NROOTS; i_root++) { + const double gx_minus_2 = (ix >= 2 ? gx[(gx_offset - 2) * NROOTS + i_root] : 0); + const double gy_minus_2 = (iy >= 2 ? gy[(gy_offset - 2) * NROOTS + i_root] : 0); + const double gz_minus_2 = (iz >= 2 ? gz[(gz_offset - 2) * NROOTS + i_root] : 0); + const double gx_minus_1 = (ix >= 1 ? gx[(gx_offset - 1) * NROOTS + i_root] : 0); + const double gy_minus_1 = (iy >= 1 ? gy[(gy_offset - 1) * NROOTS + i_root] : 0); + const double gz_minus_1 = (iz >= 1 ?
gz[(gz_offset - 1) * NROOTS + i_root] : 0); + const double gx_0 = gx[gx_offset * NROOTS + i_root]; + const double gy_0 = gy[gy_offset * NROOTS + i_root]; + const double gz_0 = gz[gz_offset * NROOTS + i_root]; + const double gx_1 = gx[(gx_offset + 1) * NROOTS + i_root]; + const double gy_1 = gy[(gy_offset + 1) * NROOTS + i_root]; + const double gz_1 = gz[(gz_offset + 1) * NROOTS + i_root]; + const double gx_2 = gx[(gx_offset + 2) * NROOTS + i_root]; + const double gy_2 = gy[(gy_offset + 2) * NROOTS + i_root]; + const double gz_2 = gz[(gz_offset + 2) * NROOTS + i_root]; + const double dgx_dAx = ix * gx_minus_1 + minus_two_a * gx_1; + const double dgy_dAy = iy * gy_minus_1 + minus_two_a * gy_1; + const double dgz_dAz = iz * gz_minus_1 + minus_two_a * gz_1; + const double d2gx_dAx2 = ix * (ix - 1) * gx_minus_2 + minus_two_a * (2 * ix + 1) * gx_0 + minus_two_a * minus_two_a * gx_2; + const double d2gy_dAy2 = iy * (iy - 1) * gy_minus_2 + minus_two_a * (2 * iy + 1) * gy_0 + minus_two_a * minus_two_a * gy_2; + const double d2gz_dAz2 = iz * (iz - 1) * gz_minus_2 + minus_two_a * (2 * iz + 1) * gz_0 + minus_two_a * minus_two_a * gz_2; + d2eri_dAxdAx += d2gx_dAx2 * gy_0 * gz_0; + d2eri_dAxdAy += dgx_dAx * dgy_dAy * gz_0; + d2eri_dAxdAz += dgx_dAx * gy_0 * dgz_dAz; + d2eri_dAydAy += gx_0 * d2gy_dAy2 * gz_0; + d2eri_dAydAz += gx_0 * dgy_dAy * dgz_dAz; + d2eri_dAzdAz += gx_0 * gy_0 * d2gz_dAz2; + } + local_output[i + j * n_density_elements_i + 0 * n_density_elements_ij] += d2eri_dAxdAx * prefactor; + local_output[i + j * n_density_elements_i + 1 * n_density_elements_ij] += d2eri_dAxdAy * prefactor; + local_output[i + j * n_density_elements_i + 2 * n_density_elements_ij] += d2eri_dAxdAz * prefactor; + local_output[i + j * n_density_elements_i + 3 * n_density_elements_ij] += d2eri_dAydAy * prefactor; + local_output[i + j * n_density_elements_i + 4 * n_density_elements_ij] += d2eri_dAydAz * prefactor; + local_output[i + j * n_density_elements_i + 5 * n_density_elements_ij] += d2eri_dAzdAz * prefactor; + } + } +}
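(Illustration, not part of the patch.) The per-direction second derivative above uses the Gaussian recurrence d2g_i/dA2 = i(i-1)*g_{i-2} - 2a(2i+1)*g_i + 4a^2*g_{i+2} for g_i(x) = (x-A)^i * exp(-a(x-A)^2), which is exactly what the d2gx_dAx2 lines compute with minus_two_a = -2a (the ix >= 1 / ix >= 2 guards mask the undefined lower terms). A standalone NumPy sketch checking the recurrence against a finite difference:

```python
import numpy as np

def g(i, x, A, a):
    # 1D Cartesian Gaussian factor: (x - A)**i * exp(-a*(x - A)**2)
    return (x - A)**i * np.exp(-a*(x - A)**2)

i, x, A, a = 3, 0.7, 0.2, 1.5   # i >= 2, so no terms are masked out
m2a = -2.0*a                    # the kernel's minus_two_a
d2 = i*(i - 1)*g(i - 2, x, A, a) + m2a*(2*i + 1)*g(i, x, A, a) + m2a*m2a*g(i + 2, x, A, a)
h = 1e-4
fd = (g(i, x, A + h, a) - 2.0*g(i, x, A, a) + g(i, x, A - h, a)) / h**2
assert abs(d2 - fd) < 1e-4
```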
+ +template <int NROOTS, int GSIZE_INT3C_1E> +__global__ +static void GINTfill_int3c1e_ipip1_charge_contracted_kernel_general(double* output, const BasisProdOffsets offsets, const int i_l, const int j_l, const int nprim_ij, + const int stride_j, const int stride_ij, const int ao_offsets_i, const int ao_offsets_j, + const double omega, const double* grid_points, const double* charge_exponents) +{ + const int ntasks_ij = offsets.ntasks_ij; + const int ngrids = offsets.ntasks_kl; + const int task_ij = blockIdx.x * blockDim.x + threadIdx.x; + if (task_ij >= ntasks_ij) { + return; + } + + const int bas_ij = offsets.bas_ij + task_ij; + const int prim_ij = offsets.primitive_ij + task_ij * nprim_ij; + const int* bas_pair2bra = c_bpcache.bas_pair2bra; + const int* bas_pair2ket = c_bpcache.bas_pair2ket; + const int ish = bas_pair2bra[bas_ij]; + const int jsh = bas_pair2ket[bas_ij]; + const double* __restrict__ a_exponents = c_bpcache.a1; + + constexpr int l_sum_max = (NROOTS - 1) * 2 + 1; + constexpr int l_i_max_density_elements = (l_sum_max + 1) / 2; + constexpr int l_j_max_density_elements = l_sum_max - l_i_max_density_elements; + double output_cache[(l_i_max_density_elements + 1) * (l_i_max_density_elements + 2) / 2 + * (l_j_max_density_elements + 1) * (l_j_max_density_elements + 2) / 2 + * 6] { 0.0 }; + + for (int task_grid = blockIdx.y * blockDim.y + threadIdx.y; task_grid < ngrids; task_grid += gridDim.y * blockDim.y) { + const double* grid_point = grid_points + task_grid * 4; + const double charge = grid_point[3]; + const double charge_exponent = (charge_exponents != NULL) ? charge_exponents[task_grid] : 0.0; + + double g[GSIZE_INT3C_1E]; + + for (int ij = prim_ij; ij < prim_ij+nprim_ij; ++ij) { + GINT_g1e<NROOTS>(g, grid_point, ish, jsh, ij, i_l + 2, j_l, charge_exponent, omega); + const double minus_two_a = -2.0 * a_exponents[ij]; + GINTwrite_int3c1e_ipip1_charge_contracted<NROOTS>(g, output_cache, minus_two_a, charge, i_l, j_l); + } + } + + const int* ao_loc = c_bpcache.ao_loc; + + const int i0 = ao_loc[ish] - ao_offsets_i; + const int j0 = ao_loc[jsh] - ao_offsets_j; + const int n_density_elements_i = (i_l + 1) * (i_l + 2) / 2; + const int n_density_elements_j = (j_l + 1) * (j_l + 2) / 2; + const int n_density_elements_ij = n_density_elements_i * n_density_elements_j; + for (int j = 0; j < n_density_elements_j; j++) { + for (int i = 0; i < n_density_elements_i; i++) { + const double d2eri_dAxdAx = output_cache[i + j * n_density_elements_i + 0 * n_density_elements_ij]; + const double d2eri_dAxdAy = output_cache[i + j * n_density_elements_i + 1 * n_density_elements_ij]; + const double d2eri_dAxdAz = output_cache[i + j * n_density_elements_i + 2 * n_density_elements_ij]; + const double d2eri_dAydAy = output_cache[i + j * n_density_elements_i + 3 * n_density_elements_ij]; + const double d2eri_dAydAz = output_cache[i + j * n_density_elements_i + 4 * n_density_elements_ij]; + const double d2eri_dAzdAz = output_cache[i + j * n_density_elements_i + 5 * n_density_elements_ij]; + atomicAdd(output + ((i + i0) + (j + j0) * stride_j + 0 * stride_ij), d2eri_dAxdAx); + atomicAdd(output + ((i + i0) + (j + j0) * stride_j + 1 * stride_ij), d2eri_dAxdAy); + atomicAdd(output + ((i + i0) + (j + j0) * stride_j + 2 * stride_ij), d2eri_dAxdAz); + atomicAdd(output + ((i + i0) + (j + j0) * stride_j + 4 * stride_ij), d2eri_dAydAy); + atomicAdd(output + ((i + i0) + (j + j0) * stride_j + 5 * stride_ij), d2eri_dAydAz); + atomicAdd(output + ((i + i0) + (j + j0) * stride_j + 8 * stride_ij), d2eri_dAzdAz); + } + } +} + +template <int NROOTS> +__device__ +static void GINTwrite_int3c1e_ipvip1_charge_contracted(const double* g, double* local_output, const double minus_two_a, const double minus_two_b, const double prefactor, const int i_l, const int j_l) +{ + const int *idx = c_idx; + const int *idy = c_idx + TOT_NF; + const int *idz = c_idx + TOT_NF * 2; + + const int g_size = NROOTS * (i_l + 1 + 1) * (j_l + 1 + 1); + const double* __restrict__ gx = g; + const double* __restrict__ gy = g + g_size; + const double* __restrict__ gz = g + g_size * 2; + + const int n_density_elements_i = (i_l + 1) * (i_l + 2) / 2; + const int n_density_elements_j = (j_l + 1) * (j_l + 2) / 2; + const int n_density_elements_ij = n_density_elements_i * n_density_elements_j; + for (int j = 0; j < n_density_elements_j; j++) { + for (int i = 0; i < n_density_elements_i; i++) { + const int loc_j = c_l_locs[j_l] + j; + const int loc_i = c_l_locs[i_l] + i; + const int ix = idx[loc_i]; + const int iy = idy[loc_i]; + const int iz = idz[loc_i]; + const int jx = idx[loc_j]; + const int jy = idy[loc_j]; + const int jz = idz[loc_j]; + const int j_offset = i_l + 1 + 1; + + double d2eri_dAxdBx = 0; + double d2eri_dAxdBy = 0; + double d2eri_dAxdBz = 0; + double d2eri_dAydBx = 0; + double d2eri_dAydBy = 0; + double d2eri_dAydBz = 0; + double d2eri_dAzdBx = 0; + double d2eri_dAzdBy = 0; + double d2eri_dAzdBz = 0; +#pragma unroll + for (int i_root = 0; i_root < NROOTS; i_root++) { + const double gx_i_minus_1_j_minus_1 = ix * jx * (ix >= 1 && jx >= 1 ?
gx[(ix - 1 + (jx - 1) * j_offset) * NROOTS + i_root] : 0); + const double gy_i_minus_1_j_minus_1 = iy * jy * (iy >= 1 && jy >= 1 ? gy[(iy - 1 + (jy - 1) * j_offset) * NROOTS + i_root] : 0); + const double gz_i_minus_1_j_minus_1 = iz * jz * (iz >= 1 && jz >= 1 ? gz[(iz - 1 + (jz - 1) * j_offset) * NROOTS + i_root] : 0); + const double gx_i_minus_1_j_1 = ix * minus_two_b * (ix >= 1 ? gx[(ix - 1 + (jx + 1) * j_offset) * NROOTS + i_root] : 0); + const double gy_i_minus_1_j_1 = iy * minus_two_b * (iy >= 1 ? gy[(iy - 1 + (jy + 1) * j_offset) * NROOTS + i_root] : 0); + const double gz_i_minus_1_j_1 = iz * minus_two_b * (iz >= 1 ? gz[(iz - 1 + (jz + 1) * j_offset) * NROOTS + i_root] : 0); + const double gx_i_1_j_minus_1 = jx * minus_two_a * (jx >= 1 ? gx[(ix + 1 + (jx - 1) * j_offset) * NROOTS + i_root] : 0); + const double gy_i_1_j_minus_1 = jy * minus_two_a * (jy >= 1 ? gy[(iy + 1 + (jy - 1) * j_offset) * NROOTS + i_root] : 0); + const double gz_i_1_j_minus_1 = jz * minus_two_a * (jz >= 1 ? gz[(iz + 1 + (jz - 1) * j_offset) * NROOTS + i_root] : 0); + const double gx_i_1_j_1 = minus_two_a * minus_two_b * gx[(ix + 1 + (jx + 1) * j_offset) * NROOTS + i_root]; + const double gy_i_1_j_1 = minus_two_a * minus_two_b * gy[(iy + 1 + (jy + 1) * j_offset) * NROOTS + i_root]; + const double gz_i_1_j_1 = minus_two_a * minus_two_b * gz[(iz + 1 + (jz + 1) * j_offset) * NROOTS + i_root]; + const double gx_0 = gx[(ix + jx * j_offset) * NROOTS + i_root]; + const double gy_0 = gy[(iy + jy * j_offset) * NROOTS + i_root]; + const double gz_0 = gz[(iz + jz * j_offset) * NROOTS + i_root]; + const double gx_i_1_j_0 = minus_two_a * gx[(ix + 1 + jx * j_offset) * NROOTS + i_root]; + const double gy_i_1_j_0 = minus_two_a * gy[(iy + 1 + jy * j_offset) * NROOTS + i_root]; + const double gz_i_1_j_0 = minus_two_a * gz[(iz + 1 + jz * j_offset) * NROOTS + i_root]; + const double gx_i_minus_1_j_0 = ix * (ix >= 1 ? gx[(ix - 1 + jx * j_offset) * NROOTS + i_root] : 0); + const double gy_i_minus_1_j_0 = iy * (iy >= 1 ? gy[(iy - 1 + jy * j_offset) * NROOTS + i_root] : 0); + const double gz_i_minus_1_j_0 = iz * (iz >= 1 ? gz[(iz - 1 + jz * j_offset) * NROOTS + i_root] : 0); + const double gx_i_0_j_1 = minus_two_b * gx[(ix + (jx + 1) * j_offset) * NROOTS + i_root]; + const double gy_i_0_j_1 = minus_two_b * gy[(iy + (jy + 1) * j_offset) * NROOTS + i_root]; + const double gz_i_0_j_1 = minus_two_b * gz[(iz + (jz + 1) * j_offset) * NROOTS + i_root]; + const double gx_i_0_j_minus_1 = jx * (jx >= 1 ? gx[(ix + (jx - 1) * j_offset) * NROOTS + i_root] : 0); + const double gy_i_0_j_minus_1 = jy * (jy >= 1 ? gy[(iy + (jy - 1) * j_offset) * NROOTS + i_root] : 0); + const double gz_i_0_j_minus_1 = jz * (jz >= 1 ? 
gz[(iz + (jz - 1) * j_offset) * NROOTS + i_root] : 0); + + d2eri_dAxdBx += (gx_i_minus_1_j_minus_1 + gx_i_minus_1_j_1 + gx_i_1_j_minus_1 + gx_i_1_j_1) * gy_0 * gz_0; + d2eri_dAxdBy += (gx_i_minus_1_j_0 + gx_i_1_j_0) * (gy_i_0_j_minus_1 + gy_i_0_j_1) * gz_0; + d2eri_dAxdBz += (gx_i_minus_1_j_0 + gx_i_1_j_0) * gy_0 * (gz_i_0_j_minus_1 + gz_i_0_j_1); + d2eri_dAydBx += (gx_i_0_j_minus_1 + gx_i_0_j_1) * (gy_i_minus_1_j_0 + gy_i_1_j_0) * gz_0; + d2eri_dAydBy += gx_0 * (gy_i_minus_1_j_minus_1 + gy_i_minus_1_j_1 + gy_i_1_j_minus_1 + gy_i_1_j_1) * gz_0; + d2eri_dAydBz += gx_0 * (gy_i_minus_1_j_0 + gy_i_1_j_0) * (gz_i_0_j_minus_1 + gz_i_0_j_1); + d2eri_dAzdBx += (gx_i_0_j_minus_1 + gx_i_0_j_1) * gy_0 * (gz_i_minus_1_j_0 + gz_i_1_j_0); + d2eri_dAzdBy += gx_0 * (gy_i_0_j_minus_1 + gy_i_0_j_1) * (gz_i_minus_1_j_0 + gz_i_1_j_0); + d2eri_dAzdBz += gx_0 * gy_0 * (gz_i_minus_1_j_minus_1 + gz_i_minus_1_j_1 + gz_i_1_j_minus_1 + gz_i_1_j_1); + } + local_output[i + j * n_density_elements_i + 0 * n_density_elements_ij] += d2eri_dAxdBx * prefactor; + local_output[i + j * n_density_elements_i + 1 * n_density_elements_ij] += d2eri_dAxdBy * prefactor; + local_output[i + j * n_density_elements_i + 2 * n_density_elements_ij] += d2eri_dAxdBz * prefactor; + local_output[i + j * n_density_elements_i + 3 * n_density_elements_ij] += d2eri_dAydBx * prefactor; + local_output[i + j * n_density_elements_i + 4 * n_density_elements_ij] += d2eri_dAydBy * prefactor; + local_output[i + j * n_density_elements_i + 5 * n_density_elements_ij] += d2eri_dAydBz * prefactor; + local_output[i + j * n_density_elements_i + 6 * n_density_elements_ij] += d2eri_dAzdBx * prefactor; + local_output[i + j * n_density_elements_i + 7 * n_density_elements_ij] += d2eri_dAzdBy * prefactor; + local_output[i + j * n_density_elements_i + 8 * n_density_elements_ij] += d2eri_dAzdBz * prefactor; + } + } +}
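(Illustration, not part of the patch.) The mixed bra-ket block above follows from the product rule: each of d/dA and d/dB expands a 1D factor into two terms (lower the power, or raise it with -2a / -2b), so d2/dAdB produces the four g-combinations accumulated per direction. A toy NumPy check of that four-term expansion:

```python
import numpy as np

def g(i, x, A, a):
    # 1D Cartesian Gaussian factor centered at A
    return (x - A)**i * np.exp(-a*(x - A)**2)

i, j = 2, 1
x, A, B, a, b = 0.3, -0.1, 0.4, 1.2, 0.8
m2a, m2b = -2.0*a, -2.0*b
d2 = (i*j     * g(i-1, x, A, a) * g(j-1, x, B, b)    # (i-1, j-1) term
    + i*m2b   * g(i-1, x, A, a) * g(j+1, x, B, b)    # (i-1, j+1) term
    + j*m2a   * g(i+1, x, A, a) * g(j-1, x, B, b)    # (i+1, j-1) term
    + m2a*m2b * g(i+1, x, A, a) * g(j+1, x, B, b))   # (i+1, j+1) term
h = 1e-4
fd = (g(i, x, A+h, a)*g(j, x, B+h, b) - g(i, x, A+h, a)*g(j, x, B-h, b)
    - g(i, x, A-h, a)*g(j, x, B+h, b) + g(i, x, A-h, a)*g(j, x, B-h, b)) / (4*h*h)
assert abs(d2 - fd) < 1e-4
```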
+ +template <int NROOTS, int GSIZE_INT3C_1E> +__global__ +static void GINTfill_int3c1e_ipvip1_charge_contracted_kernel_general(double* output, const BasisProdOffsets offsets, const int i_l, const int j_l, const int nprim_ij, + const int stride_j, const int stride_ij, const int ao_offsets_i, const int ao_offsets_j, + const double omega, const double* grid_points, const double* charge_exponents) +{ + const int ntasks_ij = offsets.ntasks_ij; + const int ngrids = offsets.ntasks_kl; + const int task_ij = blockIdx.x * blockDim.x + threadIdx.x; + if (task_ij >= ntasks_ij) { + return; + } + + const int bas_ij = offsets.bas_ij + task_ij; + const int prim_ij = offsets.primitive_ij + task_ij * nprim_ij; + const int* bas_pair2bra = c_bpcache.bas_pair2bra; + const int* bas_pair2ket = c_bpcache.bas_pair2ket; + const int ish = bas_pair2bra[bas_ij]; + const int jsh = bas_pair2ket[bas_ij]; + const double* __restrict__ a_exponents = c_bpcache.a1; + const double* __restrict__ b_exponents = c_bpcache.a2; + + constexpr int l_sum_max = (NROOTS - 1) * 2 + 1; + constexpr int l_i_max_density_elements = (l_sum_max + 1) / 2; + constexpr int l_j_max_density_elements = l_sum_max - l_i_max_density_elements; + double output_cache[(l_i_max_density_elements + 1) * (l_i_max_density_elements + 2) / 2 + * (l_j_max_density_elements + 1) * (l_j_max_density_elements + 2) / 2 + * 9] { 0.0 }; + + for (int task_grid = blockIdx.y * blockDim.y + threadIdx.y; task_grid < ngrids; task_grid += gridDim.y * blockDim.y) { + const double* grid_point = grid_points + task_grid * 4; + const double charge = grid_point[3]; + const double charge_exponent = (charge_exponents != NULL) ? charge_exponents[task_grid] : 0.0; + + double g[GSIZE_INT3C_1E]; + + for (int ij = prim_ij; ij < prim_ij+nprim_ij; ++ij) { + GINT_g1e<NROOTS>(g, grid_point, ish, jsh, ij, i_l + 1, j_l + 1, charge_exponent, omega); + const double minus_two_a = -2.0 * a_exponents[ij]; + const double minus_two_b = -2.0 * b_exponents[ij]; + GINTwrite_int3c1e_ipvip1_charge_contracted<NROOTS>(g, output_cache, minus_two_a, minus_two_b, charge, i_l, j_l); + } + } + + const int* ao_loc = c_bpcache.ao_loc; + + const int i0 = ao_loc[ish] - ao_offsets_i; + const int j0 = ao_loc[jsh] - ao_offsets_j; + const int n_density_elements_i = (i_l + 1) * (i_l + 2) / 2; + const int n_density_elements_j = (j_l + 1) * (j_l + 2) / 2; + const int n_density_elements_ij = n_density_elements_i * n_density_elements_j; + for (int j = 0; j < n_density_elements_j; j++) { + for (int i = 0; i < n_density_elements_i; i++) { + const double d2eri_dAxdBx = output_cache[i + j * n_density_elements_i + 0 * n_density_elements_ij]; + const double d2eri_dAxdBy = output_cache[i + j * n_density_elements_i + 1 * n_density_elements_ij]; + const double d2eri_dAxdBz = output_cache[i + j * n_density_elements_i + 2 * n_density_elements_ij]; + const double d2eri_dAydBx = output_cache[i + j * n_density_elements_i + 3 * n_density_elements_ij]; + const double d2eri_dAydBy = output_cache[i + j * n_density_elements_i + 4 * n_density_elements_ij]; + const double d2eri_dAydBz = output_cache[i + j * n_density_elements_i + 5 * n_density_elements_ij]; + const double d2eri_dAzdBx = output_cache[i + j * n_density_elements_i + 6 * n_density_elements_ij]; + const double d2eri_dAzdBy = output_cache[i + j * n_density_elements_i + 7 * n_density_elements_ij]; + const double d2eri_dAzdBz = output_cache[i + j * n_density_elements_i + 8 * n_density_elements_ij]; + atomicAdd(output + ((i + i0) + (j + j0) * stride_j + 0 * stride_ij), d2eri_dAxdBx); + atomicAdd(output + ((i + i0) + (j + j0) * stride_j + 1 * stride_ij), d2eri_dAxdBy); + atomicAdd(output + ((i + i0) + (j + j0) * stride_j + 2 * stride_ij), d2eri_dAxdBz); + atomicAdd(output + ((i + i0) + (j + j0) * stride_j + 3 * stride_ij), d2eri_dAydBx); + atomicAdd(output + ((i + i0) + (j + j0) * stride_j + 4 * stride_ij), d2eri_dAydBy); + atomicAdd(output + ((i + i0) + (j + j0) * stride_j + 5 * stride_ij), d2eri_dAydBz); + atomicAdd(output + ((i + i0) + (j + j0) * stride_j + 6 * stride_ij), d2eri_dAzdBx); + atomicAdd(output + ((i + i0) + (j + j0) * stride_j + 7 * stride_ij), d2eri_dAzdBy); + atomicAdd(output + ((i + i0) + (j + j0) * stride_j + 8 * stride_ij), d2eri_dAzdBz); + } + } +} + +template <int NROOTS> +__device__ +static void GINTwrite_int3c1e_ip1ip2_charge_contracted(const double* g, double* local_output, const double minus_two_a, const double* u2, const double* AC, const double prefactor, const int i_l, const int j_l) +{ + const int *idx = c_idx; + const int *idy = c_idx + TOT_NF; + const int *idz = c_idx + TOT_NF * 2; + + const int g_size = NROOTS * (i_l + 2 + 1) * (j_l + 1); + const double* __restrict__ gx = g; + const double* __restrict__ gy = g + g_size; + const double* __restrict__ gz = g + g_size * 2; + + const double ACx = AC[0]; + const double ACy = AC[1]; + const double ACz = AC[2]; + + const int n_density_elements_i = (i_l + 1) * (i_l + 2) / 2; + const int n_density_elements_j = (j_l + 1) * (j_l + 2) / 2; + const int n_density_elements_ij = n_density_elements_i * n_density_elements_j; + for (int j = 0; j < n_density_elements_j; j++) { + for (int i = 0; i < n_density_elements_i; i++) { + const int loc_j = c_l_locs[j_l] + j; + const int loc_i =
c_l_locs[i_l] + i; + const int ix = idx[loc_i]; + const int iy = idy[loc_i]; + const int iz = idz[loc_i]; + const int jx = idx[loc_j]; + const int jy = idy[loc_j]; + const int jz = idz[loc_j]; + const int gx_offset = ix + jx * (i_l + 2 + 1); + const int gy_offset = iy + jy * (i_l + 2 + 1); + const int gz_offset = iz + jz * (i_l + 2 + 1); + + double d2eri_dAxdCx = 0; + double d2eri_dAxdCy = 0; + double d2eri_dAxdCz = 0; + double d2eri_dAydCx = 0; + double d2eri_dAydCy = 0; + double d2eri_dAydCz = 0; + double d2eri_dAzdCx = 0; + double d2eri_dAzdCy = 0; + double d2eri_dAzdCz = 0; +#pragma unroll + for (int i_root = 0; i_root < NROOTS; i_root++) { + const double gx_minus_1 = (ix >= 1 ? gx[(gx_offset - 1) * NROOTS + i_root] : 0); + const double gy_minus_1 = (iy >= 1 ? gy[(gy_offset - 1) * NROOTS + i_root] : 0); + const double gz_minus_1 = (iz >= 1 ? gz[(gz_offset - 1) * NROOTS + i_root] : 0); + const double gx_0 = gx[gx_offset * NROOTS + i_root]; + const double gy_0 = gy[gy_offset * NROOTS + i_root]; + const double gz_0 = gz[gz_offset * NROOTS + i_root]; + const double gx_1 = gx[(gx_offset + 1) * NROOTS + i_root]; + const double gy_1 = gy[(gy_offset + 1) * NROOTS + i_root]; + const double gz_1 = gz[(gz_offset + 1) * NROOTS + i_root]; + const double gx_2 = gx[(gx_offset + 2) * NROOTS + i_root]; + const double gy_2 = gy[(gy_offset + 2) * NROOTS + i_root]; + const double gz_2 = gz[(gz_offset + 2) * NROOTS + i_root]; + + const double two_u2 = 2.0 * u2[i_root]; + const double dgx_dAx = ix * gx_minus_1 + minus_two_a * gx_1; + const double dgy_dAy = iy * gy_minus_1 + minus_two_a * gy_1; + const double dgz_dAz = iz * gz_minus_1 + minus_two_a * gz_1; + const double dgx_dCx = two_u2 * (ACx * gx_0 + gx_1); + const double dgy_dCy = two_u2 * (ACy * gy_0 + gy_1); + const double dgz_dCz = two_u2 * (ACz * gz_0 + gz_1); + const double d2gx_dAxdCx = two_u2 * (ix * ACx * gx_minus_1 + ix * gx_0 + minus_two_a * ACx * gx_1 + minus_two_a * gx_2); + const double d2gy_dAydCy = two_u2 * (iy * ACy * gy_minus_1 + iy * gy_0 + minus_two_a * ACy * gy_1 + minus_two_a * gy_2); + const double d2gz_dAzdCz = two_u2 * (iz * ACz * gz_minus_1 + iz * gz_0 + minus_two_a * ACz * gz_1 + minus_two_a * gz_2); + + d2eri_dAxdCx += - d2gx_dAxdCx * gy_0 * gz_0; + d2eri_dAxdCy += - dgx_dAx * dgy_dCy * gz_0; + d2eri_dAxdCz += - dgx_dAx * gy_0 * dgz_dCz; + d2eri_dAydCx += - dgx_dCx * dgy_dAy * gz_0; + d2eri_dAydCy += - gx_0 * d2gy_dAydCy * gz_0; + d2eri_dAydCz += - gx_0 * dgy_dAy * dgz_dCz; + d2eri_dAzdCx += - dgx_dCx * gy_0 * dgz_dAz; + d2eri_dAzdCy += - gx_0 * dgy_dCy * dgz_dAz; + d2eri_dAzdCz += - gx_0 * gy_0 * d2gz_dAzdCz; + } + local_output[i + j * n_density_elements_i + 0 * n_density_elements_ij] += d2eri_dAxdCx * prefactor; + local_output[i + j * n_density_elements_i + 1 * n_density_elements_ij] += d2eri_dAxdCy * prefactor; + local_output[i + j * n_density_elements_i + 2 * n_density_elements_ij] += d2eri_dAxdCz * prefactor; + local_output[i + j * n_density_elements_i + 3 * n_density_elements_ij] += d2eri_dAydCx * prefactor; + local_output[i + j * n_density_elements_i + 4 * n_density_elements_ij] += d2eri_dAydCy * prefactor; + local_output[i + j * n_density_elements_i + 5 * n_density_elements_ij] += d2eri_dAydCz * prefactor; + local_output[i + j * n_density_elements_i + 6 * n_density_elements_ij] += d2eri_dAzdCx * prefactor; + local_output[i + j * n_density_elements_i + 7 * n_density_elements_ij] += d2eri_dAzdCy * prefactor; + local_output[i + j * n_density_elements_i + 8 * n_density_elements_ij] += d2eri_dAzdCz * prefactor; + } + } +} + 
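(Illustration, not part of the patch.) Because the 3-center integral depends only on coordinate differences, the centers satisfy d/dA + d/dB + d/dC = 0, so the A-C block produced by the ip1ip2 kernel below can be cross-checked against the A-A and A-B blocks. A 1D toy, with a Gaussian "charge" at C standing in for the Coulomb kernel and dense-grid quadrature standing in for the analytic integral (both are assumptions of the sketch):

```python
import numpy as np

x, dx = np.linspace(-10.0, 10.0, 20001, retstep=True)

def S(A, B, C, i=1, j=2, a=0.9, b=1.1, c=0.7):
    # toy 1D analogue of the 3-center integral: bra * ket * charge distribution at C
    w = (x - A)**i * np.exp(-a*(x - A)**2) \
        * (x - B)**j * np.exp(-b*(x - B)**2) * np.exp(-c*(x - C)**2)
    return w.sum() * dx

A, B, C, h = 0.1, -0.2, 0.3, 1e-4
d_AC = (S(A+h, B, C+h) - S(A+h, B, C-h) - S(A-h, B, C+h) + S(A-h, B, C-h)) / (4*h*h)
d_AA = (S(A+h, B, C) - 2*S(A, B, C) + S(A-h, B, C)) / (h*h)
d_AB = (S(A+h, B+h, C) - S(A+h, B-h, C) - S(A-h, B+h, C) + S(A-h, B-h, C)) / (4*h*h)
assert abs(d_AC + d_AA + d_AB) < 1e-4   # translational invariance: S_AC = -(S_AA + S_AB)
```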
+template <int NROOTS, int GSIZE_INT3C_1E> +__global__ +static void GINTfill_int3c1e_ip1ip2_charge_contracted_kernel_general(double* output, const BasisProdOffsets offsets, const int i_l, const int j_l, const int nprim_ij, + const int stride_j, const int stride_ij, const int ao_offsets_i, const int ao_offsets_j, + const double omega, const double* grid_points, const double* charge_exponents) +{ + const int ntasks_ij = offsets.ntasks_ij; + const int ngrids = offsets.ntasks_kl; + const int task_ij = blockIdx.x * blockDim.x + threadIdx.x; + if (task_ij >= ntasks_ij) { + return; + } + + const int bas_ij = offsets.bas_ij + task_ij; + const int prim_ij = offsets.primitive_ij + task_ij * nprim_ij; + const int* bas_pair2bra = c_bpcache.bas_pair2bra; + const int* bas_pair2ket = c_bpcache.bas_pair2ket; + const int ish = bas_pair2bra[bas_ij]; + const int jsh = bas_pair2ket[bas_ij]; + const double* __restrict__ a_exponents = c_bpcache.a1; + + const int nbas = c_bpcache.nbas; + const double* __restrict__ bas_x = c_bpcache.bas_coords; + const double* __restrict__ bas_y = bas_x + nbas; + const double* __restrict__ bas_z = bas_y + nbas; + const double Ax = bas_x[ish]; + const double Ay = bas_y[ish]; + const double Az = bas_z[ish]; + + constexpr int l_sum_max = (NROOTS - 1) * 2 + 1; + constexpr int l_i_max_density_elements = (l_sum_max + 1) / 2; + constexpr int l_j_max_density_elements = l_sum_max - l_i_max_density_elements; + double output_cache[(l_i_max_density_elements + 1) * (l_i_max_density_elements + 2) / 2 + * (l_j_max_density_elements + 1) * (l_j_max_density_elements + 2) / 2 + * 9] { 0.0 }; + + for (int task_grid = blockIdx.y * blockDim.y + threadIdx.y; task_grid < ngrids; task_grid += gridDim.y * blockDim.y) { + const double* grid_point = grid_points + task_grid * 4; + const double Cx = grid_point[0]; + const double Cy = grid_point[1]; + const double Cz = grid_point[2]; + const double charge = grid_point[3]; + const double charge_exponent = (charge_exponents != NULL) ? charge_exponents[task_grid] : 0.0; + + const double AC[3] { Ax - Cx, Ay - Cy, Az - Cz }; + + double g[GSIZE_INT3C_1E]; + double u2[NROOTS]; + + for (int ij = prim_ij; ij < prim_ij+nprim_ij; ++ij) { + GINT_g1e_save_u2<NROOTS>(g, u2, grid_point, ish, jsh, ij, i_l + 2, j_l, charge_exponent, omega); + const double minus_two_a = -2.0 * a_exponents[ij]; + GINTwrite_int3c1e_ip1ip2_charge_contracted<NROOTS>(g, output_cache, minus_two_a, u2, AC, charge, i_l, j_l); + } + } + + const int* ao_loc = c_bpcache.ao_loc; + + const int i0 = ao_loc[ish] - ao_offsets_i; + const int j0 = ao_loc[jsh] - ao_offsets_j; + const int n_density_elements_i = (i_l + 1) * (i_l + 2) / 2; + const int n_density_elements_j = (j_l + 1) * (j_l + 2) / 2; + const int n_density_elements_ij = n_density_elements_i * n_density_elements_j; + for (int j = 0; j < n_density_elements_j; j++) { + for (int i = 0; i < n_density_elements_i; i++) { + const double d2eri_dAxdCx = output_cache[i + j * n_density_elements_i + 0 * n_density_elements_ij]; + const double d2eri_dAxdCy = output_cache[i + j * n_density_elements_i + 1 * n_density_elements_ij]; + const double d2eri_dAxdCz = output_cache[i + j * n_density_elements_i + 2 * n_density_elements_ij]; + const double d2eri_dAydCx = output_cache[i + j * n_density_elements_i + 3 * n_density_elements_ij]; + const double d2eri_dAydCy = output_cache[i + j * n_density_elements_i + 4 * n_density_elements_ij]; + const double d2eri_dAydCz = output_cache[i + j * n_density_elements_i + 5 * n_density_elements_ij]; + const double d2eri_dAzdCx = output_cache[i + j * n_density_elements_i + 6 * n_density_elements_ij]; + const double d2eri_dAzdCy = output_cache[i + j * n_density_elements_i + 7 * n_density_elements_ij]; + const double d2eri_dAzdCz = output_cache[i + j * n_density_elements_i + 8 * n_density_elements_ij]; + atomicAdd(output + ((i + i0) + (j + j0) * stride_j + 0 * stride_ij), d2eri_dAxdCx); + atomicAdd(output + ((i + i0) + (j + j0) * stride_j + 1 * stride_ij), d2eri_dAxdCy); + atomicAdd(output + ((i + i0) + (j + j0) * stride_j + 2 * stride_ij), d2eri_dAxdCz); + atomicAdd(output + ((i + i0) + (j + j0) * stride_j + 3 * stride_ij), d2eri_dAydCx); + atomicAdd(output + ((i + i0) + (j + j0) * stride_j + 4 * stride_ij), d2eri_dAydCy); + atomicAdd(output + ((i + i0) + (j + j0) * stride_j + 5 * stride_ij), d2eri_dAydCz); + atomicAdd(output + ((i + i0) + (j + j0) * stride_j + 6 * stride_ij), d2eri_dAzdCx); + atomicAdd(output + ((i + i0) + (j + j0) * stride_j + 7 * stride_ij), d2eri_dAzdCy); + atomicAdd(output + ((i + i0) + (j + j0) * stride_j + 8 * stride_ij), d2eri_dAzdCz); + } + } +}
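Note that the ipip1 kernel earlier stores only the six unique components of the symmetric A-A block at flat offsets 0, 1, 2, 4, 5, 8 of a row-major 3x3 tile (the ipip2 grid-side kernel below does the same per grid point), while the non-symmetric A-B and A-C blocks need all nine. A host-side consumer would restore the lower triangle, e.g. (sketch with made-up values):

```python
import numpy as np

comps = np.array([1.0, 0.2, 0.3, 2.0, 0.4, 3.0])   # xx, xy, xz, yy, yz, zz (made up)
block = np.zeros(9)
block[[0, 1, 2, 4, 5, 8]] = comps                  # the flat offsets used by the atomicAdds
block = block.reshape(3, 3)
block = np.triu(block) + np.triu(block, 1).T       # restore the lower triangle
assert np.allclose(block, block.T)
```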
+ +template <int L_SUM> +__global__ +static void GINTfill_int3c1e_ipip2_density_contracted_kernel_general(double* output, const double* density, const HermiteDensityOffsets hermite_density_offsets, + const BasisProdOffsets offsets, const int nprim_ij, + const double omega, const double* grid_points, const double* charge_exponents) +{ + constexpr int NROOTS = (L_SUM + 2) / 2 + 1; + + const int ntasks_ij = offsets.ntasks_ij; + const int ngrids = offsets.ntasks_kl; + const int task_grid = blockIdx.y * blockDim.y + threadIdx.y; + if (task_grid >= ngrids) { + return; + } + + const double* grid_point = grid_points + task_grid * 3; + const double Cx = grid_point[0]; + const double Cy = grid_point[1]; + const double Cz = grid_point[2]; + const double charge_exponent = (charge_exponents != NULL) ?
charge_exponents[task_grid] : 0.0; + + double d2eri_dCxdCx_pair_sum = 0.0; + double d2eri_dCxdCy_pair_sum = 0.0; + double d2eri_dCxdCz_pair_sum = 0.0; + double d2eri_dCydCy_pair_sum = 0.0; + double d2eri_dCydCz_pair_sum = 0.0; + double d2eri_dCzdCz_pair_sum = 0.0; + for (int task_ij = blockIdx.x * blockDim.x + threadIdx.x; task_ij < ntasks_ij; task_ij += gridDim.x * blockDim.x) { + + const int bas_ij = offsets.bas_ij + task_ij; + const int prim_ij = offsets.primitive_ij + task_ij * nprim_ij; + const int* bas_pair2bra = c_bpcache.bas_pair2bra; + // const int* bas_pair2ket = c_bpcache.bas_pair2ket; + const int ish = bas_pair2bra[bas_ij]; + // const int jsh = bas_pair2ket[bas_ij]; + const int nbas = c_bpcache.nbas; + const double* __restrict__ bas_x = c_bpcache.bas_coords; + const double* __restrict__ bas_y = bas_x + nbas; + const double* __restrict__ bas_z = bas_y + nbas; + const double Ax = bas_x[ish]; + const double Ay = bas_y[ish]; + const double Az = bas_z[ish]; + + const double ACx = Ax - Cx; + const double ACy = Ay - Cy; + const double ACz = Az - Cz; + + double D_hermite[(L_SUM + 1) * (L_SUM + 2) * (L_SUM + 3) / 6]; +#pragma unroll + for (int i_t = 0; i_t < (L_SUM + 1) * (L_SUM + 2) * (L_SUM + 3) / 6; i_t++) { + D_hermite[i_t] = density[bas_ij - hermite_density_offsets.pair_offset_of_angular_pair + hermite_density_offsets.density_offset_of_angular_pair + i_t * hermite_density_offsets.n_pair_of_angular_pair]; + } + + double d2eri_dCxdCx = 0.0; + double d2eri_dCxdCy = 0.0; + double d2eri_dCxdCz = 0.0; + double d2eri_dCydCy = 0.0; + double d2eri_dCydCz = 0.0; + double d2eri_dCzdCz = 0.0; + for (int ij = prim_ij; ij < prim_ij+nprim_ij; ++ij) { + double g[NROOTS * (L_SUM + 2 + 1) * 3]; + double u2[NROOTS]; + GINT_g1e_without_hrr_save_u2(g, u2, Cx, Cy, Cz, ish, ij, charge_exponent, omega); + + const double* __restrict__ gx = g; + const double* __restrict__ gy = g + NROOTS * (L_SUM + 2 + 1); + const double* __restrict__ gz = g + NROOTS * (L_SUM + 2 + 1) * 2; + +#pragma unroll + for (int i_x = 0, i_t = 0; i_x <= L_SUM; i_x++) { +#pragma unroll + for (int i_y = 0; i_x + i_y <= L_SUM; i_y++) { +#pragma unroll + for (int i_z = 0; i_x + i_y + i_z <= L_SUM; i_z++, i_t++) { + double d2eri_dCxdCx_per_hermite = 0.0; + double d2eri_dCxdCy_per_hermite = 0.0; + double d2eri_dCxdCz_per_hermite = 0.0; + double d2eri_dCydCy_per_hermite = 0.0; + double d2eri_dCydCz_per_hermite = 0.0; + double d2eri_dCzdCz_per_hermite = 0.0; +#pragma unroll + for (int i_root = 0; i_root < NROOTS; i_root++) { + const double gx_0 = gx[i_root + NROOTS * i_x]; + const double gy_0 = gy[i_root + NROOTS * i_y]; + const double gz_0 = gz[i_root + NROOTS * i_z]; + const double gx_1 = gx[i_root + NROOTS * (i_x + 1)]; + const double gy_1 = gy[i_root + NROOTS * (i_y + 1)]; + const double gz_1 = gz[i_root + NROOTS * (i_z + 1)]; + const double gx_2 = gx[i_root + NROOTS * (i_x + 2)]; + const double gy_2 = gy[i_root + NROOTS * (i_y + 2)]; + const double gz_2 = gz[i_root + NROOTS * (i_z + 2)]; + const double two_u2 = 2.0 * u2[i_root]; + const double dgx_dCx = two_u2 * (gx_1 + ACx * gx_0); + const double dgy_dCy = two_u2 * (gy_1 + ACy * gy_0); + const double dgz_dCz = two_u2 * (gz_1 + ACz * gz_0); + const double d2gx_dCx2 = two_u2 * (-gx_0 + two_u2 * (gx_2 + ACx * gx_1 * 2 + ACx * ACx * gx_0)); + const double d2gy_dCy2 = two_u2 * (-gy_0 + two_u2 * (gy_2 + ACy * gy_1 * 2 + ACy * ACy * gy_0)); + const double d2gz_dCz2 = two_u2 * (-gz_0 + two_u2 * (gz_2 + ACz * gz_1 * 2 + ACz * ACz * gz_0)); + d2eri_dCxdCx_per_hermite += d2gx_dCx2 * gy_0 * gz_0; + 
d2eri_dCxdCy_per_hermite += dgx_dCx * dgy_dCy * gz_0; + d2eri_dCxdCz_per_hermite += dgx_dCx * gy_0 * dgz_dCz; + d2eri_dCydCy_per_hermite += gx_0 * d2gy_dCy2 * gz_0; + d2eri_dCydCz_per_hermite += gx_0 * dgy_dCy * dgz_dCz; + d2eri_dCzdCz_per_hermite += gx_0 * gy_0 * d2gz_dCz2; + } + const double D_t = D_hermite[i_t]; + d2eri_dCxdCx += d2eri_dCxdCx_per_hermite * D_t; + d2eri_dCxdCy += d2eri_dCxdCy_per_hermite * D_t; + d2eri_dCxdCz += d2eri_dCxdCz_per_hermite * D_t; + d2eri_dCydCy += d2eri_dCydCy_per_hermite * D_t; + d2eri_dCydCz += d2eri_dCydCz_per_hermite * D_t; + d2eri_dCzdCz += d2eri_dCzdCz_per_hermite * D_t; + } + } + } + } + d2eri_dCxdCx_pair_sum += d2eri_dCxdCx; + d2eri_dCxdCy_pair_sum += d2eri_dCxdCy; + d2eri_dCxdCz_pair_sum += d2eri_dCxdCz; + d2eri_dCydCy_pair_sum += d2eri_dCydCy; + d2eri_dCydCz_pair_sum += d2eri_dCydCz; + d2eri_dCzdCz_pair_sum += d2eri_dCzdCz; + } + atomicAdd(output + task_grid + ngrids * 0, d2eri_dCxdCx_pair_sum); + atomicAdd(output + task_grid + ngrids * 1, d2eri_dCxdCy_pair_sum); + atomicAdd(output + task_grid + ngrids * 2, d2eri_dCxdCz_pair_sum); + atomicAdd(output + task_grid + ngrids * 4, d2eri_dCydCy_pair_sum); + atomicAdd(output + task_grid + ngrids * 5, d2eri_dCydCz_pair_sum); + atomicAdd(output + task_grid + ngrids * 8, d2eri_dCzdCz_pair_sum); +} diff --git a/gpu4pyscf/lib/gint/nr_fill_ao_int3c1e_ipip.cu b/gpu4pyscf/lib/gint/nr_fill_ao_int3c1e_ipip.cu new file mode 100644 index 00000000..4f3a3dee --- /dev/null +++ b/gpu4pyscf/lib/gint/nr_fill_ao_int3c1e_ipip.cu @@ -0,0 +1,361 @@ +/* + * Copyright 2021-2024 The PySCF Developers. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <assert.h> +#include <cuda_runtime.h> + +#include "gint.h" +#include "gint1e.h" +#include "cuda_alloc.cuh" +#include "cint2e.cuh" + +#include "rys_roots.cu" +#include "g1e.cu" +#include "g3c1e_ipip.cu" + +static int GINTfill_int3c1e_ipip1_charge_contracted_tasks(double* output, const BasisProdOffsets offsets, const int i_l, const int j_l, const int nprim_ij, + const int stride_j, const int stride_ij, const int ao_offsets_i, const int ao_offsets_j, + const double omega, const double* grid_points, const double* charge_exponents, + const int n_charge_sum_per_thread, const cudaStream_t stream) +{ + const int ntasks_ij = offsets.ntasks_ij; + const int ngrids = (offsets.ntasks_kl + n_charge_sum_per_thread - 1) / n_charge_sum_per_thread; + + const dim3 threads(THREADSX, THREADSY); + const dim3 blocks((ntasks_ij+THREADSX-1)/THREADSX, (ngrids+THREADSY-1)/THREADSY); + const int nrys_roots = (i_l + j_l + 2) / 2 + 1; + switch (nrys_roots) { + case 2: GINTfill_int3c1e_ipip1_charge_contracted_kernel_general<2, GSIZE5_INT3C_1E> <<<blocks, threads, 0, stream>>>(output, offsets, i_l, j_l, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, omega, grid_points, charge_exponents); break; + case 3: GINTfill_int3c1e_ipip1_charge_contracted_kernel_general<3, GSIZE6_INT3C_1E> <<<blocks, threads, 0, stream>>>(output, offsets, i_l, j_l, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, omega, grid_points, charge_exponents); break; + case 4: GINTfill_int3c1e_ipip1_charge_contracted_kernel_general<4, GSIZE4_INT3C_1E> <<<blocks, threads, 0, stream>>>(output, offsets, i_l, j_l, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, omega, grid_points, charge_exponents); break; + case 5: GINTfill_int3c1e_ipip1_charge_contracted_kernel_general<5, GSIZE5_INT3C_1E> <<<blocks, threads, 0, stream>>>(output, offsets, i_l, j_l, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, omega, grid_points, charge_exponents); break; + case 6: GINTfill_int3c1e_ipip1_charge_contracted_kernel_general<6, GSIZE6_INT3C_1E> <<<blocks, threads, 0, stream>>>(output, offsets, i_l, j_l, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, omega, grid_points, charge_exponents); break; + default: + fprintf(stderr, "nrys_roots = %d out of range\n", nrys_roots); + return 1; + } + + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) { + fprintf(stderr, "CUDA Error in %s: %s\n", __func__, cudaGetErrorString(err)); + return 1; + } + return 0; +}
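Each dispatcher above maps the run-time root count onto a compiled template instance and launches a 2D grid, x over shell-pair tasks and y over grid points, with every thread serially accumulating n_charge_sum_per_thread grid charges before touching the atomics. A sketch of the same launch arithmetic (the THREADSX/THREADSY values below are assumptions; the real ones come from gint.h):

```python
import math

THREADSX, THREADSY = 16, 16   # assumed values for illustration
ntasks_ij, ntasks_kl, n_charge_sum_per_thread = 5000, 100000, 10

# each thread sums n_charge_sum_per_thread grid charges, shrinking the y-dimension
ngrids = (ntasks_kl + n_charge_sum_per_thread - 1) // n_charge_sum_per_thread
blocks = (math.ceil(ntasks_ij / THREADSX), math.ceil(ngrids / THREADSY))
print(blocks)   # -> (313, 625)
```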
+ +static int GINTfill_int3c1e_ipvip1_charge_contracted_tasks(double* output, const BasisProdOffsets offsets, const int i_l, const int j_l, const int nprim_ij, + const int stride_j, const int stride_ij, const int ao_offsets_i, const int ao_offsets_j, + const double omega, const double* grid_points, const double* charge_exponents, + const int n_charge_sum_per_thread, const cudaStream_t stream) +{ + const int ntasks_ij = offsets.ntasks_ij; + const int ngrids = (offsets.ntasks_kl + n_charge_sum_per_thread - 1) / n_charge_sum_per_thread; + + const dim3 threads(THREADSX, THREADSY); + const dim3 blocks((ntasks_ij+THREADSX-1)/THREADSX, (ngrids+THREADSY-1)/THREADSY); + const int nrys_roots = (i_l + j_l + 2) / 2 + 1; + switch (nrys_roots) { + case 2: GINTfill_int3c1e_ipvip1_charge_contracted_kernel_general<2, GSIZE5_INT3C_1E> <<<blocks, threads, 0, stream>>>(output, offsets, i_l, j_l, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, omega, grid_points, charge_exponents); break; + case 3: GINTfill_int3c1e_ipvip1_charge_contracted_kernel_general<3, GSIZE6_INT3C_1E> <<<blocks, threads, 0, stream>>>(output, offsets, i_l, j_l, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, omega, grid_points, charge_exponents); break; + case 4: GINTfill_int3c1e_ipvip1_charge_contracted_kernel_general<4, GSIZE4_INT3C_1E> <<<blocks, threads, 0, stream>>>(output, offsets, i_l, j_l, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, omega, grid_points, charge_exponents); break; + case 5: GINTfill_int3c1e_ipvip1_charge_contracted_kernel_general<5, GSIZE5_INT3C_1E> <<<blocks, threads, 0, stream>>>(output, offsets, i_l, j_l, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, omega, grid_points, charge_exponents); break; + case 6: GINTfill_int3c1e_ipvip1_charge_contracted_kernel_general<6, GSIZE6_INT3C_1E> <<<blocks, threads, 0, stream>>>(output, offsets, i_l, j_l, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, omega, grid_points, charge_exponents); break; + default: + fprintf(stderr, "nrys_roots = %d out of range\n", nrys_roots); + return 1; + } + + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) { + fprintf(stderr, "CUDA Error in %s: %s\n", __func__, cudaGetErrorString(err)); + return 1; + } + return 0; +} + +static int GINTfill_int3c1e_ip1ip2_charge_contracted_tasks(double* output, const BasisProdOffsets offsets, const int i_l, const int j_l, const int nprim_ij, + const int stride_j, const int stride_ij, const int ao_offsets_i, const int ao_offsets_j, + const double omega, const double* grid_points, const double* charge_exponents, + const int n_charge_sum_per_thread, const cudaStream_t stream) +{ + const int ntasks_ij = offsets.ntasks_ij; + const int ngrids = (offsets.ntasks_kl + n_charge_sum_per_thread - 1) / n_charge_sum_per_thread; + + const dim3 threads(THREADSX, THREADSY); + const dim3 blocks((ntasks_ij+THREADSX-1)/THREADSX, (ngrids+THREADSY-1)/THREADSY); + const int nrys_roots = (i_l + j_l + 2) / 2 + 1; + switch (nrys_roots) { + case 2: GINTfill_int3c1e_ip1ip2_charge_contracted_kernel_general<2, GSIZE5_INT3C_1E> <<<blocks, threads, 0, stream>>>(output, offsets, i_l, j_l, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, omega, grid_points, charge_exponents); break; + case 3: GINTfill_int3c1e_ip1ip2_charge_contracted_kernel_general<3, GSIZE6_INT3C_1E> <<<blocks, threads, 0, stream>>>(output, offsets, i_l, j_l, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, omega, grid_points, charge_exponents); break; + case 4: GINTfill_int3c1e_ip1ip2_charge_contracted_kernel_general<4, GSIZE4_INT3C_1E> <<<blocks, threads, 0, stream>>>(output, offsets, i_l, j_l, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, omega, grid_points, charge_exponents); break; + case 5: GINTfill_int3c1e_ip1ip2_charge_contracted_kernel_general<5, GSIZE5_INT3C_1E> <<<blocks, threads, 0, stream>>>(output, offsets, i_l, j_l, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, omega, grid_points, charge_exponents); break; + case 6: GINTfill_int3c1e_ip1ip2_charge_contracted_kernel_general<6, GSIZE6_INT3C_1E> <<<blocks, threads, 0, stream>>>(output, offsets, i_l, j_l, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, omega, grid_points, charge_exponents); break; + default: + fprintf(stderr, "nrys_roots = %d out of range\n", nrys_roots); + return 1; + } + + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) { + fprintf(stderr, "CUDA Error in %s: %s\n", __func__, cudaGetErrorString(err)); + return 1; + } + return 0; +}
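The two contraction modes differ only in which index is folded in on the GPU: the charge-contracted entry points return an AO-pair tensor summed over grid charges, while the density-contracted entry point below returns one value per grid point summed over shell pairs. Schematically, with a dense stand-in array (hypothetical shapes, for illustration only):

```python
import numpy as np

rng = np.random.default_rng(0)
nao, ngrids = 4, 7
eri = rng.random((nao, nao, ngrids))     # dense stand-in for one derivative component
q = rng.random(ngrids)                   # grid charges
D = rng.random((nao, nao))               # AO density matrix
v_ao = np.einsum('ijg,g->ij', eri, q)    # "charge contracted": (nao, nao) per component
v_grid = np.einsum('ijg,ij->g', eri, D)  # "density contracted": (ngrids,) per component
# both routes contract to the same scalar once the remaining index is folded in
assert np.isclose(v_ao.ravel() @ D.ravel(), v_grid @ q)
```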
+ +static int GINTfill_int3c1e_ipip2_density_contracted_tasks(double* output, const double* density, const HermiteDensityOffsets hermite_density_offsets, + const BasisProdOffsets offsets, const int i_l, const int j_l, const int nprim_ij, + const double omega, const double* grid_points, const double* charge_exponents, + const int n_pair_sum_per_thread, const cudaStream_t stream) +{ + const int ntasks_ij = (offsets.ntasks_ij + n_pair_sum_per_thread - 1) / n_pair_sum_per_thread; + const int ngrids = offsets.ntasks_kl; + + const dim3 threads(THREADSX, THREADSY); + const dim3 blocks((ntasks_ij+THREADSX-1)/THREADSX, (ngrids+THREADSY-1)/THREADSY); + switch (i_l + j_l) { + case 0: GINTfill_int3c1e_ipip2_density_contracted_kernel_general< 0> <<<blocks, threads, 0, stream>>>(output, density, hermite_density_offsets, offsets, nprim_ij, omega, grid_points, charge_exponents); break; + case 1: GINTfill_int3c1e_ipip2_density_contracted_kernel_general< 1> <<<blocks, threads, 0, stream>>>(output, density, hermite_density_offsets, offsets, nprim_ij, omega, grid_points, charge_exponents); break; + case 2: GINTfill_int3c1e_ipip2_density_contracted_kernel_general< 2> <<<blocks, threads, 0, stream>>>(output, density, hermite_density_offsets, offsets, nprim_ij, omega, grid_points, charge_exponents); break; + case 3: GINTfill_int3c1e_ipip2_density_contracted_kernel_general< 3> <<<blocks, threads, 0, stream>>>(output, density, hermite_density_offsets, offsets, nprim_ij, omega, grid_points, charge_exponents); break; + case 4: GINTfill_int3c1e_ipip2_density_contracted_kernel_general< 4> <<<blocks, threads, 0, stream>>>(output, density, hermite_density_offsets, offsets, nprim_ij, omega, grid_points, charge_exponents); break; + case 5: GINTfill_int3c1e_ipip2_density_contracted_kernel_general< 5> <<<blocks, threads, 0, stream>>>(output, density, hermite_density_offsets, offsets, nprim_ij, omega, grid_points, charge_exponents); break; + case 6: GINTfill_int3c1e_ipip2_density_contracted_kernel_general< 6> <<<blocks, threads, 0, stream>>>(output, density, hermite_density_offsets, offsets, nprim_ij, omega, grid_points, charge_exponents); break; + case 7: GINTfill_int3c1e_ipip2_density_contracted_kernel_general< 7> <<<blocks, threads, 0, stream>>>(output, density, hermite_density_offsets, offsets, nprim_ij, omega, grid_points, charge_exponents); break; + case 8: GINTfill_int3c1e_ipip2_density_contracted_kernel_general< 8> <<<blocks, threads, 0, stream>>>(output, density, hermite_density_offsets, offsets, nprim_ij, omega, grid_points, charge_exponents); break; + // Up to g + g = 8 now + default: + fprintf(stderr, "i_l + j_l = %d out of range\n", i_l + j_l); + return 1; + } + + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) { + fprintf(stderr, "CUDA Error in %s: %s\n", __func__, cudaGetErrorString(err)); + return 1; + } + return 0; +} + +extern "C" { +int GINTfill_int3c1e_ipip1_charge_contracted(const cudaStream_t stream, const BasisProdCache* bpcache, + const double* grid_points, const double* charge_exponents, const int ngrids, + double* integral_charge_contracted, + const int* strides, const int* ao_offsets, + const int* bins_locs_ij, const int nbins, + const int cp_ij_id, const double omega, const int n_charge_sum_per_thread) +{ + const ContractionProdType *cp_ij = bpcache->cptype + cp_ij_id; + const int i_l = cp_ij->l_bra; + const int j_l = cp_ij->l_ket; + const int nrys_roots = (i_l + j_l + 2) / 2 + 1; + const int nprim_ij = cp_ij->nprim_12; + + if (nrys_roots > MAX_NROOTS_INT3C_1E + 2) { + fprintf(stderr, "nrys_roots = %d too high\n", nrys_roots); + return 2; + } + + checkCudaErrors(cudaMemcpyToSymbol(c_bpcache, bpcache, sizeof(BasisProdCache))); + + const int* bas_pairs_locs = bpcache->bas_pairs_locs; + const int* primitive_pairs_locs = bpcache->primitive_pairs_locs; + for (int ij_bin = 0; ij_bin < nbins; ij_bin++) { + const int bas_ij0 = bins_locs_ij[ij_bin]; + const int bas_ij1 = bins_locs_ij[ij_bin + 1]; + const int ntasks_ij = bas_ij1 - bas_ij0; + if (ntasks_ij <= 0) { + continue; + } + + BasisProdOffsets offsets; + offsets.ntasks_ij = ntasks_ij; + offsets.ntasks_kl = ngrids; + offsets.bas_ij = bas_pairs_locs[cp_ij_id] + bas_ij0; + offsets.bas_kl = -1; + offsets.primitive_ij = primitive_pairs_locs[cp_ij_id] + bas_ij0
* nprim_ij; + offsets.primitive_kl = -1; + + const int err = GINTfill_int3c1e_ipip1_charge_contracted_tasks(integral_charge_contracted, offsets, i_l, j_l, nprim_ij, + strides[0], strides[1], ao_offsets[0], ao_offsets[1], + omega, grid_points, charge_exponents, n_charge_sum_per_thread, stream); + + if (err != 0) { + return err; + } + } + + return 0; +} + +int GINTfill_int3c1e_ipvip1_charge_contracted(const cudaStream_t stream, const BasisProdCache* bpcache, + const double* grid_points, const double* charge_exponents, const int ngrids, + double* integral_charge_contracted, + const int* strides, const int* ao_offsets, + const int* bins_locs_ij, const int nbins, + const int cp_ij_id, const double omega, const int n_charge_sum_per_thread) +{ + const ContractionProdType *cp_ij = bpcache->cptype + cp_ij_id; + const int i_l = cp_ij->l_bra; + const int j_l = cp_ij->l_ket; + const int nrys_roots = (i_l + j_l + 2) / 2 + 1; + const int nprim_ij = cp_ij->nprim_12; + + if (nrys_roots > MAX_NROOTS_INT3C_1E + 2) { + fprintf(stderr, "nrys_roots = %d too high\n", nrys_roots); + return 2; + } + + checkCudaErrors(cudaMemcpyToSymbol(c_bpcache, bpcache, sizeof(BasisProdCache))); + + const int* bas_pairs_locs = bpcache->bas_pairs_locs; + const int* primitive_pairs_locs = bpcache->primitive_pairs_locs; + for (int ij_bin = 0; ij_bin < nbins; ij_bin++) { + const int bas_ij0 = bins_locs_ij[ij_bin]; + const int bas_ij1 = bins_locs_ij[ij_bin + 1]; + const int ntasks_ij = bas_ij1 - bas_ij0; + if (ntasks_ij <= 0) { + continue; + } + + BasisProdOffsets offsets; + offsets.ntasks_ij = ntasks_ij; + offsets.ntasks_kl = ngrids; + offsets.bas_ij = bas_pairs_locs[cp_ij_id] + bas_ij0; + offsets.bas_kl = -1; + offsets.primitive_ij = primitive_pairs_locs[cp_ij_id] + bas_ij0 * nprim_ij; + offsets.primitive_kl = -1; + + const int err = GINTfill_int3c1e_ipvip1_charge_contracted_tasks(integral_charge_contracted, offsets, i_l, j_l, nprim_ij, + strides[0], strides[1], ao_offsets[0], ao_offsets[1], + omega, grid_points, charge_exponents, n_charge_sum_per_thread, stream); + + if (err != 0) { + return err; + } + } + + return 0; +} + +int GINTfill_int3c1e_ip1ip2_charge_contracted(const cudaStream_t stream, const BasisProdCache* bpcache, + const double* grid_points, const double* charge_exponents, const int ngrids, + double* integral_charge_contracted, + const int* strides, const int* ao_offsets, + const int* bins_locs_ij, const int nbins, + const int cp_ij_id, const double omega, const int n_charge_sum_per_thread) +{ + const ContractionProdType *cp_ij = bpcache->cptype + cp_ij_id; + const int i_l = cp_ij->l_bra; + const int j_l = cp_ij->l_ket; + const int nrys_roots = (i_l + j_l + 2) / 2 + 1; + const int nprim_ij = cp_ij->nprim_12; + + if (nrys_roots > MAX_NROOTS_INT3C_1E + 2) { + fprintf(stderr, "nrys_roots = %d too high\n", nrys_roots); + return 2; + } + + checkCudaErrors(cudaMemcpyToSymbol(c_bpcache, bpcache, sizeof(BasisProdCache))); + + const int* bas_pairs_locs = bpcache->bas_pairs_locs; + const int* primitive_pairs_locs = bpcache->primitive_pairs_locs; + for (int ij_bin = 0; ij_bin < nbins; ij_bin++) { + const int bas_ij0 = bins_locs_ij[ij_bin]; + const int bas_ij1 = bins_locs_ij[ij_bin + 1]; + const int ntasks_ij = bas_ij1 - bas_ij0; + if (ntasks_ij <= 0) { + continue; + } + + BasisProdOffsets offsets; + offsets.ntasks_ij = ntasks_ij; + offsets.ntasks_kl = ngrids; + offsets.bas_ij = bas_pairs_locs[cp_ij_id] + bas_ij0; + offsets.bas_kl = -1; + offsets.primitive_ij = primitive_pairs_locs[cp_ij_id] + bas_ij0 * nprim_ij; + 
offsets.primitive_kl = -1; + + const int err = GINTfill_int3c1e_ip1ip2_charge_contracted_tasks(integral_charge_contracted, offsets, i_l, j_l, nprim_ij, + strides[0], strides[1], ao_offsets[0], ao_offsets[1], + omega, grid_points, charge_exponents, n_charge_sum_per_thread, stream); + + if (err != 0) { + return err; + } + } + + return 0; +} + +int GINTfill_int3c1e_ipip2_density_contracted(const cudaStream_t stream, const BasisProdCache* bpcache, + const double* grid_points, const double* charge_exponents, const int ngrids, + const double* dm_pair_ordered, const int* density_offset, + double* integral_density_contracted, + const int* bins_locs_ij, const int nbins, + const int cp_ij_id, const double omega, const int n_pair_sum_per_thread) +{ + const ContractionProdType *cp_ij = bpcache->cptype + cp_ij_id; + const int i_l = cp_ij->l_bra; + const int j_l = cp_ij->l_ket; + const int nrys_roots = (i_l + j_l + 2) / 2 + 1; + const int nprim_ij = cp_ij->nprim_12; + + if (nrys_roots > MAX_NROOTS_INT3C_1E + 2) { + fprintf(stderr, "nrys_roots = %d too high\n", nrys_roots); + return 2; + } + + checkCudaErrors(cudaMemcpyToSymbol(c_bpcache, bpcache, sizeof(BasisProdCache))); + + const int* bas_pairs_locs = bpcache->bas_pairs_locs; + const int* primitive_pairs_locs = bpcache->primitive_pairs_locs; + for (int ij_bin = 0; ij_bin < nbins; ij_bin++) { + const int bas_ij0 = bins_locs_ij[ij_bin]; + const int bas_ij1 = bins_locs_ij[ij_bin + 1]; + const int ntasks_ij = bas_ij1 - bas_ij0; + if (ntasks_ij <= 0) { + continue; + } + + BasisProdOffsets offsets; + offsets.ntasks_ij = ntasks_ij; + offsets.ntasks_kl = ngrids; + offsets.bas_ij = bas_pairs_locs[cp_ij_id] + bas_ij0; + offsets.bas_kl = -1; + offsets.primitive_ij = primitive_pairs_locs[cp_ij_id] + bas_ij0 * nprim_ij; + offsets.primitive_kl = -1; + + HermiteDensityOffsets hermite_density_offsets; + hermite_density_offsets.density_offset_of_angular_pair = density_offset[cp_ij_id]; + hermite_density_offsets.pair_offset_of_angular_pair = bas_pairs_locs[cp_ij_id]; + hermite_density_offsets.n_pair_of_angular_pair = bas_pairs_locs[cp_ij_id + 1] - bas_pairs_locs[cp_ij_id]; + + const int err = GINTfill_int3c1e_ipip2_density_contracted_tasks(integral_density_contracted, dm_pair_ordered, hermite_density_offsets, + offsets, i_l, j_l, nprim_ij, + omega, grid_points, charge_exponents, n_pair_sum_per_thread, stream); + + if (err != 0) { + return err; + } + } + + return 0; +} +} diff --git a/gpu4pyscf/lib/gvhf-rys/cart2xyz.c b/gpu4pyscf/lib/gvhf-rys/cart2xyz.c index ee564cf9..ba10aca6 100644 --- a/gpu4pyscf/lib/gvhf-rys/cart2xyz.c +++ b/gpu4pyscf/lib/gvhf-rys/cart2xyz.c @@ -3,6 +3,9 @@ #include #include "vhf.cuh" +// up to l=7 +#define L_SLOTS 8 + static int _LEN_CART0[] = { 0, 1, 3, 6, 10, 15, 21, 28, 36, 45, 55, 66, 78, 91, 105, 120, 136 }; @@ -32,9 +35,9 @@ static void _get_dm_to_dm_xyz_coeff(double* pcx, double* rij, int lmax) { int lmax1 = lmax + 1; int l, lx; - double rx_pow[LMAX1]; - double ry_pow[LMAX1]; - double rz_pow[LMAX1]; + double rx_pow[L_SLOTS]; + double ry_pow[L_SLOTS]; + double rz_pow[L_SLOTS]; rx_pow[0] = 1.0; ry_pow[0] = 1.0; @@ -67,7 +70,7 @@ static void _dm_to_dm_xyz(double* dm_xyz, double* dm, int nao, int li, int lj, d int lij = li + lj; int l1 = lij + 1; int l1l1 = l1 * l1; - double pcx[LMAX1*LMAX1*3]; + double pcx[L_SLOTS*L_SLOTS*3]; double *pcy = pcx + lj1 * lj1; double *pcz = pcy + lj1 * lj1; _get_dm_to_dm_xyz_coeff(pcx, rij, lj); @@ -116,7 +119,7 @@ static void _dm_xyz_to_dm(double* dm_xyz, double* dm, int nao, int li, int lj, d int lj1 = lj + 1; 
int l1 = li + lj + 1; int l1l1 = l1 * l1; - double pcx[LMAX1*LMAX1*3]; + double pcx[L_SLOTS*L_SLOTS*3]; double *pcy = pcx + lj1 * lj1; double *pcz = pcy + lj1 * lj1; _get_dm_to_dm_xyz_coeff(pcx, rij, lj); @@ -152,7 +155,7 @@ void transform_cart_to_xyz(double *dm_xyz, double *dm, int *ao_loc, int *pair_lo int *bas, int nbas, double *env) { int nao = ao_loc[nbas]; - double cache[(LMAX*2+1)*(LMAX*2+1)*(LMAX*2+1)]; + double cache[L_SLOTS*L_SLOTS*L_SLOTS*8]; for (int ish = 0; ish < nbas; ish++) { int i0 = ao_loc[ish]; int li = bas[ish*BAS_SLOTS+ANG_OF]; @@ -182,7 +185,7 @@ void transform_xyz_to_cart(double *vj, double *vj_xyz, int *ao_loc, int *pair_lo int *bas, int nbas, double *env) { int nao = ao_loc[nbas]; - double cache[(LMAX*2+1)*(LMAX*2+1)*(LMAX*2+1)]; + double cache[L_SLOTS*L_SLOTS*L_SLOTS*8]; for (int ish = 0; ish < nbas; ish++) { int i0 = ao_loc[ish]; int li = bas[ish*BAS_SLOTS+ANG_OF]; diff --git a/gpu4pyscf/lib/gvhf-rys/create_tasks.cu b/gpu4pyscf/lib/gvhf-rys/create_tasks.cu index ae8ef8ad..262f9de0 100644 --- a/gpu4pyscf/lib/gvhf-rys/create_tasks.cu +++ b/gpu4pyscf/lib/gvhf-rys/create_tasks.cu @@ -97,39 +97,35 @@ static int _fill_jk_tasks(ShellQuartet *shl_quartet_idx, } // https://developer.nvidia.com/gpugems/gpugems3/part-vi-gpu-computing/chapter-39-parallel-prefix-sum-scan-cuda - extern __shared__ int thread_offsets[]; - thread_offsets[t_id] = count; + extern __shared__ int cum_count[]; + cum_count[t_id] = count; // Up-sweep phase for (int stride = 1; stride < threads; stride *= 2) { __syncthreads(); int index = (t_id + 1) * stride * 2 - 1; if (index < threads) { - thread_offsets[index] += thread_offsets[index-stride]; + cum_count[index] += cum_count[index-stride]; } } __syncthreads(); - if (t_id == threads-1) { thread_offsets[threads-1] = 0; } // Down-sweep phase - for (int stride = threads/2; stride > 0; stride /= 2) { + for (int stride = threads/4; stride > 0; stride /= 2) { __syncthreads(); int index = (t_id + 1) * stride * 2 - 1; - if (index < threads) { - int temp = thread_offsets[index - stride]; - thread_offsets[index - stride] = thread_offsets[index]; - thread_offsets[index] += temp; + if (index + stride < threads) { + cum_count[index + stride] += cum_count[index]; } } __syncthreads(); - __shared__ int ntasks; - if (t_id == threads-1) { - ntasks = thread_offsets[threads-1] + count; - } - __syncthreads(); + int ntasks = cum_count[threads-1]; if (ntasks == 0) { return ntasks; } - int offset = thread_offsets[t_id]; + int offset = 0; + if (t_id > 0) { + offset = cum_count[t_id-1]; + } for (int t_kl_id = t_kl0+t_id; t_kl_id < t_kl1; t_kl_id += threads) { int tile_kl = tile_kl_mapping[t_kl_id]; if (tile_q_ij + tile_q_cond[tile_kl] < cutoff) { @@ -311,7 +307,7 @@ static int _fill_sr_jk_tasks(ShellQuartet *shl_quartet_idx, float ypq = yij - ykl; float zpq = zij - zkl; float rr = xpq*xpq + ypq*ypq + zpq*zpq; - float theta_rr = logf(rr + 1e-30f) + theta * rr; + float theta_rr = logf(rr + 1.f) + theta * rr; d_cutoff = skl_cutoff - s_estimator[bas_kl] + theta_rr; if (d_cutoff > 0) { continue; @@ -332,39 +328,35 @@ static int _fill_sr_jk_tasks(ShellQuartet *shl_quartet_idx, } // https://developer.nvidia.com/gpugems/gpugems3/part-vi-gpu-computing/chapter-39-parallel-prefix-sum-scan-cuda - extern __shared__ int thread_offsets[]; - thread_offsets[t_id] = count; + extern __shared__ int cum_count[]; + cum_count[t_id] = count; // Up-sweep phase for (int stride = 1; stride < threads; stride *= 2) { __syncthreads(); int index = (t_id + 1) * stride * 2 - 1; if (index < threads) { - 
thread_offsets[index] += thread_offsets[index-stride]; + cum_count[index] += cum_count[index-stride]; } } __syncthreads(); - if (t_id == threads-1) { thread_offsets[threads-1] = 0; } // Down-sweep phase - for (int stride = threads/2; stride > 0; stride /= 2) { + for (int stride = threads/4; stride > 0; stride /= 2) { __syncthreads(); int index = (t_id + 1) * stride * 2 - 1; - if (index < threads) { - int temp = thread_offsets[index - stride]; - thread_offsets[index - stride] = thread_offsets[index]; - thread_offsets[index] += temp; + if (index + stride < threads) { + cum_count[index + stride] += cum_count[index]; } } __syncthreads(); - __shared__ int ntasks; - if (t_id == threads-1) { - ntasks = thread_offsets[threads-1] + count; - } - __syncthreads(); + int ntasks = cum_count[threads-1]; if (ntasks == 0) { return ntasks; } - int offset = thread_offsets[t_id]; + int offset = 0; + if (t_id > 0) { + offset = cum_count[t_id-1]; + } for (int t_kl_id = t_kl0+t_id; t_kl_id < t_kl1; t_kl_id += threads) { int tile_kl = tile_kl_mapping[t_kl_id]; if (tile_q_ij + tile_q_cond[tile_kl] < cutoff) { @@ -457,7 +449,7 @@ static int _fill_sr_jk_tasks(ShellQuartet *shl_quartet_idx, float ypq = yij - ykl; float zpq = zij - zkl; float rr = xpq*xpq + ypq*ypq + zpq*zpq; - float theta_rr = logf(rr + 1e-30f) + theta * rr; + float theta_rr = logf(rr + 1.f) + theta * rr; d_cutoff = skl_cutoff - s_estimator[bas_kl] + theta_rr; if (d_cutoff > 0) { continue; diff --git a/gpu4pyscf/lib/gvhf-rys/create_tasks_ip1.cu b/gpu4pyscf/lib/gvhf-rys/create_tasks_ip1.cu index 83803180..6ec7132e 100644 --- a/gpu4pyscf/lib/gvhf-rys/create_tasks_ip1.cu +++ b/gpu4pyscf/lib/gvhf-rys/create_tasks_ip1.cu @@ -93,40 +93,35 @@ static int _fill_ejk_tasks(ShellQuartet *shl_quartet_idx, } } - // https://developer.nvidia.com/gpugems/gpugems3/part-vi-gpu-computing/chapter-39-parallel-prefix-sum-scan-cuda - extern __shared__ int thread_offsets[]; - thread_offsets[t_id] = count; + extern __shared__ int cum_count[]; + cum_count[t_id] = count; // Up-sweep phase for (int stride = 1; stride < threads; stride *= 2) { __syncthreads(); int index = (t_id + 1) * stride * 2 - 1; if (index < threads) { - thread_offsets[index] += thread_offsets[index-stride]; + cum_count[index] += cum_count[index-stride]; } } __syncthreads(); - if (t_id == threads-1) { thread_offsets[threads-1] = 0; } // Down-sweep phase - for (int stride = threads/2; stride > 0; stride /= 2) { + for (int stride = threads/4; stride > 0; stride /= 2) { __syncthreads(); int index = (t_id + 1) * stride * 2 - 1; - if (index < threads) { - int temp = thread_offsets[index - stride]; - thread_offsets[index - stride] = thread_offsets[index]; - thread_offsets[index] += temp; + if (index + stride < threads) { + cum_count[index + stride] += cum_count[index]; } } __syncthreads(); - __shared__ int ntasks; - if (t_id == threads-1) { - ntasks = thread_offsets[threads-1] + count; - } - __syncthreads(); + int ntasks = cum_count[threads-1]; if (ntasks == 0) { return ntasks; } - int offset = thread_offsets[t_id]; + int offset = 0; + if (t_id > 0) { + offset = cum_count[t_id-1]; + } for (int t_kl_id = t_kl0+t_id; t_kl_id < t_kl1; t_kl_id += threads) { int tile_kl = tile_kl_mapping[t_kl_id]; if (tile_q_ij + tile_q_cond[tile_kl] < cutoff) { @@ -317,40 +312,35 @@ static int _fill_sr_ejk_tasks(ShellQuartet *shl_quartet_idx, } } - // https://developer.nvidia.com/gpugems/gpugems3/part-vi-gpu-computing/chapter-39-parallel-prefix-sum-scan-cuda - extern __shared__ int thread_offsets[]; - thread_offsets[t_id] = count; + 
extern __shared__ int cum_count[]; + cum_count[t_id] = count; // Up-sweep phase for (int stride = 1; stride < threads; stride *= 2) { __syncthreads(); int index = (t_id + 1) * stride * 2 - 1; if (index < threads) { - thread_offsets[index] += thread_offsets[index-stride]; + cum_count[index] += cum_count[index-stride]; } } __syncthreads(); - if (t_id == threads-1) { thread_offsets[threads-1] = 0; } // Down-sweep phase - for (int stride = threads/2; stride > 0; stride /= 2) { + for (int stride = threads/4; stride > 0; stride /= 2) { __syncthreads(); int index = (t_id + 1) * stride * 2 - 1; - if (index < threads) { - int temp = thread_offsets[index - stride]; - thread_offsets[index - stride] = thread_offsets[index]; - thread_offsets[index] += temp; + if (index + stride < threads) { + cum_count[index + stride] += cum_count[index]; } } __syncthreads(); - __shared__ int ntasks; - if (t_id == threads-1) { - ntasks = thread_offsets[threads-1] + count; - } - __syncthreads(); + int ntasks = cum_count[threads-1]; if (ntasks == 0) { return ntasks; } - int offset = thread_offsets[t_id]; + int offset = 0; + if (t_id > 0) { + offset = cum_count[t_id-1]; + } for (int t_kl_id = t_kl0+t_id; t_kl_id < t_kl1; t_kl_id += threads) { int tile_kl = tile_kl_mapping[t_kl_id]; if (tile_q_ij + tile_q_cond[tile_kl] < cutoff) { @@ -504,40 +494,35 @@ static int _fill_jk_tasks_s2kl(ShellQuartet *shl_quartet_idx, } } - // https://developer.nvidia.com/gpugems/gpugems3/part-vi-gpu-computing/chapter-39-parallel-prefix-sum-scan-cuda - extern __shared__ int thread_offsets[]; - thread_offsets[t_id] = count; + extern __shared__ int cum_count[]; + cum_count[t_id] = count; // Up-sweep phase for (int stride = 1; stride < threads; stride *= 2) { __syncthreads(); int index = (t_id + 1) * stride * 2 - 1; if (index < threads) { - thread_offsets[index] += thread_offsets[index-stride]; + cum_count[index] += cum_count[index-stride]; } } __syncthreads(); - if (t_id == threads-1) { thread_offsets[threads-1] = 0; } // Down-sweep phase - for (int stride = threads/2; stride > 0; stride /= 2) { + for (int stride = threads/4; stride > 0; stride /= 2) { __syncthreads(); int index = (t_id + 1) * stride * 2 - 1; - if (index < threads) { - int temp = thread_offsets[index - stride]; - thread_offsets[index - stride] = thread_offsets[index]; - thread_offsets[index] += temp; + if (index + stride < threads) { + cum_count[index + stride] += cum_count[index]; } } __syncthreads(); - __shared__ int ntasks; - if (t_id == threads-1) { - ntasks = thread_offsets[threads-1] + count; - } - __syncthreads(); + int ntasks = cum_count[threads-1]; if (ntasks == 0) { return ntasks; } - int offset = thread_offsets[t_id]; + int offset = 0; + if (t_id > 0) { + offset = cum_count[t_id-1]; + } ShellQuartet sq = {(uint16_t)ish, (uint16_t)jsh}; for (int t_kl_id = t_kl0+t_id; t_kl_id < t_kl1; t_kl_id += threads) { int bas_kl = pair_kl_mapping[t_kl_id]; @@ -562,156 +547,3 @@ static int _fill_jk_tasks_s2kl(ShellQuartet *shl_quartet_idx, } return ntasks; } - -__device__ -static int _fill_ejk_tasks_tmp(ShellQuartet *shl_quartet_idx, - RysIntEnvVars envs, JKMatrix jk, BoundsInfo bounds, - int batch_ij, int batch_kl) -{ - int nbas = envs.nbas; - int *tile_ij_mapping = bounds.tile_ij_mapping; - int *tile_kl_mapping = bounds.tile_kl_mapping; - float *q_cond = bounds.q_cond; - float *tile_q_cond = bounds.tile_q_cond; - float *dm_cond = bounds.dm_cond; - float cutoff = bounds.cutoff; - int t_id = threadIdx.y * blockDim.x + threadIdx.x; - int t_kl0 = batch_kl * TILES_IN_BATCH; - int t_kl1 
= MIN(t_kl0 + TILES_IN_BATCH, bounds.ntile_kl_pairs); - int threads = blockDim.x * blockDim.y; - - int tile_ij = tile_ij_mapping[batch_ij]; - int nbas_tiles = nbas / TILE; - int tile_i = tile_ij / nbas_tiles; - int tile_j = tile_ij % nbas_tiles; - int ish0 = tile_i * TILE; - int jsh0 = tile_j * TILE; - int ish1 = ish0 + TILE; - int jsh1 = jsh0 + TILE; - int do_j = jk.vj != NULL; - int do_k = jk.vk != NULL; - - int count = 0; - float tile_q_ij = tile_q_cond[tile_ij]; - for (int t_kl_id = t_kl0+t_id; t_kl_id < t_kl1; t_kl_id += threads) { - int tile_kl = tile_kl_mapping[t_kl_id]; - if (tile_q_ij + tile_q_cond[tile_kl] < cutoff) { - break; - } - int tile_k = tile_kl / nbas_tiles; - int tile_l = tile_kl % nbas_tiles; - int ksh0 = tile_k * TILE; - int lsh0 = tile_l * TILE; - int ksh1 = ksh0 + TILE; - int lsh1 = lsh0 + TILE; - for (int ish = ish0; ish < ish1; ++ish) { - for (int jsh = jsh0; jsh < MIN(ish+1, jsh1); ++jsh) { - float q_ij = q_cond [ish*nbas+jsh]; - float d_ij = dm_cond[ish*nbas+jsh]; - int bas_ij = ish * nbas + jsh; - for (int ksh = ksh0; ksh < MIN(ish+1, ksh1); ++ksh) { - float d_ik = dm_cond[ish*nbas+ksh]; - float d_jk = dm_cond[jsh*nbas+ksh]; - for (int lsh = lsh0; lsh < MIN(ksh+1, lsh1); ++lsh) { - int bas_kl = ksh * nbas + lsh; - if (bas_ij < bas_kl) { - continue; - } - float q_ijkl = q_ij + q_cond[ksh*nbas+lsh]; - if (q_ijkl < cutoff) { - continue; - } - float d_cutoff = cutoff - q_ijkl; - if ((do_k && (d_ik+dm_cond[jsh*nbas+lsh] > d_cutoff || - d_jk+dm_cond[ish*nbas+lsh] > d_cutoff)) || - (do_j && d_ij+dm_cond[ksh*nbas+lsh] > d_cutoff)) { - ++count; - } - } - } - } - } - } - - // https://developer.nvidia.com/gpugems/gpugems3/part-vi-gpu-computing/chapter-39-parallel-prefix-sum-scan-cuda - extern __shared__ int thread_offsets[]; - thread_offsets[t_id] = count; - // Up-sweep phase - for (int stride = 1; stride < threads; stride *= 2) { - __syncthreads(); - int index = (t_id + 1) * stride * 2 - 1; - if (index < threads) { - thread_offsets[index] += thread_offsets[index-stride]; - } - } - __syncthreads(); - if (t_id == threads-1) { thread_offsets[threads-1] = 0; } - // Down-sweep phase - for (int stride = threads/2; stride > 0; stride /= 2) { - __syncthreads(); - int index = (t_id + 1) * stride * 2 - 1; - if (index < threads) { - int temp = thread_offsets[index - stride]; - thread_offsets[index - stride] = thread_offsets[index]; - thread_offsets[index] += temp; - } - } - __syncthreads(); - __shared__ int ntasks; - if (t_id == threads-1) { - ntasks = thread_offsets[threads-1] + count; - } - __syncthreads(); - if (ntasks == 0) { - return ntasks; - } - - int offset = thread_offsets[t_id]; - for (int t_kl_id = t_kl0+t_id; t_kl_id < t_kl1; t_kl_id += threads) { - int tile_kl = tile_kl_mapping[t_kl_id]; - if (tile_q_ij + tile_q_cond[tile_kl] < cutoff) { - break; - } - int tile_k = tile_kl / nbas_tiles; - int tile_l = tile_kl % nbas_tiles; - int ksh0 = tile_k * TILE; - int lsh0 = tile_l * TILE; - int ksh1 = ksh0 + TILE; - int lsh1 = lsh0 + TILE; - ShellQuartet sq; - for (int ish = ish0; ish < ish1; ++ish) { - for (int jsh = jsh0; jsh < MIN(ish+1, jsh1); ++jsh) { - float q_ij = q_cond [ish*nbas+jsh]; - float d_ij = dm_cond[ish*nbas+jsh]; - int bas_ij = ish * nbas + jsh; - sq.i = ish; - sq.j = jsh; - for (int ksh = ksh0; ksh < MIN(ish+1, ksh1); ++ksh) { - float d_ik = dm_cond[ish*nbas+ksh]; - float d_jk = dm_cond[jsh*nbas+ksh]; - for (int lsh = lsh0; lsh < MIN(ksh+1, lsh1); ++lsh) { - int bas_kl = ksh * nbas + lsh; - if (bas_ij < bas_kl) { - continue; - } - float q_ijkl = q_ij + 
q_cond[ksh*nbas+lsh]; - if (q_ijkl < cutoff) { - continue; - } - float d_cutoff = cutoff - q_ijkl; - if ((do_k && (d_ik+dm_cond[jsh*nbas+lsh] > d_cutoff || - d_jk+dm_cond[ish*nbas+lsh] > d_cutoff)) || - (do_j && d_ij+dm_cond[ksh*nbas+lsh] > d_cutoff)) { - sq.k = ksh; - sq.l = lsh; - shl_quartet_idx[offset] = sq; - ++offset; - } - } - } - } - } - } - return ntasks; -} - diff --git a/gpu4pyscf/lib/gvhf-rys/create_tasks_ip2.cu b/gpu4pyscf/lib/gvhf-rys/create_tasks_ip2.cu index ef62227a..df22b535 100644 --- a/gpu4pyscf/lib/gvhf-rys/create_tasks_ip2.cu +++ b/gpu4pyscf/lib/gvhf-rys/create_tasks_ip2.cu @@ -71,40 +71,35 @@ static int _fill_ejk_ip2_type2_tasks(ShellQuartet *shl_quartet_idx, } } - // https://developer.nvidia.com/gpugems/gpugems3/part-vi-gpu-computing/chapter-39-parallel-prefix-sum-scan-cuda - extern __shared__ int thread_offsets[]; - thread_offsets[t_id] = count; + extern __shared__ int cum_count[]; + cum_count[t_id] = count; // Up-sweep phase for (int stride = 1; stride < threads; stride *= 2) { __syncthreads(); int index = (t_id + 1) * stride * 2 - 1; if (index < threads) { - thread_offsets[index] += thread_offsets[index-stride]; + cum_count[index] += cum_count[index-stride]; } } __syncthreads(); - if (t_id == threads-1) { thread_offsets[threads-1] = 0; } // Down-sweep phase - for (int stride = threads/2; stride > 0; stride /= 2) { + for (int stride = threads/4; stride > 0; stride /= 2) { __syncthreads(); int index = (t_id + 1) * stride * 2 - 1; - if (index < threads) { - int temp = thread_offsets[index - stride]; - thread_offsets[index - stride] = thread_offsets[index]; - thread_offsets[index] += temp; + if (index + stride < threads) { + cum_count[index + stride] += cum_count[index]; } } __syncthreads(); - __shared__ int ntasks; - if (t_id == threads-1) { - ntasks = thread_offsets[threads-1] + count; - } - __syncthreads(); + int ntasks = cum_count[threads-1]; if (ntasks == 0) { return ntasks; } - int offset = thread_offsets[t_id]; + int offset = 0; + if (t_id > 0) { + offset = cum_count[t_id-1]; + } for (int t_kl_id = t_kl0+t_id; t_kl_id < t_kl1; t_kl_id += threads) { int tile_kl = tile_kl_mapping[t_kl_id]; if (tile_q_ij + tile_q_cond[tile_kl] < cutoff) { @@ -218,40 +213,35 @@ static int _fill_ejk_ip2_type3_tasks(ShellQuartet *shl_quartet_idx, } } - // https://developer.nvidia.com/gpugems/gpugems3/part-vi-gpu-computing/chapter-39-parallel-prefix-sum-scan-cuda - extern __shared__ int thread_offsets[]; - thread_offsets[t_id] = count; + extern __shared__ int cum_count[]; + cum_count[t_id] = count; // Up-sweep phase for (int stride = 1; stride < threads; stride *= 2) { __syncthreads(); int index = (t_id + 1) * stride * 2 - 1; if (index < threads) { - thread_offsets[index] += thread_offsets[index-stride]; + cum_count[index] += cum_count[index-stride]; } } __syncthreads(); - if (t_id == threads-1) { thread_offsets[threads-1] = 0; } // Down-sweep phase - for (int stride = threads/2; stride > 0; stride /= 2) { + for (int stride = threads/4; stride > 0; stride /= 2) { __syncthreads(); int index = (t_id + 1) * stride * 2 - 1; - if (index < threads) { - int temp = thread_offsets[index - stride]; - thread_offsets[index - stride] = thread_offsets[index]; - thread_offsets[index] += temp; + if (index + stride < threads) { + cum_count[index + stride] += cum_count[index]; } } __syncthreads(); - __shared__ int ntasks; - if (t_id == threads-1) { - ntasks = thread_offsets[threads-1] + count; - } - __syncthreads(); + int ntasks = cum_count[threads-1]; if (ntasks == 0) { return ntasks; } - int offset 
= thread_offsets[t_id]; + int offset = 0; + if (t_id > 0) { + offset = cum_count[t_id-1]; + } for (int t_kl_id = t_kl0+t_id; t_kl_id < t_kl1; t_kl_id += threads) { int tile_kl = tile_kl_mapping[t_kl_id]; if (tile_q_ij + tile_q_cond[tile_kl] < cutoff) { diff --git a/gpu4pyscf/lib/gvhf-rys/rys_contract_jk.cu b/gpu4pyscf/lib/gvhf-rys/rys_contract_jk.cu index 1b2b79b3..6cbd22a5 100644 --- a/gpu4pyscf/lib/gvhf-rys/rys_contract_jk.cu +++ b/gpu4pyscf/lib/gvhf-rys/rys_contract_jk.cu @@ -23,6 +23,9 @@ #include "rys_roots.cu" #include "create_tasks.cu" +// TODO: benchmark performance for 34, 36, 41, 43, 45, 47, 51, 57 +#define GOUT_WIDTH 42 + __device__ static void rys_jk_general(RysIntEnvVars envs, JKMatrix jk, BoundsInfo bounds, ShellQuartet *shl_quartet_idx, int ntasks) @@ -69,7 +72,7 @@ static void rys_jk_general(RysIntEnvVars envs, JKMatrix jk, BoundsInfo bounds, double *g = rw + nsq_per_block * nroots*2; double *Rpa_cicj = g + nsq_per_block * g_size*3; double Rqc[3], Rpq[3]; - double gout[GWIDTH]; + double gout[GOUT_WIDTH]; for (int task0 = 0; task0 < ntasks; task0 += nsq_per_block) { __syncthreads(); @@ -126,9 +129,10 @@ static void rys_jk_general(RysIntEnvVars envs, JKMatrix jk, BoundsInfo bounds, double Kab = exp(-theta_ij * (xjxi*xjxi+yjyi*yjyi+zjzi*zjzi)); Rpa[sq_id+3*nsq_per_block] = fac_sym * ci[ip] * cj[jp] * Kab; } - for (int gout_start = 0; gout_start < nfij*nfkl; gout_start+=gout_stride*GWIDTH) { + for (int gout_start = 0; gout_start < nfij*nfkl; + gout_start+=gout_stride*GOUT_WIDTH) { #pragma unroll - for (int n = 0; n < GWIDTH; ++n) { gout[n] = 0; } + for (int n = 0; n < GOUT_WIDTH; ++n) { gout[n] = 0; } for (int klp = 0; klp < kprim*lprim; ++klp) { int kp = klp / lprim; @@ -197,11 +201,6 @@ static void rys_jk_general(RysIntEnvVars envs, JKMatrix jk, BoundsInfo bounds, } double rt = rw[sq_id + irys*2*nsq_per_block]; double rt_aa = rt / (aij + akl); - double rt_aij = rt_aa * akl; - double rt_akl = rt_aa * aij; - double b00 = .5 * rt_aa; - double b10 = .5/aij * (1 - rt_aij); - double b01 = .5/akl * (1 - rt_akl); // TRR //for i in range(lij): @@ -211,6 +210,8 @@ static void rys_jk_general(RysIntEnvVars envs, JKMatrix jk, BoundsInfo bounds, // trr(i,k+1) = c0p * trr(i,k) + k*b01 * trr(i,k-1) + i*b00 * trr(i-1,k) if (lij > 0) { __syncthreads(); + double rt_aij = rt_aa * akl; + double b10 = .5/aij * (1 - rt_aij); // gx(0,n+1) = c0*gx(0,n) + n*b10*gx(0,n-1) for (int n = gout_id; n < 3; n += gout_stride) { double *_gx = g + n * g_size * nsq_per_block; @@ -230,6 +231,9 @@ static void rys_jk_general(RysIntEnvVars envs, JKMatrix jk, BoundsInfo bounds, if (lkl > 0) { int lij3 = (lij+1)*3; + double rt_akl = rt_aa * aij; + double b00 = .5 * rt_aa; + double b01 = .5/akl * (1 - rt_akl); for (int n = gout_id; n < lij3+gout_id; n += gout_stride) { __syncthreads(); int i = n / 3; //for i in range(lij+1): @@ -315,7 +319,7 @@ static void rys_jk_general(RysIntEnvVars envs, JKMatrix jk, BoundsInfo bounds, double *gy = gx + nsq_per_block * g_size; double *gz = gy + nsq_per_block * g_size; #pragma unroll - for (int n = 0; n < GWIDTH; ++n) { + for (int n = 0; n < GOUT_WIDTH; ++n) { int ijkl = (gout_start + n*gout_stride+gout_id); int kl = ijkl / nfij; int ij = ijkl % nfij; @@ -338,7 +342,7 @@ static void rys_jk_general(RysIntEnvVars envs, JKMatrix jk, BoundsInfo bounds, int do_k = vk != NULL; for (int i_dm = 0; i_dm < jk.n_dm; ++i_dm) { #pragma unroll - for (int n = 0; n < GWIDTH; ++n) { + for (int n = 0; n < GOUT_WIDTH; ++n) { int ijkl = (gout_start + n*gout_stride+gout_id); int kl = ijkl / nfij; int ij = 
ijkl % nfij; @@ -422,7 +426,7 @@ static void rys_sr_jk_general(RysIntEnvVars envs, JKMatrix jk, BoundsInfo bounds double *g = rw + nsq_per_block * nroots*2; double *Rpa_cicj = g + nsq_per_block * g_size*3; double Rqc[3], Rpq[3]; - double gout[GWIDTH]; + double gout[GOUT_WIDTH]; for (int task0 = 0; task0 < ntasks; task0 += nsq_per_block) { __syncthreads(); @@ -479,9 +483,10 @@ static void rys_sr_jk_general(RysIntEnvVars envs, JKMatrix jk, BoundsInfo bounds double Kab = exp(-theta_ij * (xjxi*xjxi+yjyi*yjyi+zjzi*zjzi)); Rpa[sq_id+3*nsq_per_block] = fac_sym * ci[ip] * cj[jp] * Kab; } - for (int gout_start = 0; gout_start < nfij*nfkl; gout_start+=gout_stride*GWIDTH) { + for (int gout_start = 0; gout_start < nfij*nfkl; + gout_start+=gout_stride*GOUT_WIDTH) { #pragma unroll - for (int n = 0; n < GWIDTH; ++n) { gout[n] = 0; } + for (int n = 0; n < GOUT_WIDTH; ++n) { gout[n] = 0; } for (int klp = 0; klp < kprim*lprim; ++klp) { int kp = klp / lprim; @@ -669,7 +674,7 @@ static void rys_sr_jk_general(RysIntEnvVars envs, JKMatrix jk, BoundsInfo bounds double *gy = gx + nsq_per_block * g_size; double *gz = gy + nsq_per_block * g_size; #pragma unroll - for (int n = 0; n < GWIDTH; ++n) { + for (int n = 0; n < GOUT_WIDTH; ++n) { int ijkl = gout_start + n*gout_stride+gout_id; int kl = ijkl / nfij; int ij = ijkl % nfij; @@ -692,7 +697,7 @@ static void rys_sr_jk_general(RysIntEnvVars envs, JKMatrix jk, BoundsInfo bounds int do_k = vk != NULL; for (int i_dm = 0; i_dm < jk.n_dm; ++i_dm) { #pragma unroll - for (int n = 0; n < GWIDTH; ++n) { + for (int n = 0; n < GOUT_WIDTH; ++n) { int ijkl = (gout_start + n*gout_stride+gout_id); int kl = ijkl / nfij; int ij = ijkl % nfij; diff --git a/gpu4pyscf/lib/gvhf-rys/rys_jk_driver.cu b/gpu4pyscf/lib/gvhf-rys/rys_jk_driver.cu index 04c6d3ee..ba3c14a5 100644 --- a/gpu4pyscf/lib/gvhf-rys/rys_jk_driver.cu +++ b/gpu4pyscf/lib/gvhf-rys/rys_jk_driver.cu @@ -201,7 +201,7 @@ int RYS_build_jk(double *vj, double *vk, double *dm, int n_dm, int nao, int gout_stride = scheme[1]; int ij_prims = iprim * jprim; dim3 threads(quartets_per_block, gout_stride); - int buflen = (nroots*4 + g_size*3 + ij_prims*4) * quartets_per_block;// + ij_prims*4*TILE2; + int buflen = (nroots*2 + g_size*3 + ij_prims*4) * quartets_per_block;// + ij_prims*4*TILE2; rys_sr_jk_kernel<<>>(envs, jk, bounds, pool, batch_head); } cudaError_t err = cudaGetLastError(); @@ -329,7 +329,7 @@ int RYS_per_atom_jk_ip1(double *ejk, double j_factor, double k_factor, int ij_prims = iprim * jprim; dim3 threads(quartets_per_block, gout_stride); int buflen = (nroots*2 + g_size*3 + ij_prims*4) * quartets_per_block; - buflen = MAX(buflen, 9*gout_stride*quartets_per_block); + buflen = MAX(buflen, 12*gout_stride*quartets_per_block); rys_ejk_ip1_kernel<<>>(envs, jk, bounds, pool, batch_head); } cudaError_t err = cudaGetLastError(); diff --git a/gpu4pyscf/lib/logger.py b/gpu4pyscf/lib/logger.py index c715976e..54713c43 100644 --- a/gpu4pyscf/lib/logger.py +++ b/gpu4pyscf/lib/logger.py @@ -17,9 +17,6 @@ import cupy from pyscf import lib -from pyscf.lib import parameters as param -import pyscf.__config__ - INFO = lib.logger.INFO NOTE = lib.logger.NOTE WARN = lib.logger.WARN @@ -29,66 +26,63 @@ TIMER_LEVEL = lib.logger.TIMER_LEVEL flush = lib.logger.flush -if sys.version_info < (3, 0): - process_clock = time.clock - perf_counter = time.time -else: - process_clock = time.process_time - perf_counter = time.perf_counter +process_clock = time.process_time +perf_counter = time.perf_counter def init_timer(rec): - if rec.verbose >= TIMER_LEVEL: - 
e0 = cupy.cuda.Event() - e0.record() - return (process_clock(), perf_counter(), e0) - elif rec.verbose >= DEBUG: - return (process_clock(), perf_counter()) - else: - return process_clock(), + e0 = cupy.cuda.Event() + e0.record() + return (process_clock(), perf_counter(), e0) def timer(rec, msg, cpu0=None, wall0=None, gpu0=None): - if cpu0 is None: - cpu0 = rec._t0 - if wall0 and gpu0: - rec._t0, rec._w0, rec._e0 = process_clock(), perf_counter(), cupy.cuda.Event() + if gpu0: + t0, w0, e0 = process_clock(), perf_counter(), cupy.cuda.Event() + e0.record() if rec.verbose >= TIMER_LEVEL: - rec._e0.record() - rec._e0.synchronize() - + e0.synchronize() flush(rec, ' CPU time for %-50s %9.2f sec, wall time %9.2f sec, GPU time %9.2f ms' - % (msg, rec._t0-cpu0, rec._w0-wall0, cupy.cuda.get_elapsed_time(gpu0,rec._e0))) - return rec._t0, rec._w0, rec._e0 + % (msg, t0-cpu0, w0-wall0, cupy.cuda.get_elapsed_time(gpu0,e0))) + return t0, w0, e0 elif wall0: - rec._t0, rec._w0 = process_clock(), perf_counter() + t0, w0 = process_clock(), perf_counter() if rec.verbose >= TIMER_LEVEL: flush(rec, ' CPU time for %s %9.2f sec, wall time %9.2f sec' - % (msg, rec._t0-cpu0, rec._w0-wall0)) - return rec._t0, rec._w0 + % (msg, t0-cpu0, w0-wall0)) + return t0, w0 else: - rec._t0 = process_clock() + t0 = process_clock() if rec.verbose >= TIMER_LEVEL: - flush(rec, ' CPU time for %s %9.2f sec' % (msg, rec._t0-cpu0)) - return rec._t0, + flush(rec, ' CPU time for %s %9.2f sec' % (msg, t0-cpu0)) + return t0, def _timer_debug1(rec, msg, cpu0=None, wall0=None, gpu0=None, sync=True): if rec.verbose >= DEBUG1: return timer(rec, msg, cpu0, wall0, gpu0) - elif wall0 and gpu0: - rec._t0, rec._w0, rec._e0 = process_clock(), perf_counter(), cupy.cuda.Event() - rec._e0.record() - return rec._t0, rec._w0, rec._e0 + elif gpu0: + t0, w0, e0 = process_clock(), perf_counter(), cupy.cuda.Event() + e0.record() + return t0, w0, e0 elif wall0: - rec._t0, rec._w0 = process_clock(), perf_counter() - return rec._t0, rec._w0 + t0, w0 = process_clock(), perf_counter() + return t0, w0 else: - rec._t0 = process_clock() - return rec._t0, + t0 = process_clock() + return t0, def _timer_debug2(rec, msg, cpu0=None, wall0=None, gpu0=None, sync=True): if rec.verbose >= DEBUG2: return timer(rec, msg, cpu0, wall0, gpu0) - return cpu0, wall0, gpu0 + elif gpu0: + t0, w0, e0 = process_clock(), perf_counter(), cupy.cuda.Event() + e0.record() + return t0, w0, e0 + elif wall0: + t0, w0 = process_clock(), perf_counter() + return t0, w0 + else: + t0 = process_clock() + return t0, info = lib.logger.info note = lib.logger.note diff --git a/gpu4pyscf/lib/memcpy.py b/gpu4pyscf/lib/memcpy.py index c961a9a2..ce52046a 100644 --- a/gpu4pyscf/lib/memcpy.py +++ b/gpu4pyscf/lib/memcpy.py @@ -15,6 +15,27 @@ import cupy import numpy as np +from gpu4pyscf.__config__ import _p2p_access + +__all__ = ['p2p_transfer', 'copy_array'] + +def p2p_transfer(a, b): + ''' If the direct P2P data transfer is not available, transfer data via CPU memory + ''' + if a.device == b.device: + a[:] = b + elif _p2p_access: + a[:] = b + ''' + elif a.strides == b.strides and a.flags.c_contiguous and a.dtype == b.dtype: + # cupy supports a direct copy from different devices without p2p. 
See also + # https://github.com/cupy/cupy/blob/v13.3.0/cupy/_core/_routines_indexing.pyx#L48 + # https://github.com/cupy/cupy/blob/v13.3.0/cupy/_core/_routines_indexing.pyx#L1015 + a[:] = b + ''' + else: + copy_array(b, a) + return a def find_contiguous_chunks(shape, h_strides, d_strides): """ diff --git a/gpu4pyscf/lib/multi_gpu.py b/gpu4pyscf/lib/multi_gpu.py new file mode 100644 index 00000000..f9e1e8ee --- /dev/null +++ b/gpu4pyscf/lib/multi_gpu.py @@ -0,0 +1,153 @@ +# Copyright 2025 The PySCF Developers. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from concurrent.futures import ThreadPoolExecutor +import cupy as cp +import numpy as np +from pyscf.lib import prange +from gpu4pyscf.lib.memcpy import p2p_transfer +from gpu4pyscf.__config__ import num_devices + +def run(func, args=(), kwargs={}, non_blocking=False): + '''Execute a function on each GPU. + + Kwargs: + non_blocking: If `True`, functions are executed in parallel using multi-threads. + ''' + if num_devices == 1: + return [func(*args, *kwargs)] + + def proc(device_id): + with cp.cuda.Device(device_id): + return func(*args, **kwargs) + + if not non_blocking: + return [proc(i) for i in range(num_devices)] + + with ThreadPoolExecutor(max_workers=num_devices) as ex: + futures = [ex.submit(proc, i) for i in range(num_devices)] + return [fut.result() for fut in futures] + +def map(func, tasks, args=(), kwargs={}, schedule='dynamic') -> list: + '''Distributes tasks to multiple GPU devices for parallel computation. + + Kwargs: + schedule: controls how the tasks are distributed. Can be 'static' or 'dynamic'. + If 'static', tasks are distributed in the round-robin fashion; + If 'dynamic', tasks are scheduled dynamically, with better load balance. + ''' + if num_devices == 1: + return [func(t, *args, *kwargs) for t in tasks] + + tasks = list(enumerate(tasks)) + result = [None] * len(tasks) + + def consumer(): + if schedule == 'dynamic': + stream = cp.cuda.stream.get_current_stream() + while tasks: + try: + key, t = tasks.pop() + except IndexError: + return + result[key] = func(t, *args, **kwargs) + stream.synchronize() + else: + device_id = cp.cuda.device.get_device_id() + for key, t in tasks[device_id::num_devices]: + result[key] = func(t, *args, **kwargs) + + run(consumer, non_blocking=True) + return result + +def reduce(func, tasks, args=(), kwargs={}, schedule='dynamic'): + '''Processes tasks on multiple GPU devices and returns the sum of the results. 
+ ''' + result = map(func, tasks, args, kwargs) + dtype = cp.result_type(*result) + if num_devices == 1: + out = result[0].astype(dtype=dtype, copy=False) + for r in result[1:]: + out += r + return out + + groups = [None] * num_devices + for r in result: + device_id = r.device.id + if groups[device_id] is None: + groups[device_id] = r.astype(dtype, copy=False) + else: + groups[device_id] += r + + for i in num_devices: + if groups[i] is None: + groups[i] = cp.zeros(result[0].shape, dtype=dtype) + return array_reduce(groups, inplace=True) + +def array_broadcast(a): + '''Broadcast a cupy ndarray to all devices, return a list of cupy ndarrays. + ''' + if num_devices == 1: + return [a] + + out = [None] * num_devices + out[0] = a + + # Tree broadcast + step = num_devices >> 1 + while step > 0: + for device_id in range(0, num_devices, 2*step): + if device_id + step < num_devices: + with cp.cuda.Device(device_id+step): + out[device_id+step] = dst = cp.empty_like(a) + p2p_transfer(dst, a) + step >>= 1 + return out + +def array_reduce(array_list, inplace=False): + '''The sum of cupy ndarrays from all devices to device 0. + ''' + assert len(array_list) == num_devices + if num_devices == 1: + return array_list[0] + + a0 = array_list[0] + out_shape = a0.shape + size = a0.size + dtype = a0.dtype + assert all(x.dtype == dtype for x in array_list) + + array_list = list(array_list) + for device_id in range(num_devices): + with cp.cuda.Device(device_id): + if inplace or device_id % 2 == 1: + array_list[device_id] = array_list[device_id].ravel() + else: + array_list[device_id] = array_list[device_id].copy().ravel() + + blksize = 1024*1024*1024 // dtype.itemsize # 1GB + # Tree-reduce + step = 1 + while step < num_devices: + for device_id in range(0, num_devices, 2*step): + if device_id + step < num_devices: + with cp.cuda.Device(device_id): + dst = array_list[device_id] + src = array_list[device_id+step] + buf = cp.empty_like(dst[:blksize]) + for p0, p1 in prange(0, size, blksize): + dst[p0:p1] += p2p_transfer(buf[:p1-p0], src[p0:p1]) + step *= 2 + return array_list[0].reshape(out_shape) diff --git a/gpu4pyscf/lib/pbc/CMakeLists.txt b/gpu4pyscf/lib/pbc/CMakeLists.txt index a961cbc2..f8d7a842 100644 --- a/gpu4pyscf/lib/pbc/CMakeLists.txt +++ b/gpu4pyscf/lib/pbc/CMakeLists.txt @@ -2,6 +2,7 @@ set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --ptxas-options=-v")# -maxrregcount=12 add_library(pbc SHARED pbc_driver.cu ft_ao.cu unrolled_ft_ao.cu + fill_int3c2e.cu unrolled_int3c2e.cu ) set_target_properties(pbc PROPERTIES diff --git a/gpu4pyscf/lib/pbc/fill_int3c2e.cu b/gpu4pyscf/lib/pbc/fill_int3c2e.cu new file mode 100644 index 00000000..55aa3fcc --- /dev/null +++ b/gpu4pyscf/lib/pbc/fill_int3c2e.cu @@ -0,0 +1,702 @@ +/* + * Copyright 2024-2025 The PySCF Developers. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include + +#include "gvhf-rys/vhf.cuh" +#include "rys_roots.cu" +#include "int3c2e.cuh" + +#define THREADS (WARP_SIZE*WARPS) +// TODO: benchmark performance for 32, 38, 40, 45, 54 +#define GOUT_WIDTH 45 + +__global__ +void pbc_int3c2e_kernel(double *out, PBCInt3c2eEnvVars envs, PBCInt3c2eBounds bounds) +{ + int nksh_per_block = blockDim.x; + int gout_stride = blockDim.y; + int nsp_per_block = blockDim.z; + int ksh_id = threadIdx.x; + int gout_id = threadIdx.y; + int sp_id = threadIdx.z; + int sp_block_id = blockIdx.x; + int ksh_block_id = blockIdx.y; + + int nksp_per_block = nksh_per_block * nsp_per_block; + int ksp_id = nksh_per_block * sp_id + ksh_id; + int thread_id = (threadIdx.z * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x; + int warp_id = thread_id / WARP_SIZE; + int nimgs = envs.nimgs; + int sp0_this_block = sp_block_id * nsp_per_block * SPTAKS_PER_BLOCK; + int ksh0_this_block = ksh_block_id * nksh_per_block; + int nksh = MIN(bounds.nksh - ksh0_this_block, nksh_per_block); + int ksh0 = ksh0_this_block + bounds.ksh0; + + int li = bounds.li; + int lj = bounds.lj; + int lk = bounds.lk; + int lij = li + lj; + int nroots = bounds.nroots; + int nfi = bounds.nfi; + int nfij = bounds.nfij; + int nfk = bounds.nfk; + int iprim = bounds.iprim; + int jprim = bounds.jprim; + int kprim = bounds.kprim; + int ijprim = iprim * jprim; + int ijkprim = ijprim * kprim; + int stride_j = bounds.stride_j; + int stride_k = bounds.stride_k; + int g_size = bounds.g_size; + int *idx_ij = c_g_pair_idx + c_g_pair_offsets[li*LMAX1+lj]; + int *idy_ij = idx_ij + nfij; + int *idz_ij = idy_ij + nfij; + int lk_offset = lk * (lk + 1) * (lk + 2) / 2; + int *idx_k = c_g_cart_idx + lk_offset; + int *idy_k = idx_k + nfk; + int *idz_k = idy_k + nfk; + int *bas = envs.bas; + double *env = envs.env; + double *img_coords = envs.img_coords; + int *img_idx = bounds.img_idx; + int *sp_img_offsets = bounds.img_offsets; + double omega = env[PTR_RANGE_OMEGA]; + + int gx_len = g_size * nksp_per_block; + extern __shared__ double rw_buffer[]; + double *rw = rw_buffer + ksp_id; + double *g = rw + nksp_per_block * nroots*2; + double *gx = g; + double *gy = gx + gx_len; + double *gz = gy + gx_len; + double *rjri = gz + gx_len; + double *Rpq = rjri + nksp_per_block * 3; + __shared__ int img_counts_in_warp[WARPS]; + double gout[GOUT_WIDTH]; + + int ntasks = nksh * nsp_per_block * SPTAKS_PER_BLOCK; + for (int task_id = 0; task_id < ntasks; task_id += nksp_per_block) { + // convert task_id to ish, jsh, ksh + int ijk_idx = task_id + ksp_id; + int ksh = ijk_idx % nksh + ksh0; + int pair_ij_idx = ijk_idx / nksh + sp0_this_block; + int img1 = 1; + int pair_ij = pair_ij_idx; + if (pair_ij_idx >= bounds.npairs_ij) { + pair_ij = sp0_this_block; + } else { + img1 = sp_img_offsets[pair_ij_idx+1]; + } + int bas_ij = bounds.bas_ij_idx[pair_ij]; + int img0 = sp_img_offsets[pair_ij]; + int thread_id_in_warp = thread_id % WARP_SIZE; + if (thread_id_in_warp == 0) { + img_counts_in_warp[warp_id] = 0; + } + atomicMax(&img_counts_in_warp[warp_id], img1-img0); + __syncthreads(); + + int nbas = envs.cell0_nbas * envs.bvk_ncells; + int ish = bas_ij / nbas; + int jsh = bas_ij % nbas; + double *expi = env + bas[ish*BAS_SLOTS+PTR_EXP]; + double *expj = env + bas[jsh*BAS_SLOTS+PTR_EXP]; + double *expk = env + bas[ksh*BAS_SLOTS+PTR_EXP]; + double *ci = env + bas[ish*BAS_SLOTS+PTR_COEFF]; + double *cj = env + bas[jsh*BAS_SLOTS+PTR_COEFF]; + double *ck = env + bas[ksh*BAS_SLOTS+PTR_COEFF]; + double *ri = env + 
bas[ish*BAS_SLOTS+PTR_BAS_COORD]; + double *rj = env + bas[jsh*BAS_SLOTS+PTR_BAS_COORD]; + double *rk = env + bas[ksh*BAS_SLOTS+PTR_BAS_COORD]; + + for (int gout_start = 0; gout_start < nfij*nfk; + gout_start+=gout_stride*GOUT_WIDTH) { +#pragma unroll + for (int n = 0; n < GOUT_WIDTH; ++n) { gout[n] = 0; } + + for (int ijkp = 0; ijkp < ijkprim; ++ijkp) { + int ijp = ijkp / kprim; + int kp = ijkp % kprim; + int ip = ijp / jprim; + int jp = ijp % jprim; + double ai = expi[ip]; + double aj = expj[jp]; + double ak = expk[kp]; + double aij = ai + aj; + double cijk = ci[ip] * cj[jp] * ck[kp]; + __syncthreads(); + if (gout_id == 0) { + double fac = PI_FAC * cijk / (aij*ak*sqrt(aij+ak)); + gy[0] = fac; + } + int img_counts = img_counts_in_warp[warp_id]; + for (int img = 0; img < img_counts; ++img) { + int img_id = img0 + img; + __syncthreads(); + if (img_id >= img1) { + // ensure the same number of images processed in the same warp + img_id = img0; + if (gout_id == 0) { + gy[0] = 0.; + } + } + int img_ij = img_idx[img_id]; + int iL = img_ij / nimgs; + int jL = img_ij % nimgs; + double xi = ri[0] + img_coords[iL*3+0]; + double yi = ri[1] + img_coords[iL*3+1]; + double zi = ri[2] + img_coords[iL*3+2]; + double xj = rj[0] + img_coords[jL*3+0]; + double yj = rj[1] + img_coords[jL*3+1]; + double zj = rj[2] + img_coords[jL*3+2]; + double xjxi = xj - xi; + double yjyi = yj - yi; + double zjzi = zj - zi; + double rr_ij = xjxi*xjxi + yjyi*yjyi + zjzi*zjzi; + double aj_aij = aj / aij; + double theta_ij = ai * aj_aij; + double Kab = theta_ij * rr_ij; + + double xij = xjxi * aj_aij + xi; + double yij = yjyi * aj_aij + yi; + double zij = zjzi * aj_aij + zi; + double xpq = xij - rk[0]; + double ypq = yij - rk[1]; + double zpq = zij - rk[2]; + double rr = xpq*xpq + ypq*ypq + zpq*zpq; + double theta = aij * ak / (aij + ak); + double omega2 = omega * omega; + double theta_fac = omega2 / (omega2 + theta); + double theta_rr = theta * rr; +// Somehow, this screening test does not filter out many integrals. +// More benchmarks are needed +#if 0 + __shared__ int8_t img_mask[WARPS]; + if (thread_id_in_warp == 0) { + img_mask[warp_id] = 0; + } + float Kab_f32 = Kab; + // IMPORTANT: run the screening test on each warp. + // When nksh_per_block*gout_stride>32, gout is evaluated across warps. + // If tests are skipped for some warps, g[xyz] vectors and + // gout on these warps will never be evaluated. These warps + // may proceeed to a wrong __syncthreads() barrier and + // produce wrong g[xyz]. 
+ float log_cutoff = envs.log_cutoff; + if ((thread_id_in_warp / nksh_per_block == 0) && + img0+img < img1 && 5.f+2.f*lij-Kab_f32 > log_cutoff) { + // check any not vanished integrals + float ai_f32 = ai; + float aj_f32 = aj; + float aij_f32 = aij; + float ak_f32 = ak; + float fi = ai_f32 / aij_f32; + float fj = aj_f32 / aij_f32; + // fac_guess = log(sqrt(2.x/(omega*sqrt(pi))) * ((2*li+1)*(2*lj+1)*(2*lk+1))**.5/(4*pi)**1.5) + // ~ between [0, 2] + float fac_guess = 1.f; + // fac in Eq 63 of arXiv:2302.11307 ~ log(ci*cj*ck * (pi^2/(aij*ak))**1.5) + float log_fac = logf(fabs(cijk)) + 3.434f - 1.5f*logf(aij_f32*ak_f32) + fac_guess; + float theta_fac_rr = (float)theta_fac * (float)theta_rr; + float rt_aa = sqrtf((float)rr) / (aij_f32+ak_f32) + 1e-9f; + float rt_aij = rt_aa * ak_f32; + float rt_akl = rt_aa * aij_f32; + float r = sqrtf((float)rr_ij); + float ti = fj * r + rt_aij; + float tj = fi * r + rt_aij; + float ti_fac = .5f*li * logf(ti*ti + .5f*li/aij_f32); + float tj_fac = .5f*lj * logf(tj*tj + .5f*lj/aij_f32); + float tk_fac = .5f*lk * logf(rt_akl*rt_akl + .5f*lk/ak_f32); + float estimator = log_fac + ti_fac + tj_fac + tk_fac - Kab_f32 - theta_fac_rr; + if (estimator > log_cutoff) { + img_mask[warp_id] = 1; + } + } + __syncthreads(); + if (img_mask[warp_id] == 0) { + continue; + } +#endif + if (gout_id == 0) { + rjri[0*nksp_per_block] = xjxi; + rjri[1*nksp_per_block] = yjyi; + rjri[2*nksp_per_block] = zjzi; + Rpq[0*nksp_per_block] = xpq; + Rpq[1*nksp_per_block] = ypq; + Rpq[2*nksp_per_block] = zpq; + gx[0] = exp(-Kab); + } + int _nroots = nroots/2; + rys_roots(_nroots, theta_rr, rw+nroots*nksp_per_block, + nksp_per_block, gout_id, gout_stride); + rys_roots(_nroots, theta_fac*theta_rr, rw, + nksp_per_block, gout_id, gout_stride); + __syncthreads(); + double sqrt_theta_fac = -sqrt(theta_fac); + for (int irys = gout_id; irys < _nroots; irys+=gout_stride) { + rw[ irys*2 *nksp_per_block] *= theta_fac; + rw[(irys*2+1)*nksp_per_block] *= sqrt_theta_fac; + } + double s0x, s1x, s2x; + for (int irys = 0; irys < nroots; ++irys) { + __syncthreads(); + if (gout_id == 0) { + gz[0] = rw[(irys*2+1)*nksp_per_block]; + } + double rt = rw[ irys*2 *nksp_per_block]; + double rt_aa = rt / (aij + ak); + + if (lij > 0) { + __syncthreads(); + double rt_aij = rt_aa * ak; + double b10 = .5/aij * (1 - rt_aij); + // gx(0,n+1) = c0*gx(0,n) + n*b10*gx(0,n-1) + for (int n = gout_id; n < 3; n += gout_stride) { + double *_gx = gx + n * gx_len; + double xpa = rjri[n*nksp_per_block] * aj_aij; + //double c0x = Rpa[ir] - rt_aij * Rpq[n]; + double c0x = xpa - rt_aij * Rpq[n*nksp_per_block]; + s0x = _gx[0]; + s1x = c0x * s0x; + _gx[nksp_per_block] = s1x; + for (int i = 1; i < lij; ++i) { + s2x = c0x * s1x + i * b10 * s0x; + _gx[(i+1)*nksp_per_block] = s2x; + s0x = s1x; + s1x = s2x; + } + } + } + + if (lk > 0) { + int lij3 = (lij+1)*3; + double rt_ak = rt_aa * aij; + double b00 = .5 * rt_aa; + double b01 = .5/ak * (1 - rt_ak ); + for (int n = gout_id; n < lij3+gout_id; n += gout_stride) { + __syncthreads(); + int i = n / 3; //for i in range(lij+1): + int _ix = n % 3; // TODO: remove _ix for nroots > 2 + double *_gx = gx + (i + _ix * g_size) * nksp_per_block; + double cpx = rt_ak * Rpq[_ix*nksp_per_block]; + //for i in range(lij+1): + // trr(i,1) = c0p * trr(i,0) + i*b00 * trr(i-1,0) + if (n < lij3) { + s0x = _gx[0]; + s1x = cpx * s0x; + if (i > 0) { + s1x += i * b00 * _gx[-nksp_per_block]; + } + _gx[stride_k*nksp_per_block] = s1x; + } + //for k in range(1, lk): + // for i in range(lij+1): + // trr(i,k+1) = cp * trr(i,k) + 
k*b01 * trr(i,k-1) + i*b00 * trr(i-1,k) + for (int k = 1; k < lk; ++k) { + __syncthreads(); + if (n < lij3) { + s2x = cpx*s1x + k*b01*s0x; + if (i > 0) { + s2x += i * b00 * _gx[(k*stride_k-1)*nksp_per_block]; + } + _gx[(k*stride_k+stride_k)*nksp_per_block] = s2x; + s0x = s1x; + s1x = s2x; + } + } + } + } + + // hrr + // g(i,j+1) = rirj * g(i,j) + g(i+1,j) + // g(...,k,l+1) = rkrl * g(...,k,l) + g(...,k+1,l) + if (lj > 0) { + __syncthreads(); + if (task_id < ntasks) { + int lk3 = (lk+1)*3; + for (int m = gout_id; m < lk3; m += gout_stride) { + int k = m / 3; + int _ix = m % 3; + double xjxi = rjri[_ix*nksp_per_block]; + double *_gx = g + (_ix*g_size + k*stride_k) * nksp_per_block; + for (int j = 0; j < lj; ++j) { + int ij = (lij-j) + j*stride_j; + s1x = _gx[ij*nksp_per_block]; + for (--ij; ij >= j*stride_j; --ij) { + s0x = _gx[ij*nksp_per_block]; + _gx[(ij+stride_j)*nksp_per_block] = s1x - xjxi * s0x; + s1x = s0x; + } + } + } + } + } + + __syncthreads(); +#pragma unroll + for (int n = 0; n < GOUT_WIDTH; ++n) { + int ijk = gout_start + n*gout_stride+gout_id; + int k = ijk / nfij; + int ij = ijk % nfij; + if (k >= nfk) break; + int addrx = (idx_ij[ij] + idx_k[k] * stride_k) * nksp_per_block; + int addry = (idy_ij[ij] + idy_k[k] * stride_k) * nksp_per_block; + int addrz = (idz_ij[ij] + idz_k[k] * stride_k) * nksp_per_block; + gout[n] += gx[addrx] * gy[addry] * gz[addrz]; + } + } + } + } + + if (pair_ij_idx < bounds.npairs_ij) { + int *ao_loc = envs.ao_loc; + int nbasp = envs.cell0_nbas; + int ncells = envs.bvk_ncells; + int cell_i = ish / nbasp; + int cell0_ish = ish % nbasp; + int cell_j = jsh / nbasp; + int cell0_jsh = jsh % nbasp; + int nrow = bounds.nrow; + int ncol = bounds.ncol; + size_t naux = bounds.naux; + int i0 = ao_loc[cell0_ish] - ao_loc[bounds.ish0]; + int j0 = ao_loc[cell0_jsh] - ao_loc[bounds.jsh0]; + int k0 = ao_loc[ksh] - ao_loc[bounds.ksh0]; + double *eri_tensor = out + (((cell_i * nrow + i0) * ncells + + cell_j) * ncol + j0) * naux + k0; + int nKj = ncells * ncol; + for (int n = 0; n < GOUT_WIDTH; ++n) { + int ijk = gout_start + n*gout_stride+gout_id; + size_t k = ijk / nfij; + size_t ij = ijk % nfij; + if (k >= nfk) break; + size_t i = ij % nfi; + size_t j = ij / nfi; + size_t addr = (i*nKj+j)*naux + k; + eri_tensor[addr] = gout[n]; + } + } + } + } +} + +__global__ +void sr_int3c2e_img_counts_kernel(int *img_counts, PBCInt3c2eEnvVars envs, + float *exps, float *log_coeff, float *aux_exps, + int ish0, int jsh0, int nish, int njsh) +{ + int Ki = blockIdx.x; + int Kj = blockIdx.y; + int cell_i = Ki / nish; + int cell_j = Kj / njsh; + int cell0_ish = Ki % nish + ish0; + int cell0_jsh = Kj % njsh + jsh0; + int nbasp = envs.cell0_nbas; + int ish = cell_i * nbasp + cell0_ish; + int jsh = cell_j * nbasp + cell0_jsh; + int ncells = envs.bvk_ncells; + int nKj = ncells * njsh; + int thread_id = threadIdx.x; + int threads = blockDim.x; + int nimgs = envs.nimgs; + int nimgs2 = nimgs * nimgs; + int cell0_natm = envs.cell0_natm; + int *atm = envs.atm; + int *bas = envs.bas; + double *env = envs.env; + double *img_coords = envs.img_coords; + extern __shared__ float x_cache[]; + float *y_cache = x_cache + cell0_natm; + float *z_cache = y_cache + cell0_natm; + for (int k = thread_id; k < cell0_natm; k += threads) { + double *rk = env + atm[k*ATM_SLOTS+PTR_COORD]; + x_cache[k] = rk[0]; + y_cache[k] = rk[1]; + z_cache[k] = rk[2]; + } + __syncthreads(); + + int li = bas[ANG_OF + ish0*BAS_SLOTS]; + int lj = bas[ANG_OF + jsh0*BAS_SLOTS]; + float ai = exps[cell0_ish]; + float aj = 
exps[cell0_jsh]; + float log_ci = log_coeff[cell0_ish]; + float log_cj = log_coeff[cell0_jsh]; + float aij = ai + aj; + float u = .5f / aij; + float fi = ai / aij; + float fj = aj / aij; + float theta_ij = ai * aj / aij; + float omega = env[PTR_RANGE_OMEGA]; + if (omega == 0) { + omega = 0.1f; + } + float omega2 = omega * omega; + double *ri = env + bas[ish*BAS_SLOTS+PTR_BAS_COORD]; + double *rj = env + bas[jsh*BAS_SLOTS+PTR_BAS_COORD]; + float xi = ri[0]; + float yi = ri[1]; + float zi = ri[2]; + float xj = rj[0]; + float yj = rj[1]; + float zj = rj[2]; + float log_cutoff = envs.log_cutoff; + + // fac_guess = log(sqrt(2.x/(omega*sqrt(pi))) * ((2*li+1)*(2*lj+1)*(2*lk+1))**.5/(4*pi)**1.5) + // ~ between [0, 2] + float fac_guess = .5f - logf(omega2)/4; + float log_fac = log_ci + log_cj + 1.717f - 1.5f*logf(aij) + fac_guess; + + int count = 0; + for (int ijL = thread_id; ijL < nimgs2; ijL += threads) { + int iL = ijL / nimgs; + int jL = ijL % nimgs; + float xiL = xi + img_coords[iL*3+0]; + float yiL = yi + img_coords[iL*3+1]; + float ziL = zi + img_coords[iL*3+2]; + float xjL = xj + img_coords[jL*3+0]; + float yjL = yj + img_coords[jL*3+1]; + float zjL = zj + img_coords[jL*3+2]; + float xjxi = xjL - xiL; + float yjyi = yjL - yiL; + float zjzi = zjL - ziL; + float xij = xjxi * fj + xiL; + float yij = yjyi * fj + yiL; + float zij = zjzi * fj + ziL; + float theta = (omega2 * aij) / (omega2 + aij); + float rr_min = 1e3f; + float theta_rr_min = 1e6f; + for (int k = 0; k < cell0_natm; ++k) { + float dx = xij - x_cache[k]; + float dy = yij - y_cache[k]; + float dz = zij - z_cache[k]; + float rr = dx * dx + dy * dy + dz * dz; + float ak = aux_exps[k]; + float theta_k = theta * ak / (theta + ak); + float theta_rr = theta_k * rr; + if (theta_rr < theta_rr_min) { + theta_rr_min = theta_rr; + rr_min = rr; + } + } + + // exp(- 1/(1/aij+1/ak+1/omega^2) * r_guess^2) < 1e-9 + // => ~ exp(- omega^2 * r_guess^2) < 1e-9 + // => r_guess > 5/omega + // 1/(1/aij+1/ak+1/omega^2)*r_guess/aij in Eq 64 of arXiv:2302.11307 + // ~ omega^2*r_guess/aij ~ omega/aij * 5.f + //float rt_aij = fabs(omega)/aij * 5.; + float rt_aij = omega2 * sqrtf(rr_min) / aij + 1e-9f; + float rr_ij = xjxi * xjxi + yjyi * yjyi + zjzi * zjzi; + float dr = sqrtf(rr_ij); + float dri = fj * dr + rt_aij; + float drj = fi * dr + rt_aij; + float dri_fac = .5f*li * logf(dri*dri + li*u); + float drj_fac = .5f*lj * logf(drj*drj + lj*u); + float estimator = log_fac + dri_fac + drj_fac - theta_ij*rr_ij - theta_rr_min; + if (estimator > log_cutoff) { + count += 1; + } + } + + extern __shared__ int counts[]; + counts[thread_id] = count; + __syncthreads(); + for (int stride = threads / 2; stride > 0; stride /= 2) { + if (thread_id < stride) { + counts[thread_id] += counts[thread_id + stride]; + } + __syncthreads(); + } + if (thread_id == 0) { + img_counts[Ki*nKj+Kj] = counts[0]; + } +} + +__global__ +void sr_int3c2e_img_idx_kernel(int *img_idx, int *img_offsets, int *bas_mapping, + PBCInt3c2eEnvVars envs, + float *exps, float *log_coeff, float *aux_exps, + int ish0, int jsh0, int nish, int njsh) +{ + int thread_id = threadIdx.x; + int threads = blockDim.x; + int ncells = envs.bvk_ncells; + int nKj = ncells * njsh; + int row_id = blockIdx.x; + int bas_ij = bas_mapping[row_id]; + int Ki = bas_ij / nKj; + int Kj = bas_ij % nKj; + int cell_i = Ki / nish; + int cell_j = Kj / njsh; + int cell0_ish = Ki % nish + ish0; + int cell0_jsh = Kj % njsh + jsh0; + int nbasp = envs.cell0_nbas; + int ish = cell_i * nbasp + cell0_ish; + int jsh = cell_j * nbasp + cell0_jsh; + 
int nimgs = envs.nimgs; + int nimgs2 = nimgs * nimgs; + int cell0_natm = envs.cell0_natm; + int *atm = envs.atm; + int *bas = envs.bas; + double *env = envs.env; + double *img_coords = envs.img_coords; + extern __shared__ int8_t mask[]; + uint16_t* cum_count = (uint16_t *)(mask + IMG_BLOCK); + float *x_cache = (float *)(cum_count + threads); + float *y_cache = x_cache + cell0_natm; + float *z_cache = y_cache + cell0_natm; + for (int k = thread_id; k < cell0_natm; k += threads) { + double *rk = env + atm[k*ATM_SLOTS+PTR_COORD]; + x_cache[k] = rk[0]; + y_cache[k] = rk[1]; + z_cache[k] = rk[2]; + } + for (int i = thread_id; i < IMG_BLOCK; i += threads) { + mask[i] = 0; + } + __syncthreads(); + + int li = bas[ANG_OF + ish0*BAS_SLOTS]; + int lj = bas[ANG_OF + jsh0*BAS_SLOTS]; + float ai = exps[cell0_ish]; + float aj = exps[cell0_jsh]; + float log_ci = log_coeff[cell0_ish]; + float log_cj = log_coeff[cell0_jsh]; + float aij = ai + aj; + float u = .5f / aij; + float fi = ai / aij; + float fj = aj / aij; + float theta_ij = ai * aj / aij; + float omega = env[PTR_RANGE_OMEGA]; + if (omega == 0) { + omega = 0.1f; + } + float omega2 = omega * omega; + double *ri = env + bas[ish*BAS_SLOTS+PTR_BAS_COORD]; + double *rj = env + bas[jsh*BAS_SLOTS+PTR_BAS_COORD]; + float xi = ri[0]; + float yi = ri[1]; + float zi = ri[2]; + float xj = rj[0]; + float yj = rj[1]; + float zj = rj[2]; + float log_cutoff = envs.log_cutoff; + + // fac_guess = log(sqrt(2.x/(omega*sqrt(pi))) * ((2*li+1)*(2*lj+1)*(2*lk+1))**.5/(4*pi)**1.5) + // ~ between [0, 2] + float fac_guess = .5f - logf(omega2)/4; + float log_fac = log_ci + log_cj + 1.717f - 1.5f*logf(aij) + fac_guess; + int offset_start = img_offsets[row_id]; + + for (int img_start = 0; img_start < nimgs2; img_start += IMG_BLOCK) { + int block_nimgs2 = MIN(IMG_BLOCK, nimgs2-img_start); + int bacth_size = (block_nimgs2 + threads - 1) / threads; + int ij0 = img_start + thread_id * bacth_size; + int ij1 = MIN(ij0 + bacth_size, nimgs2); + + int count = 0; + for (int ijL = ij0; ijL < ij1; ++ijL) { + int iL = ijL / nimgs; + int jL = ijL % nimgs; + float xiL = xi + img_coords[iL*3+0]; + float yiL = yi + img_coords[iL*3+1]; + float ziL = zi + img_coords[iL*3+2]; + float xjL = xj + img_coords[jL*3+0]; + float yjL = yj + img_coords[jL*3+1]; + float zjL = zj + img_coords[jL*3+2]; + float xjxi = xjL - xiL; + float yjyi = yjL - yiL; + float zjzi = zjL - ziL; + float xij = xjxi * fj + xiL; + float yij = yjyi * fj + yiL; + float zij = zjzi * fj + ziL; + float theta = (omega2 * aij) / (omega2 + aij); + float rr_min = 1e3f; + float theta_rr_min = 1e6f; + for (int k = 0; k < cell0_natm; ++k) { + float dx = xij - x_cache[k]; + float dy = yij - y_cache[k]; + float dz = zij - z_cache[k]; + float rr = dx * dx + dy * dy + dz * dz; + float ak = aux_exps[k]; + float theta_k = theta * ak / (theta + ak); + float theta_rr = theta_k * rr; + if (theta_rr < theta_rr_min) { + theta_rr_min = theta_rr; + rr_min = rr; + } + } + + // exp(- 1/(1/aij+1/ak+1/omega^2) * r_guess^2) < 1e-9 + // => ~ exp(- omega^2 * r_guess^2) < 1e-9 + // => r_guess > 5/omega + // 1/(1/aij+1/ak+1/omega^2)*r_guess/aij in Eq 64 of arXiv:2302.11307 + // ~ omega^2*r_guess/aij ~ omega/aij * 5.f + //float rt_aij = fabs(omega)/aij * 5.; + float rt_aij = omega2 * sqrtf(rr_min) / aij + 1e-9f; + float rr_ij = xjxi * xjxi + yjyi * yjyi + zjzi * zjzi; + float dr = sqrtf(rr_ij); + float dri = fj * dr + rt_aij; + float drj = fi * dr + rt_aij; + float dri_fac = .5f*li * logf(dri*dri + li*u); + float drj_fac = .5f*lj * logf(drj*drj + lj*u); + float 
estimator = log_fac + dri_fac + drj_fac - theta_ij*rr_ij - theta_rr_min; + if (estimator > log_cutoff) { + mask[ijL - img_start] = 1; + count += 1; + } + } + + cum_count[thread_id] = count; + // Up-sweep phase + for (int stride = 1; stride < threads; stride *= 2) { + __syncthreads(); + int index = (thread_id + 1) * stride * 2 - 1; + if (index < threads) { + cum_count[index] += cum_count[index-stride]; + } + } + __syncthreads(); + // Down-sweep phase + for (int stride = threads/4; stride > 0; stride /= 2) { + __syncthreads(); + int index = (thread_id + 1) * stride * 2 - 1; + if (index + stride < threads) { + cum_count[index + stride] += cum_count[index]; + } + } + __syncthreads(); + + int offset = offset_start; + if (thread_id > 0) { + offset += cum_count[thread_id-1]; + } + for (int ijL = ij0; ijL < ij1; ++ijL) { + if (mask[ijL-img_start]) { + img_idx[offset] = ijL; + mask[ijL-img_start] = 0; + ++offset; + } + } + offset_start += cum_count[threads-1]; + __syncthreads(); + } +} diff --git a/gpu4pyscf/lib/pbc/ft_ao.cu b/gpu4pyscf/lib/pbc/ft_ao.cu index d9b6d5e2..40438340 100644 --- a/gpu4pyscf/lib/pbc/ft_ao.cu +++ b/gpu4pyscf/lib/pbc/ft_ao.cu @@ -20,7 +20,7 @@ #include #include "gvhf-rys/vhf.cuh" -#include "ft_ao.h" +#include "ft_ao.cuh" #define GOUT_WIDTH 19 // pi^1.5 @@ -204,7 +204,7 @@ void ft_aopair_kernel(double *out, AFTIntEnvVars envs, AFTBoundsInfo bounds) #pragma unroll for (int n = 0; n < GOUT_WIDTH; ++n) { int ij = n*gout_stride + gout_id; - if (ij >= nfij) continue; + if (ij >= nfij) break; int addrx = idx_ij[ij] * nGv_per_block; int addry = idy_ij[ij] * nGv_per_block; int addrz = idz_ij[ij] * nGv_per_block; @@ -237,7 +237,7 @@ void ft_aopair_kernel(double *out, AFTIntEnvVars envs, AFTBoundsInfo bounds) + Gv_block_id*nGv_per_block + Gv_id) * OF_COMPLEX; for (int n = 0; n < GOUT_WIDTH; ++n) { int ij = n*gout_stride + gout_id; - if (ij >= nfij) continue; + if (ij >= nfij) break; size_t i = ij % nfi; size_t j = ij / nfi; size_t addr = (i*nao+j)*nGv; diff --git a/gpu4pyscf/lib/pbc/ft_ao.h b/gpu4pyscf/lib/pbc/ft_ao.cuh similarity index 100% rename from gpu4pyscf/lib/pbc/ft_ao.h rename to gpu4pyscf/lib/pbc/ft_ao.cuh diff --git a/gpu4pyscf/lib/pbc/int3c2e.cuh b/gpu4pyscf/lib/pbc/int3c2e.cuh new file mode 100644 index 00000000..746fa138 --- /dev/null +++ b/gpu4pyscf/lib/pbc/int3c2e.cuh @@ -0,0 +1,75 @@ +/* + * Copyright 2024 The PySCF Developers. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+#include <stdint.h>
+
+#define WARP_SIZE 32
+// corresponding to 256 threads
+#define WARPS 8
+#define IMG_MASK_SLOTS 1024
+#define L_AUX_MAX 6
+#define SPTAKS_PER_BLOCK 32
+#define IMG_BLOCK 16384
+
+#ifndef HAVE_DEFINED_PBCINT3CENVVARS_H
+#define HAVE_DEFINED_PBCINT3CENVVARS_H
+typedef struct {
+    uint16_t cell0_natm; // in the reference cell
+    uint16_t cell0_nbas; // in the reference cell
+    uint16_t bvk_ncells; // in bvk-cell
+    uint16_t nimgs; // number of images in lattice sum
+    int *atm;
+    int *bas;
+    double *env;
+    int *ao_loc; // in bvk-cell
+    double *img_coords; // vectors in lattice sum
+    float log_cutoff;
+} PBCInt3c2eEnvVars;
+
+typedef struct {
+    uint8_t li;
+    uint8_t lj;
+    uint8_t lk;
+    uint8_t nroots;
+    uint8_t nfi;
+    uint8_t nfij;
+    uint8_t nfk;
+    uint8_t iprim;
+    uint8_t jprim;
+    uint8_t kprim;
+    uint8_t stride_i;
+    uint8_t stride_j;
+    uint8_t stride_k;
+    uint8_t g_size;
+    uint16_t nrow;
+    uint16_t ncol;
+    uint16_t naux;
+    uint16_t nksh;
+    uint16_t ish0;
+    uint16_t jsh0;
+    uint16_t ksh0;
+    int npairs_ij;
+    int *bas_ij_idx;
+    int *img_idx; // indices of img_coords in each shell-pair
+    int *img_offsets; // offset img_idx for each shell-pair
+} PBCInt3c2eBounds;
+
+#ifdef __CUDACC__
+extern __constant__ int c_g_pair_idx[];
+extern __constant__ int c_g_pair_offsets[];
+extern __constant__ int c_g_cart_idx[];
+#endif
+#endif
diff --git a/gpu4pyscf/lib/pbc/pbc_driver.cu b/gpu4pyscf/lib/pbc/pbc_driver.cu
index b800efd6..45ab84cb 100644
--- a/gpu4pyscf/lib/pbc/pbc_driver.cu
+++ b/gpu4pyscf/lib/pbc/pbc_driver.cu
@@ -1,3 +1,19 @@
+/*
+ * Copyright 2024-2025 The PySCF Developers. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
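+// Host-side entry points for the PBC modules: launch wrappers for the AFT
+// (ft_ao) and int3c2e kernels, plus constant-memory initialization.
+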
 #include <stdio.h>
 #include <stdint.h>
 #include <stdlib.h>
@@ -5,23 +21,37 @@
 #include <cuda_runtime.h>
 #include "gvhf-rys/vhf.cuh"
-#include "ft_ao.h"
+#include "int3c2e.cuh"
+#include "ft_ao.cuh"
 
-__constant__ int c_g_pair_idx[3675];
+__constant__ int c_g_pair_idx[3675]; // corresponding to LMAX=4
 __constant__ int c_g_pair_offsets[LMAX1*LMAX1];
+__constant__ int c_g_cart_idx[252]; // corresponding to LMAX=6
 
 extern __global__
 void ft_aopair_kernel(double *out, AFTIntEnvVars envs, AFTBoundsInfo bounds);
 extern __global__
 void ft_aopair_fill_triu(double *out, int *conj_mapping, int bvk_ncells, int nGv);
+extern __global__
+void pbc_int3c2e_kernel(double *out, PBCInt3c2eEnvVars envs, PBCInt3c2eBounds bounds);
+extern __global__
+void sr_int3c2e_img_counts_kernel(int *img_counts, PBCInt3c2eEnvVars envs,
+                                  float *exps, float *log_coeff, float *aux_exps,
+                                  int ish0, int jsh0, int nish, int njsh);
+extern __global__
+void sr_int3c2e_img_idx_kernel(int *img_idx, int *img_offsets, int *bas_mapping,
+                               PBCInt3c2eEnvVars envs,
+                               float *exps, float *log_coeff, float *aux_exps,
+                               int ish0, int jsh0, int nish, int njsh);
 
 int ft_ao_unrolled(double *out, AFTIntEnvVars *envs, AFTBoundsInfo *bounds, int *scheme);
+int int3c2e_unrolled(double *out, PBCInt3c2eEnvVars *envs, PBCInt3c2eBounds *bounds);
 
 extern "C" {
-int PBC_build_ft_ao(double *out, AFTIntEnvVars *envs,
-                    int *scheme, int *shls_slice, int npairs_ij, int ngrids,
-                    int *ish_in_pair, int *jsh_in_pair, double *grids,
-                    int *atm, int natm, int *bas, int nbas, double *env)
+int build_ft_ao(double *out, AFTIntEnvVars *envs,
+                int *scheme, int *shls_slice, int npairs_ij, int ngrids,
+                int *ish_in_pair, int *jsh_in_pair, double *grids,
+                int *atm, int natm, int *bas, int nbas, double *env)
 {
     uint16_t ish0 = shls_slice[0];
     uint16_t jsh0 = shls_slice[2];
@@ -53,13 +83,13 @@
     }
     cudaError_t err = cudaGetLastError();
     if (err != cudaSuccess) {
-        fprintf(stderr, "CUDA Error in PBC_build_ft_ao: %s\n", cudaGetErrorString(err));
+        fprintf(stderr, "CUDA Error in build_ft_ao: %s\n", cudaGetErrorString(err));
         return 1;
     }
     return 0;
 }
 
-int PBC_ft_aopair_fill_triu(double *out, int *conj_mapping, int nao, int bvk_ncells, int nGv)
+int ft_aopair_fill_triu(double *out, int *conj_mapping, int nao, int bvk_ncells, int nGv)
 {
     int nGv2 = nGv * 2; // *2 for complex number
     int threads = 1024;
@@ -67,18 +97,147 @@
     ft_aopair_fill_triu<<>>(out, conj_mapping, bvk_ncells, nGv2);
     cudaError_t err = cudaGetLastError();
     if (err != cudaSuccess) {
-        fprintf(stderr, "CUDA Error in PBC_ft_aopair_fill_triu: %s\n", cudaGetErrorString(err));
+        fprintf(stderr, "CUDA Error in ft_aopair_fill_triu: %s\n", cudaGetErrorString(err));
+        return 1;
+    }
+    return 0;
+}
+
+int fill_int3c2e(double *out, PBCInt3c2eEnvVars *envs,
+                 int *scheme, int *shls_slice, int bvk_ncells,
+                 int nrow, int ncol, int naux, int npairs_ij,
+                 int *bas_ij_idx, int *img_idx, int *img_offsets,
+                 int *atm, int natm, int *bas, int nbas, double *env)
+{
+    uint16_t ish0 = shls_slice[0];
+    uint16_t jsh0 = shls_slice[2];
+    uint16_t ksh0 = shls_slice[4] + nbas;
+    uint16_t ksh1 = shls_slice[5] + nbas;
+    uint16_t nksh = ksh1 - ksh0;
+    uint8_t li = bas[ANG_OF + ish0*BAS_SLOTS];
+    uint8_t lj = bas[ANG_OF + jsh0*BAS_SLOTS];
+    uint8_t lk = bas[ANG_OF + ksh0*BAS_SLOTS];
+    uint8_t iprim = bas[NPRIM_OF + ish0*BAS_SLOTS];
+    uint8_t jprim = bas[NPRIM_OF + jsh0*BAS_SLOTS];
+    uint8_t kprim = bas[NPRIM_OF + ksh0*BAS_SLOTS];
+    uint8_t nfi = (li+1)*(li+2)/2;
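+    // Cartesian component counts per shell: nf = (l+1)(l+2)/2, i.e.
+    // l = 0,1,2,3,4 gives 1,3,6,10,15 functions.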
+    uint8_t nfj = (lj+1)*(lj+2)/2;
+    uint8_t nfk = (lk+1)*(lk+2)/2;
+    uint8_t nfij = nfi * nfj;
+    uint8_t order = li + lj + lk;
+    uint8_t nroots = order / 2 + 1;
+    double omega = env[PTR_RANGE_OMEGA];
+    if (omega < 0) { // SR ERIs
+        nroots *= 2;
+    }
+    uint8_t stride_i = 1;
+    uint8_t stride_j = li + 1;
+    uint8_t stride_k = stride_j * (lj + 1);
+    // up to (gg|i)
+    uint8_t g_size = stride_k * (lk + 1);
+    PBCInt3c2eBounds bounds = {li, lj, lk, nroots, nfi, nfij, nfk,
+        iprim, jprim, kprim, stride_i, stride_j, stride_k, g_size,
+        (uint16_t)nrow, (uint16_t)ncol, (uint16_t)naux, nksh, ish0, jsh0, ksh0,
+        npairs_ij, bas_ij_idx, img_idx, img_offsets};
+
+    if (!int3c2e_unrolled(out, envs, &bounds)) {
+        int nksh_per_block = scheme[0];
+        int gout_stride = scheme[1];
+        int nsp_per_block = scheme[2];
+        dim3 threads(nksh_per_block, gout_stride, nsp_per_block);
+        int tasks_per_block = SPTAKS_PER_BLOCK * nsp_per_block;
+        int sp_blocks = (npairs_ij + tasks_per_block - 1) / tasks_per_block;
+        int ksh_blocks = (nksh + nksh_per_block - 1) / nksh_per_block;
+        dim3 blocks(sp_blocks, ksh_blocks);
+        int buflen = (nroots*2+g_size*3+6) * (nksh_per_block * nsp_per_block) * sizeof(double);
+        pbc_int3c2e_kernel<<<blocks, threads, buflen>>>(out, *envs, bounds);
+    }
+    cudaError_t err = cudaGetLastError();
+    if (err != cudaSuccess) {
+        fprintf(stderr, "CUDA Error in fill_int3c2e: %s\n", cudaGetErrorString(err));
+        return 1;
+    }
+    return 0;
+}
+
+int int3c2e_img_counts(int *img_counts, PBCInt3c2eEnvVars *envs,
+                       int *shls_slice, float *exps, float *log_cs, float *aux_exps,
+                       int bvk_ncells, int cell0_natm)
+{
+    int ish0 = shls_slice[0];
+    int ish1 = shls_slice[1];
+    int jsh0 = shls_slice[2];
+    int jsh1 = shls_slice[3];
+    int nish = ish1 - ish0;
+    int njsh = jsh1 - jsh0;
+    dim3 blocks(bvk_ncells*nish, bvk_ncells*njsh);
+    int buflen = cell0_natm * 3 * sizeof(float);
+    int threads = 512;
+    buflen = MAX(buflen, threads*sizeof(int));
+    sr_int3c2e_img_counts_kernel<<<blocks, threads, buflen>>>(
+        img_counts, *envs, exps, log_cs, aux_exps, ish0, jsh0, nish, njsh);
+    cudaError_t err = cudaGetLastError();
+    if (err != cudaSuccess) {
+        fprintf(stderr, "CUDA Error in int3c2e_img_counts: %s\n", cudaGetErrorString(err));
         return 1;
     }
     return 0;
 }
 
-int PBC_FT_init_constant(int *g_pair_idx, int *offsets,
-                         double *env, int env_size, int shm_size)
+int int3c2e_img_idx(int *img_idx, int *img_offsets, int *bas_mapping, int nrow,
+                    PBCInt3c2eEnvVars *envs,
+                    int *shls_slice, float *exps, float *log_cs, float *aux_exps,
+                    int bvk_ncells, int cell0_natm)
+{
+    int ish0 = shls_slice[0];
+    int ish1 = shls_slice[1];
+    int jsh0 = shls_slice[2];
+    int jsh1 = shls_slice[3];
+    int nish = ish1 - ish0;
+    int njsh = jsh1 - jsh0;
+    dim3 blocks(bvk_ncells*nish, bvk_ncells*njsh);
+    int buflen = cell0_natm * 3 * sizeof(float);
+    int threads = 512;
+    buflen = buflen + threads*sizeof(uint16_t) + IMG_BLOCK;
+    sr_int3c2e_img_idx_kernel<<<blocks, threads, buflen>>>(
+        img_idx, img_offsets, bas_mapping, *envs,
+        exps, log_cs, aux_exps, ish0, jsh0, nish, njsh);
+    cudaError_t err = cudaGetLastError();
+    if (err != cudaSuccess) {
+        fprintf(stderr, "CUDA Error in int3c2e_img_idx: %s\n", cudaGetErrorString(err));
+        return 1;
+    }
+    return 0;
+}
+
+int init_constant(int *g_pair_idx, int *offsets,
+                  double *env, int env_size, int shm_size)
 {
     cudaMemcpyToSymbol(c_g_pair_idx, g_pair_idx, 3675*sizeof(int));
     cudaMemcpyToSymbol(c_g_pair_offsets, offsets, sizeof(int) * LMAX1*LMAX1);
+
+    int *g_cart_idx = (int *)malloc(252*sizeof(int));
+    int *idx, *idy, *idz;
+    idx = g_cart_idx;
+    for (int l = 0; l <= L_AUX_MAX; ++l) {
+        int nf = (l + 1) * (l + 2) / 2;
+        idy 
= idx + nf; + idz = idy + nf; + for (int i = 0, ix = l; ix >= 0; --ix) { + for (int iy = l - ix; iy >= 0; --iy, ++i) { + int iz = l - ix - iy; + idx[i] = ix; + idy[i] = iy; + idz[i] = iz; + } } + idx += nf * 3; + } + cudaMemcpyToSymbol(c_g_cart_idx, g_cart_idx, 252*sizeof(int)); + free(g_cart_idx); + cudaFuncSetAttribute(ft_aopair_kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, shm_size); + cudaFuncSetAttribute(pbc_int3c2e_kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, shm_size); cudaError_t err = cudaGetLastError(); if (err != cudaSuccess) { fprintf(stderr, "Failed to set CUDA shm size %d: %s\n", shm_size, diff --git a/gpu4pyscf/lib/pbc/rys_roots.cu b/gpu4pyscf/lib/pbc/rys_roots.cu new file mode 100644 index 00000000..a8700bd2 --- /dev/null +++ b/gpu4pyscf/lib/pbc/rys_roots.cu @@ -0,0 +1,84 @@ +/* + * Copyright 2024-2025 The PySCF Developers. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "gvhf-rys/rys_roots.cuh" + +#define SQRTPIE4 .8862269254527580136 +#define PIE4 .7853981633974483096 + +__device__ +static void rys_roots(int nroots, double x, double *rw, + int block_size, int worker_id, int workers) +{ + if (x < 3.e-7){ + int off = nroots * (nroots - 1) / 2; + for (int i = worker_id; i < nroots; i += workers) { + rw[(i*2 )*block_size] = ROOT_SMALLX_R0[off+i] + ROOT_SMALLX_R1[off+i] * x; + rw[(i*2+1)*block_size] = ROOT_SMALLX_W0[off+i] + ROOT_SMALLX_W1[off+i] * x; + } + return; + } + + if (nroots == 1) { + if (worker_id == 0) { + double tt = sqrt(x); + double fmt0 = SQRTPIE4 / tt * erf(tt); + rw[block_size] = fmt0; + double e = exp(-x); + double b = .5 / x; + double fmt1 = b * (fmt0 - e); + rw[0] = fmt1 / fmt0; + } + return; + } + + if (x > 35+nroots*5) { + int off = nroots * (nroots - 1) / 2; + double t = sqrt(PIE4/x); + for (int i = worker_id; i < nroots; i += workers) { + rw[(i*2 )*block_size] = ROOT_LARGEX_R_DATA[off+i] / x; + rw[(i*2+1)*block_size] = ROOT_LARGEX_W_DATA[off+i] * t; + } + return; + } + + double *datax = ROOT_RW_DATA + DEGREE1*INTERVALS * nroots*(nroots-1); + int it = (int)(x * .4); + double u = (x - it * 2.5) * 0.8 - 1.; + double u2 = u * 2.; + for (int rt_id = worker_id; rt_id < nroots*2; rt_id += workers) { + double *c = datax + rt_id * DEGREE1 * INTERVALS; + //for i in range(2, degree + 1): + // c0, c1 = c[degree-i] - c1, c0 + c1*u2 + double c0 = c[it + DEGREE *INTERVALS]; + double c1 = c[it +(DEGREE-1)*INTERVALS]; + double c2, c3; +#pragma unroll + for (int n = DEGREE-2; n > 0; n-=2) { + c2 = c[it + n *INTERVALS] - c1; + c3 = c0 + c1*u2; + c1 = c2 + c3*u2; + c0 = c[it +(n-1)*INTERVALS] - c3; + } + if (DEGREE % 2 == 0) { + c2 = c[it] - c1; + c3 = c0 + c1*u2; + rw[rt_id*block_size] = c2 + c3*u; + } else { + rw[rt_id*block_size] = c0 + c1*u; + } + } +} diff --git a/gpu4pyscf/lib/pbc/unrolled_ft_ao.cu b/gpu4pyscf/lib/pbc/unrolled_ft_ao.cu index d2845274..d95d22b2 100644 --- a/gpu4pyscf/lib/pbc/unrolled_ft_ao.cu +++ b/gpu4pyscf/lib/pbc/unrolled_ft_ao.cu @@ -1,5 +1,5 @@ /* - * Copyright 2024 The PySCF 
Developers. All Rights Reserved. + * Copyright 2024-2025 The PySCF Developers. All Rights Reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -19,7 +19,7 @@ #include #include #include "gvhf-rys/vhf.cuh" -#include "ft_ao.h" +#include "ft_ao.cuh" #define OVERLAP_FAC 5.56832799683170787 #define OF_COMPLEX 2 diff --git a/gpu4pyscf/lib/pbc/unrolled_int3c2e.cu b/gpu4pyscf/lib/pbc/unrolled_int3c2e.cu new file mode 100644 index 00000000..0c7e0174 --- /dev/null +++ b/gpu4pyscf/lib/pbc/unrolled_int3c2e.cu @@ -0,0 +1,22 @@ +/* + * Copyright 2024-2025 The PySCF Developers. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "int3c2e.cuh" + +int int3c2e_unrolled(double *out, PBCInt3c2eEnvVars *envs, PBCInt3c2eBounds *bounds) +{ + return 0; +} diff --git a/gpu4pyscf/lib/solvent/pcm.cu b/gpu4pyscf/lib/solvent/pcm.cu index 7615f314..4d34ce97 100644 --- a/gpu4pyscf/lib/solvent/pcm.cu +++ b/gpu4pyscf/lib/solvent/pcm.cu @@ -78,9 +78,9 @@ static void _pcm_d_s(double *matrix_d, double *matrix_s, __global__ static void _pcm_dD_dS(double *matrix_dd, double *matrix_ds, - const double *coords, const double *norm_vec, const double *r_vdw, - const double *charge_exp, const double *switch_fun, - int n) + const double *coords, const double *norm_vec, + const double *charge_exp, + int n) { int i = blockIdx.x * blockDim.x + threadIdx.x; int j = blockIdx.y * blockDim.y + threadIdx.y; @@ -130,6 +130,127 @@ static void _pcm_dD_dS(double *matrix_dd, double *matrix_ds, } } +__global__ +static void _pcm_d2D_d2S(double *matrix_d2D, double *matrix_d2S, + const double *coords, const double *norm_vec, + const double *charge_exp, + int n) +{ + const int i = blockIdx.x * blockDim.x + threadIdx.x; + const int j = blockIdx.y * blockDim.y + threadIdx.y; + if (i >= n || j >= n) { + return; + } + + // calculate xi + const double ei = charge_exp[i]; + const double ej = charge_exp[j]; + const double eij = ei * ej / sqrt(ei*ei + ej*ej); + + // calculate r + const double dx = coords[3*i] - coords[3*j]; + const double dy = coords[3*i+1] - coords[3*j+1]; + const double dz = coords[3*i+2] - coords[3*j+2]; + const double rij = norm3d(dx, dy, dz); + const double rij_1 = (i != j) ? 
(1.0 / rij) : 0.0; // This guarantees that if i == j, all matrix elements = 0 + const double rij_2 = rij_1 * rij_1; + const double rij_3 = rij_2 * rij_1; + const double rij_4 = rij_2 * rij_2; + const double rij_5 = rij_2 * rij_3; + const double eij2 = eij * eij; + + const double eij_rij = eij * rij; + const double erf_eij_rij = erf(eij_rij); + const double exp_minus_eij2_rij2 = exp(-eij_rij * eij_rij); + const double two_eij_over_sqrt_pi = 2.0 * eij / SQRT_PI; + const double two_eij_over_sqrt_pi_exp_minus_eij2_rij2 = exp_minus_eij2_rij2 * two_eij_over_sqrt_pi; + + const double S_direct_product_prefactor = -two_eij_over_sqrt_pi_exp_minus_eij2_rij2 * (3 * rij_4 + 2 * eij2 * rij_2) + + 3 * rij_5 * erf_eij_rij; + const double S_xyz_diagonal_prefactor = two_eij_over_sqrt_pi_exp_minus_eij2_rij2 * rij_2 - rij_3 * erf_eij_rij; + + const int n2 = n * n; + matrix_d2S[i*n + j ] = dx * dx * S_direct_product_prefactor + S_xyz_diagonal_prefactor; + matrix_d2S[i*n + j + n2 ] = dx * dy * S_direct_product_prefactor; + matrix_d2S[i*n + j + n2 * 2] = dx * dz * S_direct_product_prefactor; + matrix_d2S[i*n + j + n2 * 3] = dy * dx * S_direct_product_prefactor; + matrix_d2S[i*n + j + n2 * 4] = dy * dy * S_direct_product_prefactor + S_xyz_diagonal_prefactor; + matrix_d2S[i*n + j + n2 * 5] = dy * dz * S_direct_product_prefactor; + matrix_d2S[i*n + j + n2 * 6] = dz * dx * S_direct_product_prefactor; + matrix_d2S[i*n + j + n2 * 7] = dz * dy * S_direct_product_prefactor; + matrix_d2S[i*n + j + n2 * 8] = dz * dz * S_direct_product_prefactor + S_xyz_diagonal_prefactor; + + if (matrix_d2D != NULL) { + const double nxj = norm_vec[3*j]; + const double nyj = norm_vec[3*j+1]; + const double nzj = norm_vec[3*j+2]; + const double nj_rij = dx * nxj + dy * nyj + dz * nzj; + + const double eij4 = eij2 * eij2; + const double rij_6 = rij_4 * rij_2; + const double rij_7 = rij_4 * rij_3; + + const double D_direct_product_prefactor = (-two_eij_over_sqrt_pi_exp_minus_eij2_rij2 * (15 * rij_6 + 10 * eij2 * rij_4 + 4 * eij4 * rij_2) + + 15 * rij_7 * erf_eij_rij) * nj_rij; + matrix_d2D[i*n + j ] = D_direct_product_prefactor * dx * dx - S_direct_product_prefactor * (dx * nxj + dx * nxj + nj_rij); + matrix_d2D[i*n + j + n2 ] = D_direct_product_prefactor * dx * dy - S_direct_product_prefactor * (dy * nxj + dx * nyj); + matrix_d2D[i*n + j + n2 * 2] = D_direct_product_prefactor * dx * dz - S_direct_product_prefactor * (dz * nxj + dx * nzj); + matrix_d2D[i*n + j + n2 * 3] = D_direct_product_prefactor * dy * dx - S_direct_product_prefactor * (dx * nyj + dy * nxj); + matrix_d2D[i*n + j + n2 * 4] = D_direct_product_prefactor * dy * dy - S_direct_product_prefactor * (dy * nyj + dy * nyj + nj_rij); + matrix_d2D[i*n + j + n2 * 5] = D_direct_product_prefactor * dy * dz - S_direct_product_prefactor * (dz * nyj + dy * nzj); + matrix_d2D[i*n + j + n2 * 6] = D_direct_product_prefactor * dz * dx - S_direct_product_prefactor * (dx * nzj + dz * nxj); + matrix_d2D[i*n + j + n2 * 7] = D_direct_product_prefactor * dz * dy - S_direct_product_prefactor * (dy * nzj + dz * nyj); + matrix_d2D[i*n + j + n2 * 8] = D_direct_product_prefactor * dz * dz - S_direct_product_prefactor * (dz * nzj + dz * nzj + nj_rij); + } +} + +__global__ +static void _pcm_d2F_to_d2Sii(const double* F, const double* dF, const double* d2F, const double* charge_exp, + double* d2Sii, const int n_atom, const int n_grid) +{ + const int i_grid = blockIdx.x * blockDim.x + threadIdx.x; + const int ij_atom = blockIdx.y * blockDim.y + threadIdx.y; + if (i_grid >= n_grid || ij_atom >= n_atom * n_atom) 
{ + return; + } + + const int i_atom = ij_atom / n_atom; + const int j_atom = ij_atom % n_atom; + + const double zeta = charge_exp[i_grid]; + const double F_value = F[i_grid]; + const double F_1 = 1.0 / F_value; + const double F_2 = F_1 * F_1; + const double combined_factor = SQRT2_PI * zeta * F_2; + + const double dFix = dF[(i_atom * 3 ) * n_grid + i_grid]; + const double dFiy = dF[(i_atom * 3 + 1) * n_grid + i_grid]; + const double dFiz = dF[(i_atom * 3 + 2) * n_grid + i_grid]; + const double dFjx = dF[(j_atom * 3 ) * n_grid + i_grid]; + const double dFjy = dF[(j_atom * 3 + 1) * n_grid + i_grid]; + const double dFjz = dF[(j_atom * 3 + 2) * n_grid + i_grid]; + + const double d2Fixjx = d2F[((i_atom * n_atom + j_atom) * 9 + 0 * 3 ) * n_grid + i_grid]; + const double d2Fixjy = d2F[((i_atom * n_atom + j_atom) * 9 + 0 * 3 + 1) * n_grid + i_grid]; + const double d2Fixjz = d2F[((i_atom * n_atom + j_atom) * 9 + 0 * 3 + 2) * n_grid + i_grid]; + const double d2Fiyjx = d2F[((i_atom * n_atom + j_atom) * 9 + 1 * 3 ) * n_grid + i_grid]; + const double d2Fiyjy = d2F[((i_atom * n_atom + j_atom) * 9 + 1 * 3 + 1) * n_grid + i_grid]; + const double d2Fiyjz = d2F[((i_atom * n_atom + j_atom) * 9 + 1 * 3 + 2) * n_grid + i_grid]; + const double d2Fizjx = d2F[((i_atom * n_atom + j_atom) * 9 + 2 * 3 ) * n_grid + i_grid]; + const double d2Fizjy = d2F[((i_atom * n_atom + j_atom) * 9 + 2 * 3 + 1) * n_grid + i_grid]; + const double d2Fizjz = d2F[((i_atom * n_atom + j_atom) * 9 + 2 * 3 + 2) * n_grid + i_grid]; + + d2Sii[((i_atom * n_atom + j_atom) * 9 + 0 * 3 ) * n_grid + i_grid] = combined_factor * (2 * F_1 * dFix * dFjx - d2Fixjx); + d2Sii[((i_atom * n_atom + j_atom) * 9 + 0 * 3 + 1) * n_grid + i_grid] = combined_factor * (2 * F_1 * dFix * dFjy - d2Fixjy); + d2Sii[((i_atom * n_atom + j_atom) * 9 + 0 * 3 + 2) * n_grid + i_grid] = combined_factor * (2 * F_1 * dFix * dFjz - d2Fixjz); + d2Sii[((i_atom * n_atom + j_atom) * 9 + 1 * 3 ) * n_grid + i_grid] = combined_factor * (2 * F_1 * dFiy * dFjx - d2Fiyjx); + d2Sii[((i_atom * n_atom + j_atom) * 9 + 1 * 3 + 1) * n_grid + i_grid] = combined_factor * (2 * F_1 * dFiy * dFjy - d2Fiyjy); + d2Sii[((i_atom * n_atom + j_atom) * 9 + 1 * 3 + 2) * n_grid + i_grid] = combined_factor * (2 * F_1 * dFiy * dFjz - d2Fiyjz); + d2Sii[((i_atom * n_atom + j_atom) * 9 + 2 * 3 ) * n_grid + i_grid] = combined_factor * (2 * F_1 * dFiz * dFjx - d2Fizjx); + d2Sii[((i_atom * n_atom + j_atom) * 9 + 2 * 3 + 1) * n_grid + i_grid] = combined_factor * (2 * F_1 * dFiz * dFjy - d2Fizjy); + d2Sii[((i_atom * n_atom + j_atom) * 9 + 2 * 3 + 2) * n_grid + i_grid] = combined_factor * (2 * F_1 * dFiz * dFjz - d2Fizjz); +} + extern "C" { int pcm_d_s(cudaStream_t stream, double *matrix_d, double *matrix_s, const double *coords, const double *norm_vec, const double *r_vdw, @@ -149,15 +270,47 @@ int pcm_d_s(cudaStream_t stream, double *matrix_d, double *matrix_s, } int pcm_dd_ds(cudaStream_t stream, double *matrix_dD, double *matrix_dS, - const double *coords, const double *norm_vec, const double *r_vdw, - const double *charge_exp, const double *switch_fun, - int n) + const double *coords, const double *norm_vec, + const double *charge_exp, + int n) { int ntilex = (n + THREADS - 1) / THREADS; int ntiley = (n + THREADS - 1) / THREADS; dim3 threads(THREADS, THREADS); dim3 blocks(ntilex, ntiley); - _pcm_dD_dS<<>>(matrix_dD, matrix_dS, coords, norm_vec, r_vdw, charge_exp, switch_fun, n); + _pcm_dD_dS<<>>(matrix_dD, matrix_dS, coords, norm_vec, charge_exp, n); + cudaError_t err = cudaGetLastError(); + if (err != 
cudaSuccess) { + return 1; + } + return 0; +} + +int pcm_d2d_d2s(cudaStream_t stream, double *matrix_d2D, double *matrix_d2S, + const double *coords, const double *norm_vec, + const double *charge_exp, + int n) +{ + const int ntilex = (n + THREADS - 1) / THREADS; + const int ntiley = (n + THREADS - 1) / THREADS; + const dim3 threads(THREADS, THREADS); + const dim3 blocks(ntilex, ntiley); + _pcm_d2D_d2S<<>>(matrix_d2D, matrix_d2S, coords, norm_vec, charge_exp, n); + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) { + return 1; + } + return 0; +} + +int pcm_d2f_to_d2sii(cudaStream_t stream, const double* F, const double* dF, const double* d2F, const double* charge_exp, + double* d2Sii, const int n_atom, const int n_grid) +{ + const int ntilex = (n_grid + THREADS - 1) / THREADS; + const int ntiley = (n_atom * n_atom + THREADS - 1) / THREADS; + const dim3 threads(THREADS, THREADS); + const dim3 blocks(ntilex, ntiley); + _pcm_d2F_to_d2Sii<<>>(F, dF, d2F, charge_exp, d2Sii, n_atom, n_grid); cudaError_t err = cudaGetLastError(); if (err != cudaSuccess) { return 1; diff --git a/gpu4pyscf/lib/utils.py b/gpu4pyscf/lib/utils.py index 0b7c613f..5f38a29c 100644 --- a/gpu4pyscf/lib/utils.py +++ b/gpu4pyscf/lib/utils.py @@ -105,6 +105,7 @@ def device(obj): def format_sys_info(): '''Format a list of system information for printing.''' from cupyx._runtime import get_runtime_info + from gpu4pyscf.__config__ import num_devices, mem_fraction, props as device_props pyscf_info = lib.repo_info(pyscf.__file__) gpu4pyscf_info = lib.repo_info(os.path.join(__file__, '..', '..')) @@ -112,7 +113,6 @@ def format_sys_info(): cuda_version = f"{cuda_version // 1000}.{(cuda_version % 1000) // 10}" runtime_info = get_runtime_info() - device_props = cupy.cuda.runtime.getDeviceProperties(0) result = [ f'System: {platform.uname()} Threads {lib.num_threads()}', f'Python {sys.version}', @@ -134,6 +134,8 @@ def format_sys_info(): 'Device info', f' Device name {device_props["name"]}', f' Device global memory {device_props["totalGlobalMem"] / 1024**3:.2f} GB', + f' CuPy memory fraction {mem_fraction}', + f' Num. 
Devices {num_devices}', f'GPU4PySCF {gpu4pyscf.__version__}', f'GPU4PySCF path {gpu4pyscf_info["path"]}' ] diff --git a/gpu4pyscf/mp/dfmp2.py b/gpu4pyscf/mp/dfmp2.py index 92652402..da398dcb 100644 --- a/gpu4pyscf/mp/dfmp2.py +++ b/gpu4pyscf/mp/dfmp2.py @@ -20,7 +20,7 @@ from gpu4pyscf.mp import mp2 from gpu4pyscf.lib import logger from gpu4pyscf.lib.cupy_helper import contract, tag_array, reduce_to_device -from gpu4pyscf.__config__ import _streams, _num_devices +from gpu4pyscf.__config__ import _streams, num_devices from pyscf import __config__ WITH_T2 = getattr(__config__, 'mp_dfmp2_with_t2', True) @@ -45,8 +45,8 @@ def _dfmp2_tasks(mp, mo_coeff, mo_energy, device_id=0): return Lov def get_occ_blk(Lov_dist, i, nocc, nvir): - occ_blk_dist = [None] * _num_devices - for device_id in range(_num_devices): + occ_blk_dist = [None] * num_devices + for device_id in range(num_devices): with cupy.cuda.Device(device_id), _streams[device_id]: Lov = Lov_dist[device_id] mat = cupy.dot(Lov[:,i*nvir:(i+1)*nvir].T, @@ -73,8 +73,8 @@ def kernel(mp, mo_energy=None, mo_coeff=None, eris=None, with_t2=WITH_T2, # Submit tasks to different devices futures = [] - with ThreadPoolExecutor(max_workers=_num_devices) as executor: - for device_id in range(_num_devices): + with ThreadPoolExecutor(max_workers=num_devices) as executor: + for device_id in range(num_devices): future = executor.submit(_dfmp2_tasks, mp, mo_coeff, mo_energy, device_id=device_id) futures.append(future) diff --git a/gpu4pyscf/pbc/df/aft.py b/gpu4pyscf/pbc/df/aft.py index 5f9edc37..4bc4aa50 100644 --- a/gpu4pyscf/pbc/df/aft.py +++ b/gpu4pyscf/pbc/df/aft.py @@ -1,4 +1,4 @@ -# Copyright 2021-2024 The PySCF Developers. All Rights Reserved. +# Copyright 2024-2025 The PySCF Developers. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -27,7 +27,6 @@ from pyscf.pbc.gto.pseudo import pp_int from pyscf.pbc.lib.kpts_helper import is_zero from pyscf.pbc.df import ft_ao -from pyscf.pbc.df.aft import _check_kpts from pyscf.pbc.tools import k2gamma from gpu4pyscf.pbc.tools.pbc import get_coulG from gpu4pyscf.pbc.df import aft_jk @@ -201,3 +200,19 @@ def get_jk(self, dm, hermi=1, kpts=None, kpts_band=None, to_gpu = utils.to_gpu device = utils.device to_cpu = utils.to_cpu + +def _check_kpts(mydf, kpts): + '''Check if the argument kpts is a single k-point''' + if kpts is None: + kpts = mydf.kpts + if kpts is None: + kpts = np.zeros((1, 3)) + is_single_kpt = True + else: + kpts = np.asarray(kpts) + is_single_kpt = kpts.ndim == 1 or is_zero(kpts) + else: + kpts = np.asarray(kpts) + is_single_kpt = kpts.ndim == 1 + kpts = kpts.reshape(-1,3) + return kpts, is_single_kpt diff --git a/gpu4pyscf/pbc/df/aft_jk.py b/gpu4pyscf/pbc/df/aft_jk.py index 225f97cb..040fc955 100644 --- a/gpu4pyscf/pbc/df/aft_jk.py +++ b/gpu4pyscf/pbc/df/aft_jk.py @@ -1,4 +1,4 @@ -# Copyright 2021-2024 The PySCF Developers. All Rights Reserved. +# Copyright 2024-2025 The PySCF Developers. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/gpu4pyscf/pbc/df/df.py b/gpu4pyscf/pbc/df/df.py index 756c94df..45d41f22 100644 --- a/gpu4pyscf/pbc/df/df.py +++ b/gpu4pyscf/pbc/df/df.py @@ -1,4 +1,4 @@ -# Copyright 2021-2024 The PySCF Developers. All Rights Reserved. +# Copyright 2024-2025 The PySCF Developers. All Rights Reserved. 
# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -21,6 +21,7 @@ __all__ = ['GDF'] +import warnings import ctypes import tempfile import numpy as np @@ -28,20 +29,19 @@ from pyscf import lib from pyscf.pbc.df import aft as aft_cpu from pyscf.pbc.df import df as df_cpu -from pyscf.pbc.df.aft import _check_kpts from pyscf.pbc.df.gdf_builder import libpbc -from pyscf.pbc.lib.kpts_helper import is_zero, unique -from pyscf.pbc.df.rsdf_builder import _RSGDFBuilder, _RSNucBuilder +from pyscf.pbc.lib.kpts_helper import is_zero from gpu4pyscf.lib import logger -from gpu4pyscf.pbc.df import df_jk -from gpu4pyscf.lib.cupy_helper import return_cupy_array, pack_tril, unpack_tril +from gpu4pyscf.pbc.df import df_jk, rsdf_builder +from gpu4pyscf.pbc.df.aft import _check_kpts +from gpu4pyscf.pbc.tools.k2gamma import kpts_to_kmesh +from gpu4pyscf.lib.cupy_helper import return_cupy_array, pack_tril, get_avail_mem from gpu4pyscf.lib import utils class GDF(lib.StreamObject): '''Gaussian density fitting ''' blockdim = df_cpu.GDF.blockdim - _dataname = 'j3c' _prefer_ccdf = False force_dm_kbuild = False @@ -56,51 +56,25 @@ class GDF(lib.StreamObject): reset = df_cpu.GDF.reset dump_flags = df_cpu.GDF.dump_flags - def build(self, j_only=None, with_j3c=True, kpts_band=None): + def build(self, j_only=None, kpts_band=None): + warnings.warn( + 'PBC.df is currently experimental and subject to significant changes.') if j_only is not None: self._j_only = j_only - if self.kpts_band is not None: - self.kpts_band = np.reshape(self.kpts_band, (-1,3)) - assert kpts_band is None + assert kpts_band is None and self.kpts_band is None self.check_sanity() self.dump_flags() + cell = self.cell + auxcell = df_cpu.make_auxcell(cell, self.auxbasis, self.exp_to_discard) + self.auxcell = auxcell - self.auxcell = df_cpu.make_auxcell(self.cell, self.auxbasis, - self.exp_to_discard) - - if with_j3c and self._cderi_to_save is not None: - if isinstance(self._cderi_to_save, str): - cderi = self._cderi_to_save - else: - cderi = self._cderi_to_save.name - self._cderi = cderi - t1 = (logger.process_clock(), logger.perf_counter()) - self._make_j3c(self.cell, self.auxcell, None, cderi) - t1 = logger.timer_debug1(self, 'j3c', *t1) + t1 = (logger.process_clock(), logger.perf_counter()) + self._cderi, self._cderip = rsdf_builder.build_cderi( + cell, auxcell, self.kpts, j_only=j_only) + t1 = logger.timer_debug1(self, 'j3c', *t1) return self - def _make_j3c(self, cell=None, auxcell=None, kptij_lst=None, cderi_file=None): - if cell is None: cell = self.cell - if auxcell is None: auxcell = self.auxcell - if cderi_file is None: cderi_file = self._cderi_to_save - - # Remove duplicated k-points. Duplicated kpts may lead to a buffer - # located in incore.wrap_int3c larger than necessary. Integral code - # only fills necessary part of the buffer, leaving some space in the - # buffer unfilled. 
-        if self.kpts_band is None:
-            kpts_union = self.kpts
-        else:
-            kpts_union = unique(np.vstack([self.kpts, self.kpts_band]))[0]
-
-        dfbuilder = _RSGDFBuilder(cell, auxcell, kpts_union)
-        dfbuilder.mesh = self.mesh
-        dfbuilder.linear_dep_threshold = self.linear_dep_threshold
-        j_only = self._j_only or len(kpts_union) == 1
-        dfbuilder.make_j3c(cderi_file, j_only=j_only, dataname=self._dataname,
-                           kptij_lst=kptij_lst)
-
     has_kpts = df_cpu.GDF.has_kpts
     weighted_coulG = return_cupy_array(aft_cpu.weighted_coulG)
     pw_loop = NotImplemented
@@ -108,48 +82,72 @@ def _make_j3c(self, cell=None, auxcell=None, kptij_lst=None, cderi_file=None):
     get_naoaux = df_cpu.GDF.get_naoaux
     range_coulomb = aft_cpu.AFTDFMixin.range_coulomb
 
-    def sr_loop(self, kpti_kptj=np.zeros((2,3)), max_memory=2000,
-                compact=True, blksize=None, aux_slice=None):
-        '''Short range part'''
-        assert aux_slice is None
+    def sr_loop(self, ki, kj, compact=True, blksize=None):
+        '''Iterator for the 3-index cderi tensor over the auxiliary dimension'''
         if self._cderi is None:
             self.build()
         cell = self.cell
-        kpti, kptj = kpti_kptj
-        unpack = is_zero(kpti-kptj) and not compact
         nao = cell.nao
         if blksize is None:
-            blksize = max_memory*1e6/16/(nao**2*2)
-            blksize /= 2 # For prefetch
-            blksize = max(16, min(int(blksize), self.blockdim))
-            logger.debug2(self, 'max_memory %d MB, blksize %d', max_memory, blksize)
-
-        def load(aux_slice):
-            b0, b1 = aux_slice
-            naux = b1 - b0
-            Lpq = cp.asarray(j3c[b0:b1])
-            if compact and Lpq.shape[1] == nao**2:
-                Lpq = pack_tril(Lpq.reshape(naux, nao, nao))
-            elif unpack and Lpq.shape[1] != nao**2:
-                Lpq = unpack_tril(Lpq)
-            return Lpq
-
-        with df_cpu._load3c(self._cderi, self._dataname, kpti_kptj) as j3c:
-            slices = lib.prange(0, j3c.shape[0], blksize)
-            for Lpq in lib.map_with_prefetch(load, slices):
-                yield Lpq, 1
-
-        if cell.dimension == 2 and cell.low_dim_ft_type != 'inf_vacuum':
-            # Truncated Coulomb operator is not positive definite. Load the
-            # CDERI tensor of negative part.
-            with df_cpu._load3c(self._cderi, self._dataname+'-', kpti_kptj,
-                                ignore_key_error=True) as j3c:
-                slices = lib.prange(0, j3c.shape[0], blksize)
-                for Lpq in lib.map_with_prefetch(load, slices):
-                    yield Lpq, -1
-
-    get_pp = return_cupy_array(df_cpu.GDF.get_pp)
-    get_nuc = return_cupy_array(df_cpu.GDF.get_nuc)
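+            # Block size counts auxiliary vectors per chunk: one complex128
+            # element takes 16 bytes, and roughly three (blksize, nao, nao)
+            # buffers coexist, hence avail_mem/16/(nao**2*3) below.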
+            avail_mem = get_avail_mem() * .8
+            blksize = avail_mem/16/(nao**2*3)
+            if blksize < 16:
+                raise RuntimeError('Insufficient GPU memory')
+            blksize = min(int(blksize), self.blockdim)
+            logger.debug2(self, 'max_memory %d MB, blksize %d', avail_mem*1e-6, blksize)
+
+        if (ki, kj) in self._cderi:
+            req_conj = False
+        elif (kj, ki) in self._cderi:
+            req_conj = True
+        else:
+            raise RuntimeError(f'CDERI for kpoints {ki},{kj} not generated')
+
+        # Only one ordering of (ki, kj) is stored; the other block is
+        # recovered through L_pq(ki,kj) = L_qp(kj,ki).conj()
+        Lpq_kij = self._cderi[kj,ki] if req_conj else self._cderi[ki,kj]
+        naux = len(Lpq_kij)
+        for b0, b1 in lib.prange(0, naux, blksize):
+            if req_conj:
+                Lpq = Lpq_kij[b0:b1].transpose(0,2,1).conj()
+            else:
+                Lpq = Lpq_kij[b0:b1]
+            assert Lpq[0].size == nao**2
+            if compact:
+                Lpq = pack_tril(Lpq.reshape(-1, nao, nao))
+            yield Lpq, 1
+
+        if cell.dimension == 2:
+            assert cell.low_dim_ft_type != 'inf_vacuum'
+            Lpq_kij = self._cderip[kj,ki] if req_conj else self._cderip[ki,kj]
+            naux = len(Lpq_kij)
+            for b0, b1 in lib.prange(0, naux, blksize):
+                if req_conj:
+                    Lpq = Lpq_kij[b0:b1].transpose(0,2,1).conj()
+                else:
+                    Lpq = Lpq_kij[b0:b1]
+                assert Lpq[0].size == nao**2
+                if compact:
+                    Lpq = pack_tril(Lpq.reshape(-1, nao, nao))
+                yield Lpq, -1
+
+    def get_pp(self, kpts=None):
+        kpts, is_single_kpt = _check_kpts(self, kpts)
+        if is_single_kpt and is_zero(kpts):
+            vpp = rsdf_builder.get_pp(self.cell)
+        else:
+            vpp = rsdf_builder.get_pp(self.cell, kpts)
+        if is_single_kpt:
+            vpp = vpp[0]
+        return vpp
+
+    def get_nuc(self, kpts=None):
+        kpts, is_single_kpt = _check_kpts(self, kpts)
+        if is_single_kpt and is_zero(kpts):
+            nuc = rsdf_builder.get_nuc(self.cell)
+        else:
+            nuc = rsdf_builder.get_nuc(self.cell, kpts)
+        if is_single_kpt:
+            nuc = nuc[0]
+        return nuc
 
 # Note: Special exxdiv by default should not be used for an arbitrary
 # input density matrix. When the df object was used with the molecular
diff --git a/gpu4pyscf/pbc/df/df_jk.py b/gpu4pyscf/pbc/df/df_jk.py
index bdaf2427..36ce3acf 100644
--- a/gpu4pyscf/pbc/df/df_jk.py
+++ b/gpu4pyscf/pbc/df/df_jk.py
@@ -1,4 +1,4 @@
-# Copyright 2021-2024 The PySCF Developers. All Rights Reserved.
+# Copyright 2024-2025 The PySCF Developers. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -26,7 +26,7 @@
 from gpu4pyscf.lib.cupy_helper import contract, unpack_tril
 from gpu4pyscf.pbc.df.fft_jk import _ewald_exxdiv_for_G0, _format_dms, _format_jks
 
-def density_fit(mf, auxbasis=None, mesh=None, with_df=None):
+def density_fit(mf, auxbasis=None, with_df=None):
     '''Generate density-fitting SCF object
 
     Args:
         auxbasis : str or basis dict
            Same format to the input attribute mol.basis. If auxbasis is
            None, auxiliary basis based on AO basis (if possible) or
            even-tempered Gaussian basis will be used. 
- mesh : tuple - number of grids in each direction with_df : DF object ''' from gpu4pyscf.pbc.df.df import GDF @@ -45,27 +43,21 @@ def density_fit(mf, auxbasis=None, mesh=None, with_df=None): else: kpts = np.reshape(mf.kpt, (1,3)) with_df = GDF(mf.cell, kpts) - with_df.max_memory = mf.max_memory with_df.stdout = mf.stdout with_df.verbose = mf.verbose with_df.auxbasis = auxbasis - if mesh is not None: - with_df.mesh = mesh - mf = mf.copy() + mf = mf.copy().reset() mf.with_df = with_df - mf._eri = None return mf def get_j_kpts(mydf, dm_kpts, hermi=1, kpts=np.zeros((1,3)), kpts_band=None): log = logger.new_logger(mydf) t0 = log.init_timer() - if mydf._cderi is None or not mydf.has_kpts(kpts_band): - if mydf._cderi is not None: - log.warn('DF integrals for band k-points were not found %s. ' - 'DF integrals will be rebuilt to include band k-points.', - mydf._cderi) + assert kpts_band is None or kpts_band is kpts + assert mydf.has_kpts(kpts) + if mydf._cderi is None: mydf.build(j_only=True, kpts_band=kpts_band) t0 = log.timer_debug1('Init get_j_kpts', *t0) @@ -83,11 +75,9 @@ def get_j_kpts(mydf, dm_kpts, hermi=1, kpts=np.zeros((1,3)), kpts_band=None): nband = len(kpts_band) rho = cp.zeros((nset,naux), dtype=np.complex128) - max_memory = max(2000, (mydf.max_memory - lib.current_memory()[0])) - for k, kpt in enumerate(kpts): - kptii = np.asarray((kpt,kpt)) + for k in range(nkpts): p1 = 0 - for Lpq, sign in mydf.sr_loop(kptii, max_memory, False): + for Lpq, sign in mydf.sr_loop(k, k, False): Lpq = Lpq.reshape(-1,nao,nao) p0, p1 = p1, p1+Lpq.shape[0] rho[:,p0:p1] += sign * contract('Lpq,xqp->xL', Lpq, dms[:,k]) @@ -102,9 +92,8 @@ def get_j_kpts(mydf, dm_kpts, hermi=1, kpts=np.zeros((1,3)), kpts_band=None): vj = cp.zeros((nset,nband,nao_pair), dtype=np.complex128) for k, kpt in enumerate(kpts_band): - kptii = np.asarray((kpt,kpt)) p1 = 0 - for Lpq, sign in mydf.sr_loop(kptii, max_memory, aos2symm): + for Lpq, sign in mydf.sr_loop(k, k, aos2symm): nrow = Lpq.shape[0] p0, p1 = p1, p1+nrow Lpq = Lpq.reshape(nrow, -1) @@ -137,11 +126,9 @@ def get_k_kpts(mydf, dm_kpts, hermi=1, kpts=np.zeros((1,3)), kpts_band=None, raise RuntimeError('GDF does not support exxdiv %s' % exxdiv) t0 = (logger.process_clock(), logger.perf_counter()) - if mydf._cderi is None or not mydf.has_kpts(kpts_band): - if mydf._cderi is not None: - log.warn('DF integrals for band k-points were not found %s. 
'
-                     'DF integrals will be rebuilt to include band k-points.',
-                     mydf._cderi)
+    assert kpts_band is None or kpts_band is kpts
+    assert mydf.has_kpts(kpts)
+    if mydf._cderi is None:
         mydf.build(kpts_band=kpts_band)
     t0 = log.timer_debug1('Init get_k_kpts', *t0)
@@ -186,12 +173,12 @@
     # K_pq = ( p{k1} i{k2} | i{k2} q{k1} )
     # input dm is not Hermitian/PSD --> build K from dm
     log.debug2('get_k_kpts: build K from dm')
-    max_memory = max(2000, mydf.max_memory-lib.current_memory()[0])
-    def make_kpt(ki, kj, swap_2e, inverse_idx=None):
-        kpti = kpts[ki]
-        kptj = kpts_band[kj]
-        #TODO: utilize kk_adapted_iter with time_reversal_symmetry, as that in aft_jk
-        for Lpq, sign in mydf.sr_loop((kpti,kptj), max_memory, compact=False):
+    if mydf._cderi is None:
+        mydf.build()
+    def make_kpt(ki, kj, swap_2e):
+        if (ki, kj) not in mydf._cderi:
+            kj, ki = ki, kj
+        for Lpq, sign in mydf.sr_loop(ki, kj, compact=False):
             Lpq = Lpq.reshape(-1, nao, nao)
             tmp = contract('njk,Lkl->nLjl', dms[:,ki], Lpq)
             if sign > 0:
@@ -207,23 +194,23 @@
             vk[:,ki] -= contract('nLki,Lji->nkj', tmp, Lpq.conj())
 
     t1 = log.init_timer()
-    if kpts_band is kpts: # normal k-points HF/DFT
-        for ki in range(nkpts):
-            for kj in range(ki):
-                make_kpt(ki, kj, True)
-            make_kpt(ki, ki, False)
-            t1 = log.timer_debug1('get_k_kpts: make_kpt ki>=kj (%d,*)'%ki, *t1)
-    else:
+    if kpts_band is not kpts: # band k-points beyond the SCF k-point mesh are not supported
         raise NotImplementedError
-
-    if exxdiv == 'ewald':
-        _ewald_exxdiv_for_G0(cell, kpts, dms, vk, kpts_band)
+    #TODO: utilize kk_adapted_iter with time_reversal_symmetry, as that in aft_jk
+    for ki in range(nkpts):
+        for kj in range(ki):
+            make_kpt(ki, kj, True)
+        make_kpt(ki, ki, False)
+        t1 = log.timer_debug1('get_k_kpts: make_kpt ki>=kj (%d,*)'%ki, *t1)
 
     if (is_zero(kpts) and is_zero(kpts_band) and
         not np.iscomplexobj(dm_kpts)):
         vk = vk.real
     vk *= 1./nkpts
 
+    if exxdiv == 'ewald':
+        _ewald_exxdiv_for_G0(cell, kpts, dms, vk, kpts_band)
+
     log.timer('get_k_kpts', *t0)
     return _format_jks(vk, dm_kpts, input_band, kpts)
 
@@ -243,29 +230,17 @@
     '''JK for given k-point'''
     log = logger.new_logger(mydf)
     t0 = log.init_timer()
-    if mydf._cderi is None or not mydf.has_kpts(kpts_band):
-        if mydf._cderi is not None:
-            log.warn('DF integrals for band k-points were not found %s. 
' - 'DF integrals will be rebuilt to include band k-points.', - mydf._cderi) + assert is_zero(kpt) + assert kpts_band is None + if mydf._cderi is None: mydf.build(j_only=not with_k, kpts_band=kpts_band) t0 = log.timer_debug1('Init get_jk', *t0) - vj = vk = None - if kpts_band is not None and abs(kpt-kpts_band).sum() > 1e-9: - kpt = np.reshape(kpt, (1,3)) - if with_k: - vk = get_k_kpts(mydf, dm, hermi, kpt, kpts_band, exxdiv) - if with_j: - vj = get_j_kpts(mydf, dm, hermi, kpt, kpts_band) - return vj, vk - cell = mydf.cell - dm = np.asarray(dm, order='C') + dm = cp.asarray(dm, order='C') dms = _format_dms(dm, [kpt]) nset, _, nao = dms.shape[:3] dms = dms.reshape(nset,nao,nao) - kptii = np.asarray((kpt,kpt)) if with_j: vj = cp.zeros((nset,nao,nao), dtype=np.complex128) if with_k: @@ -294,9 +269,7 @@ def get_jk(mydf, dm, hermi=1, kpt=np.zeros(3), ''' vk = cp.zeros((nset,nao,nao), dtype=np.complex128) - mem_now = lib.current_memory()[0] - max_memory = max(2000, (mydf.max_memory - mem_now)) - for Lpq, sign in mydf.sr_loop(kptii, max_memory, False): + for Lpq, sign in mydf.sr_loop(0, 0, False): if with_j: #:rho_coeff = np.einsum('Lpq,xqp->xL', Lpq, dms) #:vj += np.dot(rho_coeff, Lpq.reshape(-1,nao**2)) diff --git a/gpu4pyscf/pbc/df/fft.py b/gpu4pyscf/pbc/df/fft.py index 9d54b118..d074d9b3 100644 --- a/gpu4pyscf/pbc/df/fft.py +++ b/gpu4pyscf/pbc/df/fft.py @@ -1,4 +1,4 @@ -# Copyright 2021-2024 The PySCF Developers. All Rights Reserved. +# Copyright 2024-2025 The PySCF Developers. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/gpu4pyscf/pbc/df/fft_jk.py b/gpu4pyscf/pbc/df/fft_jk.py index dbf64378..1d17ed6d 100644 --- a/gpu4pyscf/pbc/df/fft_jk.py +++ b/gpu4pyscf/pbc/df/fft_jk.py @@ -1,4 +1,4 @@ -# Copyright 2021-2024 The PySCF Developers. All Rights Reserved. +# Copyright 2024-2025 The PySCF Developers. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/gpu4pyscf/pbc/df/ft_ao.py b/gpu4pyscf/pbc/df/ft_ao.py index d93678d6..cdd59951 100644 --- a/gpu4pyscf/pbc/df/ft_ao.py +++ b/gpu4pyscf/pbc/df/ft_ao.py @@ -1,4 +1,4 @@ -# Copyright 2021-2024 The PySCF Developers. All Rights Reserved. +# Copyright 2024-2025 The PySCF Developers. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -26,7 +26,6 @@ from pyscf.gto.mole import ANG_OF, ATOM_OF, PTR_COORD from pyscf.scf import _vhf from pyscf.pbc import tools as pbctools -from pyscf.pbc.gto.cell import _extract_pgto_params from pyscf.pbc.tools import k2gamma from pyscf.pbc.lib.kpts_helper import is_zero from gpu4pyscf.pbc.tools.k2gamma import kpts_to_kmesh @@ -36,6 +35,7 @@ from gpu4pyscf.scf.jk import ( g_pair_idx, _nearest_power2, _scale_sp_ctr_coeff, SHM_SIZE) from gpu4pyscf.pbc.lib.kpts_helper import conj_images_in_bvk_cell +from gpu4pyscf.pbc.gto.cell import extract_pgto_params from gpu4pyscf.__config__ import props as gpu_specs __all__ = [ @@ -43,8 +43,8 @@ ] libpbc = load_library('libpbc') -libpbc.PBC_build_ft_ao.restype = ctypes.c_int -libpbc.PBC_FT_init_constant.restype = ctypes.c_int +libpbc.build_ft_ao.restype = ctypes.c_int +libpbc.init_constant.restype = ctypes.c_int LMAX = 4 GOUT_WIDTH = 19 @@ -71,27 +71,25 @@ def ft_ao(cell, Gv, shls_slice=None, b=None, gxyz=None, Gvbase=None, kpt=np.zeros(3), verbose=None): from pyscf.pbc.df.ft_ao import ft_ao out = ft_ao(cell, Gv, shls_slice, b, gxyz, Gvbase, kpt, verbose) - return cp.asarray(out) + if out.flags.c_contiguous: + return cp.asarray(out) + else: + return cp.asarray(out, order='F') -def _bas_overlap_mask(cell, bvkmesh_Ls, Ls, cutoff=None): +def _bas_overlap_mask(cell, bvkmesh_Ls, Ls): '''integral screening mask for basis product between cell and supmol''' # consider only the most diffused component of a basis - exps, cs = _extract_pgto_params(cell, 'min') + exps, cs = extract_pgto_params(cell, 'diffused') ls = cell._bas[:,ANG_OF] bas_coords = cp.asarray(cell.atom_coords()[cell._bas[:,ATOM_OF]]) - vol = cell.vol - if cutoff is None: - theta_ij = exps.min() / 2 - lattice_sum_factor = max(2*np.pi*cell.rcut/(vol*theta_ij), 1) - cutoff = cell.precision/lattice_sum_factor * .1 - logger.debug(cell, 'Set ft_ao cutoff to %g', cutoff) - ls = cp.asarray(ls) exps = cp.asarray(exps) norm = cp.asarray(cs) * ((2*ls+1)/(4*np.pi))**.5 aij = exps[:,None] + exps - theta = exps[:,None] * exps / aij + fi = exps[:,None] / aij + fj = exps[None,:] / aij + theta = exps[:,None] * fj Ls = cp.asarray(Ls) # rj format: (bvk_cell_id, bas_id, lattice_img_id) @@ -100,16 +98,18 @@ def _bas_overlap_mask(cell, bvkmesh_Ls, Ls, cutoff=None): dr = cp.linalg.norm(rirj, axis=4) - dri = exps[None,None,:,None]/aij[:,None,:,None] * dr - drj = exps[:,None,None,None]/aij[:,None,:,None] * dr + dri = fj[:,None,:,None] * dr + drj = fi[:,None,:,None] * dr li = ls[:,None,None,None] lj = ls[None,None,:,None] fac_dri = (li * .5/aij[:,None,:,None] + dri**2) ** (li*.5) fac_drj = (lj * .5/aij[:,None,:,None] + drj**2) ** (lj*.5) - fl = 2*np.pi/vol * (dr/theta[:,None,:,None]) + 1. 
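+    # Estimate the lattice-sum weight from the number of images on a shell of
+    # radius rad ~ dr/vol**(1/3) + 1, i.e. fl = max(4*pi*rad**2, 1), replacing
+    # the previous 2*pi*dr/(vol*theta) heuristic.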
+ rad = cell.vol**(-1./3) * dr + 1 + surface = 4*np.pi * rad**2 + fl = cp.where(surface > 1, surface, 1) fac_norm = norm[:,None]*norm * (np.pi/aij)**1.5 ovlp = fac_norm[:,None,:,None] * cp.exp(-theta[:,None,:,None]*dr**2) * fac_dri * fac_drj * fl - return ovlp > cutoff + return ovlp > cell.precision def gen_ft_kernel(cell, kpts=None, verbose=None): r''' @@ -132,11 +132,12 @@ def __init__(self, cell, kpts=None, bvk_kmesh=None): self.l_ctr_offsets = np.append(0, np.cumsum(l_ctr_counts)) self.coeff = cp.asarray(coeff, dtype=np.complex128) - if kpts is not None and bvk_kmesh is None: - bvk_kmesh = kpts_to_kmesh(cell, kpts) - - # create BVK super-cell if bvk_kmesh is None: + if kpts is None or is_zero(kpts): + bvk_kmesh = np.ones(3, dtype=int) + else: + bvk_kmesh = kpts_to_kmesh(cell, kpts) + if np.prod(bvk_kmesh) == 1: bvkcell = cell else: bvkcell = pbctools.super_cell(cell, bvk_kmesh, wrap_around=True) @@ -169,7 +170,7 @@ def gen_ft_kernel(self, verbose=None): Ls = Ls[cp.linalg.norm(Ls-.5, axis=1).argsort()] if bvk_kmesh is None: - bvkmesh_Ls = cp.zeros(3) + bvkmesh_Ls = cp.zeros((1, 3)) else: bvkmesh_Ls = cp.asarray( k2gamma.translation_vectors_for_kmesh(cell, bvk_kmesh, True)) @@ -209,7 +210,7 @@ def gen_ft_kernel(self, verbose=None): conj_mapping = cp.asarray(conj_images_in_bvk_cell(bvk_kmesh), dtype=np.int32) init_constant(cell) - kern = libpbc.PBC_build_ft_ao + kern = libpbc.build_ft_ao cp.cuda.Stream.null.synchronize() log.timer_debug1('initialize ft_kern', *cput0) @@ -270,7 +271,7 @@ def _ft_sub(Gv, q, kptjs, transform_ao=True): cell._atm.ctypes, ctypes.c_int(cell.natm), cell._bas.ctypes, ctypes.c_int(cell.nbas), cell._env.ctypes) if err != 0: - raise RuntimeError(f'PBC_build_ft_ao kernel for {ll_pattern} failed') + raise RuntimeError(f'build_ft_ao kernel for {ll_pattern} failed') if log.verbose >= logger.DEBUG1: t1, t1p = log.timer_debug1(f'processing {ll_pattern}', *t1), t1 if ll_pattern not in timing_collection: @@ -290,24 +291,25 @@ def _ft_sub(Gv, q, kptjs, transform_ao=True): #ix, iy = cp.tril_indices(nao, -1) #for k, ck in enumerate(conj_mapping): # out[iy,ix,ck] = out[ix,iy,k] - err = libpbc.PBC_ft_aopair_fill_triu( + err = libpbc.ft_aopair_fill_triu( ctypes.cast(out.data.ptr, ctypes.c_void_p), ctypes.cast(conj_mapping.data.ptr, ctypes.c_void_p), ctypes.c_int(nao), ctypes.c_int(bvk_ncells), ctypes.c_int(nGv)) if err != 0: - raise RuntimeError('PBC_ft_aopair_fill_triu kernel failed') + raise RuntimeError('ft_aopair_fill_triu kernel failed') log.debug1('transform BvK-cell to k-points') - if kptjs is not None: + gamma_point_only = kptjs is None or is_zero(kptjs) + if not gamma_point_only: kptjs = cp.asarray(kptjs, order='C').reshape(-1,3) expLk = cp.exp(1j*cp.dot(bvkmesh_Ls, kptjs.T)) - out = contract('Lk,LpqG->kGpq', expLk, out) + out = contract('Lk,LpqG->kpqG', expLk, out) if transform_ao: log.debug1('transform basis') #:out = einsum('pqLG,pi,qj->LGij', out, coeff, coeff) - out = contract('kGpq,qj->kGpj', out, coeff) - out = contract('kGpj,pi->kGij', out, coeff) + out = contract('kpqG,pi->kiqG', out, coeff) + out = contract('kiqG,qj->kijG', out, coeff) log.timer('ft_aopair', *cput0) return out @@ -323,7 +325,7 @@ def ft_kernel(Gv, q=np.zeros(3), kptjs=kpts, transform_ao=True): avail_mem = get_avail_mem() if 2*out_size < avail_mem * .8: - return _ft_sub(Gv, q, kptjs, transform_ao) + return _ft_sub(Gv, q, kptjs, transform_ao).transpose(0,3,1,2) elif out_size < avail_mem * .8: if kptjs is None: @@ -332,16 +334,16 @@ def ft_kernel(Gv, q=np.zeros(3), kptjs=kpts, transform_ao=True): 
kptjs = kptjs.reshape(-1, 3)
             nkpts = len(kptjs)
             if transform_ao:
-                out = cp.empty((nkpts, nGv, nao_orig, nao_orig), dtype=np.complex128)
+                out = cp.empty((nkpts, nao_orig, nao_orig, nGv), dtype=np.complex128)
             else:
-                out = cp.empty((nkpts, nGv, nao, nao), dtype=np.complex128)
+                out = cp.empty((nkpts, nao, nao, nGv), dtype=np.complex128)
             Gv_block = int((avail_mem * .95 - out_size) / (2*nao**2*bvk_ncells*16))
             Gv_block &= 0xfffffc
             if Gv_block >= 4:
                 logger.debug1(cell, 'Processing ft_kernel in sub-blocks, Gv_block = %d', Gv_block)
                 for p0, p1 in lib.prange(0, nGv, Gv_block):
-                    out[:,p0:p1] = _ft_sub(Gv[p0:p1], q, kptjs, transform_ao)
-                return out
+                    out[:,:,:,p0:p1] = _ft_sub(Gv[p0:p1], q, kptjs, transform_ao)
+                return out.transpose(0,3,1,2)
 
         raise RuntimeError('Not enough GPU memory. '
                            f'Available: {avail_mem*1e-9:.2f} GB. '
@@ -365,7 +367,7 @@ class AFTIntEnvVars(ctypes.Structure):
 
 def init_constant(cell):
     g_idx, offsets = g_pair_idx()
-    err = libpbc.PBC_FT_init_constant(
+    err = libpbc.init_constant(
         g_idx.ctypes, offsets.ctypes, cell._env.ctypes,
         ctypes.c_int(cell._env.size), ctypes.c_int(SHM_SIZE))
     if err != 0:
diff --git a/gpu4pyscf/pbc/df/int3c2e.py b/gpu4pyscf/pbc/df/int3c2e.py
new file mode 100644
index 00000000..f92b6ef6
--- /dev/null
+++ b/gpu4pyscf/pbc/df/int3c2e.py
@@ -0,0 +1,482 @@
+# Copyright 2024-2025 The PySCF Developers. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+'''
+Periodic 3-center 2-electron short-range Coulomb integral helper functions
+'''
+
+import ctypes
+import math
+import numpy as np
+import cupy as cp
+from pyscf import lib
+from pyscf.lib.parameters import ANGULAR
+from pyscf.gto.mole import ANG_OF, ATOM_OF, PTR_COORD, PTR_EXP, conc_env
+from pyscf.pbc import tools as pbctools
+from pyscf.pbc.tools import k2gamma
+from pyscf.pbc.lib.kpts_helper import is_zero
+from gpu4pyscf.pbc.tools.k2gamma import kpts_to_kmesh
+from gpu4pyscf.lib import logger
+from gpu4pyscf.lib.cupy_helper import contract
+from gpu4pyscf.gto.mole import group_basis, PTR_BAS_COORD
+from gpu4pyscf.scf.jk import _nearest_power2, _scale_sp_ctr_coeff, SHM_SIZE
+from gpu4pyscf.pbc.gto.cell import extract_pgto_params
+from gpu4pyscf.pbc.df.ft_ao import libpbc, init_constant
+
+__all__ = [
+    'sr_aux_e2',
+]
+
+libpbc.fill_int3c2e.restype = ctypes.c_int
+
+LMAX = 4
+L_AUX_MAX = 6
+GOUT_WIDTH = 45
+THREADS = 256
+BVK_CELL_SHELLS = 2400
+
+def sr_aux_e2(cell, auxcell, omega, kpts=None, bvk_kmesh=None, j_only=False):
+    r'''
+    Short-range 3-center integrals (ij|k). The auxiliary basis functions are
+    placed at the second electron.
+    '''
+    if bvk_kmesh is None and kpts is not None:
+        if j_only:
+            # Coulomb integrals require a smaller kmesh to converge finite-size effects
+            bvk_kmesh = kpts_to_kmesh(cell, kpts)
+        else:
+            # Remote images can still contribute for a given k-point mesh,
+            # entering the exchange matrix as finite-size effects.
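+            # Exchange couples shell pairs across images out to the integral
+            # cutoff, so the BvK mesh is derived from rcut rather than from
+            # the k-point mesh alone.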
+            rcut = estimate_rcut(cell, auxcell, omega).max()
+            bvk_kmesh = kpts_to_kmesh(cell, kpts, rcut=rcut)
+    bvk_kmesh, bvk_kmesh_inp = guess_bvk_kmesh(cell, bvk_kmesh), bvk_kmesh
+    logger.debug(cell, 'BvK input %s, set to %s for sr_aux_e2', bvk_kmesh_inp, bvk_kmesh)
+    int3c2e_opt = SRInt3c2eOpt(cell, auxcell, omega, bvk_kmesh)
+    nao, nao_orig = int3c2e_opt.coeff.shape
+    naux = int3c2e_opt.aux_coeff.shape[0]
+
+    gamma_point = kpts is None or (kpts.ndim == 1 and is_zero(kpts))
+    if gamma_point:
+        out = cp.zeros((nao, nao, naux))
+    else:
+        kpts = np.asarray(kpts).reshape(-1, 3)
+        expLk = cp.exp(1j*cp.asarray(int3c2e_opt.bvkmesh_Ls.dot(kpts.T)))
+        nL, nkpts = expLk.shape
+        if j_only:
+            expLLk = contract('Lk,Mk->LMk', expLk.conj(), expLk)
+            expLLk = expLLk.view(np.float64).reshape(nL,nL,nkpts,2)
+            out = cp.zeros((nkpts, nao, nao, naux), dtype=np.complex128)
+        else:
+            out = cp.zeros((nkpts, nkpts, nao, nao, naux), dtype=np.complex128)
+
+    ao_loc = int3c2e_opt.sorted_cell.ao_loc
+    aux_loc = int3c2e_opt.sorted_auxcell.ao_loc
+
+    for shls_slice, eri3c in int3c2e_opt.int3c2e_kernel():
+        i0, i1, j0, j1 = ao_loc[list(shls_slice[:4])]
+        k0, k1 = aux_loc[list(shls_slice[4:])]
+        if gamma_point:
+            out[i0:i1,j0:j1,k0:k1] = tmp = eri3c.sum(axis=(0,2))
+            if i0 != j0:
+                out[j0:j1,i0:i1,k0:k1] = tmp.transpose(1,0,2)
+        elif j_only:
+            tmp = contract('LMkz,LpMqr->kpqrz', expLLk, eri3c)
+            tmp = tmp.view(np.complex128)[...,0]
+            out[:,i0:i1,j0:j1,k0:k1] = tmp
+            if i0 != j0:
+                out[:,j0:j1,i0:i1,k0:k1] = tmp.transpose(0,2,1,3).conj()
+        else:
+            expLkz = expLk.view(np.float64).reshape(nL,nkpts,2)
+            tmp = contract('Lkz,MpLqr->Mkpqrz', expLkz, eri3c)
+            tmp = tmp.view(np.complex128)[...,0]
+            tmp = contract('Mk,Mlpqr->klpqr', expLk.conj(), tmp)
+            out[:,:,i0:i1,j0:j1,k0:k1] = tmp
+            if i0 != j0:
+                out[:,:,j0:j1,i0:i1,k0:k1] = tmp.transpose(1,0,3,2,4).conj()
+            tmp = None
+
+    # gamma_point also covers a single zero k-point, where out is 3-dimensional
+    if gamma_point:
+        out = contract('pqr,rk->pqk', out, int3c2e_opt.aux_coeff)
+        out = contract('pqk,qj->pjk', out, int3c2e_opt.coeff)
+        out = contract('pjk,pi->ijk', out, int3c2e_opt.coeff)
+    elif j_only:
+        #:out = einsum('MpNqr,pi,qj,rk->MiNjk', out, coeff, coeff, auxcoeff)
+        out = contract('Npqr,rk->Npqk', out, int3c2e_opt.aux_coeff)
+        out = contract('Npqk,qj->Npjk', out, int3c2e_opt.coeff)
+        out = contract('Npjk,pi->Nijk', out, int3c2e_opt.coeff)
+    else:
+        #:out = einsum('MpNqr,pi,qj,rk->MiNjk', out, coeff, coeff, auxcoeff)
+        out = contract('MNpqr,rk->MNpqk', out, int3c2e_opt.aux_coeff)
+        out = contract('MNpqk,qj->MNpjk', out, int3c2e_opt.coeff)
+        out = contract('MNpjk,pi->MNijk', out, int3c2e_opt.coeff)
+    return out
+
+def create_img_idx(cell, bvkcell, auxcell, Ls, int3c2e_envs):
+    '''integral screening'''
+    # consider only the most diffused component of a basis
+    exps, cs = extract_pgto_params(cell, 'diffused')
+    ls = cell._bas[:,ANG_OF]
+    exps = cp.asarray(exps, dtype=np.float32)
+    log_cs = np.log(np.abs(cs * ((2*ls+1)/(4*np.pi))**.5))
+    log_cs = cp.asarray(log_cs, np.float32)
+    nbas = cell.nbas
+    nk = bvkcell.nbas // nbas
+
+    # Search the most diffused functions on each atom
+    aux_exps, aux_cs = extract_pgto_params(auxcell, 'diffused')
+    aux_ls = auxcell._bas[:,ANG_OF]
+    r2_aux = np.log(aux_cs**2 / cell.precision * 10**aux_ls) / aux_exps
+    atoms = auxcell._bas[:,ATOM_OF]
+    atom_aux_exps = cp.full(cell.natm, 1e8, dtype=np.float32)
+    for ia in range(cell.natm):
+        bas_mask = atoms == ia
+        es = aux_exps[bas_mask]
+        if len(es) > 0:
+            atom_aux_exps[ia] = es[r2_aux[bas_mask].argmax()]
+
+    def gen_img_idx(ish0, ish1, jsh0, jsh1):
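+        # Two-pass screening: int3c2e_img_counts first counts surviving
+        # images per (Ki,i,Kj,j) shell pair; a prefix sum over those counts
+        # gives img_offsets, and int3c2e_img_idx then fills img_idx.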
ish1 - ish0
+        njsh = jsh1 - jsh0
+        #TODO: only tril part when i == j
+        ij_pairs = nk * nish * nk * njsh
+        img_counts = cp.zeros(ij_pairs, dtype=np.int32)
+        err = libpbc.int3c2e_img_counts(
+            ctypes.cast(img_counts.data.ptr, ctypes.c_void_p),
+            ctypes.byref(int3c2e_envs),
+            (ctypes.c_int*4)(ish0, ish1, jsh0, jsh1),
+            ctypes.cast(exps.data.ptr, ctypes.c_void_p),
+            ctypes.cast(log_cs.data.ptr, ctypes.c_void_p),
+            ctypes.cast(atom_aux_exps.data.ptr, ctypes.c_void_p),
+            ctypes.c_int(nk), ctypes.c_int(cell.natm))
+        if err != 0:
+            raise RuntimeError('int3c2e_img_counts failed')
+
+        remaining_idx = np.nonzero(img_counts > 0)[0]
+        remaining_idx = remaining_idx[img_counts[remaining_idx].argsort()[::-1]]
+        remaining_idx = cp.asarray(remaining_idx, dtype=np.int32, order='C')
+        ij_pairs = remaining_idx.size
+        img_offsets = cp.empty(ij_pairs+1, dtype=np.int32)
+        cp.cumsum(img_counts[remaining_idx], out=img_offsets[1:])
+        img_offsets[0] = 0
+
+        img_idx = cp.empty(int(img_offsets[-1]), dtype=np.int32)
+        err = libpbc.int3c2e_img_idx(
+            ctypes.cast(img_idx.data.ptr, ctypes.c_void_p),
+            ctypes.cast(img_offsets.data.ptr, ctypes.c_void_p),
+            ctypes.cast(remaining_idx.data.ptr, ctypes.c_void_p),
+            ctypes.c_int(ij_pairs),
+            ctypes.byref(int3c2e_envs),
+            (ctypes.c_int*4)(ish0, ish1, jsh0, jsh1),
+            ctypes.cast(exps.data.ptr, ctypes.c_void_p),
+            ctypes.cast(log_cs.data.ptr, ctypes.c_void_p),
+            ctypes.cast(atom_aux_exps.data.ptr, ctypes.c_void_p),
+            ctypes.c_int(nk), ctypes.c_int(cell.natm))
+        if err != 0:
+            raise RuntimeError('int3c2e_img_idx failed')
+
+        Ki, i, Kj, j = cp.unravel_index(remaining_idx, (nk, nish, nk, njsh))
+        i += ish0
+        j += jsh0
+        # one-dimensional indices corresponding to [Ki,i,Kj,j]
+        bas_ij = cp.ravel_multi_index((Ki, i, Kj, j), (nk, nbas, nk, nbas))
+        bas_ij = cp.asarray(bas_ij, dtype=np.int32)
+        return img_idx, img_offsets, bas_ij
+    return gen_img_idx
+
+class SRInt3c2eOpt:
+    def __init__(self, cell, auxcell, omega, bvk_kmesh=None):
+        assert omega < 0
+        self.omega = omega
+
+        self.cell = cell
+        cell, coeff, uniq_l_ctr, l_ctr_counts = group_basis(cell, tile=1)
+        self.sorted_cell = cell
+        self.uniq_l_ctr = uniq_l_ctr
+        self.l_ctr_offsets = np.append(0, np.cumsum(l_ctr_counts))
+        self.coeff = cp.asarray(coeff)
+        self.sorted_cell.omega = omega
+
+        self.auxcell = auxcell
+        auxcell, coeff, uniq_l_ctr, l_ctr_counts = group_basis(auxcell, tile=1)
+        self.sorted_auxcell = auxcell
+        self.uniq_l_ctr_aux = uniq_l_ctr
+        self.l_ctr_aux_offsets = np.append(0, np.cumsum(l_ctr_counts))
+        self.aux_coeff = cp.asarray(coeff)
+        self.sorted_auxcell.omega = omega
+
+        if bvk_kmesh is None:
+            bvk_kmesh = np.ones(3, dtype=int)
+        self.bvk_kmesh = bvk_kmesh
+        self.bvkmesh_Ls = k2gamma.translation_vectors_for_kmesh(cell, bvk_kmesh, True)
+
+        if np.prod(bvk_kmesh) == 1:
+            bvkcell = cell
+        else:
+            bvkcell = pbctools.super_cell(cell, bvk_kmesh, wrap_around=True)
+            # PTR_BAS_COORD was not initialized in pbctools.super_cell
+            bvkcell._bas[:,PTR_BAS_COORD] = bvkcell._atm[bvkcell._bas[:,ATOM_OF],PTR_COORD]
+        self.bvkcell = bvkcell
+
+    def int3c2e_kernel(self, cutoff=None, verbose=None):
+        cell = self.sorted_cell
+        auxcell = self.sorted_auxcell
+        uniq_l_ctr = self.uniq_l_ctr
+        l_ctr_offsets = self.l_ctr_offsets
+        l_ctr_aux_offsets = self.l_ctr_aux_offsets
+        bvkcell = self.bvkcell
+
+        log = logger.new_logger(cell, verbose)
+        cput0 = log.init_timer()
+        rcut = estimate_rcut(cell, auxcell, self.omega).max()
+        Ls = cp.asarray(bvkcell.get_lattice_Ls(rcut=rcut))
+        Ls = Ls[cp.linalg.norm(Ls-.5, axis=1).argsort()]
+        nimgs = len(Ls)
log.debug('int3c2e_kernel rcut = %g, nimgs = %d', rcut, nimgs) + + if cutoff is None: + omega = cell.omega + aux_exp, _, aux_l = most_diffused_pgto(auxcell) + cell_exp, _, cell_l = most_diffused_pgto(cell) + if omega == 0: + theta = 1./(1./cell_exp + 1./aux_exp) + else: + theta = 1./(1./cell_exp + 1./aux_exp + omega**-2) + lsum = cell_l * 2 + aux_l + 1 + rad = cell.vol**(-1./3) * rcut + 1 + surface = 4*np.pi * rad**2 + lattice_sum_factor = 2*np.pi*rcut*lsum/(cell.vol*theta) + surface + cutoff = cell.precision / lattice_sum_factor + log.debug1('int3c_kernel integral omega=%g theta=%g cutoff=%g', + omega, theta, cutoff) + + _atm_cpu, _bas_cpu, _env_cpu = conc_env( + bvkcell._atm, bvkcell._bas, _scale_sp_ctr_coeff(bvkcell), + auxcell._atm, auxcell._bas, _scale_sp_ctr_coeff(auxcell)) + #NOTE: PTR_BAS_COORD is not updated in conc_env() + off = _bas_cpu[bvkcell.nbas,PTR_EXP] - auxcell._bas[0,PTR_EXP] + _bas_cpu[bvkcell.nbas:,PTR_BAS_COORD] += off + + bvk_ao_loc = bvkcell.ao_loc + aux_loc = auxcell.ao_loc + + _atm = cp.array(_atm_cpu, dtype=np.int32) + _bas = cp.array(_bas_cpu, dtype=np.int32) + _env = cp.array(_env_cpu, dtype=np.float64) + ao_loc = _conc_locs(bvk_ao_loc, aux_loc) + bvk_ncells = bvkcell.nbas // cell.nbas + int3c2e_envs = Int3c2eEnvVars( + cell.natm, cell.nbas, bvk_ncells, nimgs, + _atm.data.ptr, _bas.data.ptr, _env.data.ptr, ao_loc.data.ptr, + Ls.data.ptr, math.log(cutoff), + ) + # Keep a reference to these arrays, prevent releasing them upon returning the closure + int3c2e_envs._env_ref_holder = (_atm, _bas, _env, ao_loc, Ls) + + gen_img_idx = create_img_idx(cell, bvkcell, auxcell, Ls, int3c2e_envs) + + uniq_l = uniq_l_ctr[:,0] + n_groups = np.count_nonzero(uniq_l <= LMAX) + init_constant(cell) + kern = libpbc.fill_int3c2e + cp.cuda.Stream.null.synchronize() + t1 = log.timer_debug1('initialize int3c2e_kernel', *cput0) + timing_collection = {} + kern_counts = 0 + + cell_ao_loc = cell.ao_loc + di = (cell_ao_loc[l_ctr_offsets[1:]] - cell_ao_loc[l_ctr_offsets[:-1]]).max() + dk = (aux_loc[l_ctr_aux_offsets[1:]] - aux_loc[l_ctr_aux_offsets[:-1]]).max() + buf = cp.empty((bvk_ncells,di, bvk_ncells,di, dk)) + + ij_tasks = ((i, j) for i in range(n_groups) for j in range(i+1)) + for i, j in ij_tasks: + li = uniq_l[i] + lj = uniq_l[j] + ish0, ish1 = l_ctr_offsets[i], l_ctr_offsets[i+1] + jsh0, jsh1 = l_ctr_offsets[j], l_ctr_offsets[j+1] + nrow = bvk_ao_loc[ish1] - bvk_ao_loc[ish0] + ncol = bvk_ao_loc[jsh1] - bvk_ao_loc[jsh0] + img_idx, img_offsets, bas_ij_idx = gen_img_idx(ish0, ish1, jsh0, jsh1) + + for k, lk in enumerate(self.uniq_l_ctr_aux[:,0]): + ksh0, ksh1 = l_ctr_aux_offsets[k:k+2] + naux = aux_loc[ksh1] - aux_loc[ksh0] + shls_slice = ish0, ish1, jsh0, jsh1, ksh0, ksh1 + eri3c = cp.ndarray((bvk_ncells, nrow, bvk_ncells, ncol, naux), + dtype=np.float64, memptr=buf.data) + eri3c.fill(0.) 
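+                # eri3c is a view into the preallocated buffer `buf`; it is
+                # zeroed explicitly because shell pairs discarded by the
+                # image-count screening receive no writes from the kernel.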
+ lll = f'({ANGULAR[li]}{ANGULAR[lj]}|{ANGULAR[lk]})' + scheme = int3c2e_scheme(li, lj, lk) + log.debug2('int3c2e_scheme for %s: %s', lll, scheme) + err = kern( + ctypes.cast(eri3c.data.ptr, ctypes.c_void_p), + ctypes.byref(int3c2e_envs), (ctypes.c_int*3)(*scheme), + (ctypes.c_int*6)(*shls_slice), + ctypes.c_int(bvk_ncells), ctypes.c_int(nrow), + ctypes.c_int(ncol), ctypes.c_int(naux), + ctypes.c_int(bas_ij_idx.size), + ctypes.cast(bas_ij_idx.data.ptr, ctypes.c_void_p), + ctypes.cast(img_idx.data.ptr, ctypes.c_void_p), + ctypes.cast(img_offsets.data.ptr, ctypes.c_void_p), + _atm_cpu.ctypes, ctypes.c_int(bvkcell.natm), + _bas_cpu.ctypes, ctypes.c_int(bvkcell.nbas), _env_cpu.ctypes) + if err != 0: + raise RuntimeError(f'fill_int3c2e kernel for {lll} failed') + if log.verbose >= logger.DEBUG1: + t1, t1p = log.timer_debug1(f'processing {lll}', *t1), t1 + if lll not in timing_collection: + timing_collection[lll] = 0 + timing_collection[lll] += t1[1] - t1p[1] + kern_counts += 1 + yield shls_slice, eri3c + + if log.verbose >= logger.DEBUG1: + log.timer('int3c2e', *cput0) + log.debug1('kernel launches %d', kern_counts) + for lll, t in timing_collection.items(): + log.debug1('%s wall time %.2f', lll, t) + +class Int3c2eEnvVars(ctypes.Structure): + _fields_ = [ + ('cell0_natm', ctypes.c_uint16), + ('cell0_nbas', ctypes.c_uint16), + ('bvk_ncells', ctypes.c_uint16), + ('nimgs', ctypes.c_uint16), + ('atm', ctypes.c_void_p), + ('bas', ctypes.c_void_p), + ('env', ctypes.c_void_p), + ('ao_loc', ctypes.c_void_p), + ('img_coords', ctypes.c_void_p), + ('log_cutoff', ctypes.c_float), + ] + +def _conc_locs(ao_loc1, ao_loc2): + comp_loc = np.append(ao_loc1[:-1], ao_loc1[-1] + ao_loc2) + return cp.array(comp_loc, dtype=np.int32) + +def int3c2e_scheme(li, lj, lk, shm_size=SHM_SIZE): + order = li + lj + lk + nroots = (order//2 + 1) * 2 + + g_size = (li+1)*(lj+1)*(lk+1) + unit = g_size*3 + nroots*2 + 6 + nksp_max = shm_size//(unit*8) + nksp_max = _nearest_power2(nksp_max) + + nfi = (li + 1) * (li + 2) // 2 + nfj = (lj + 1) * (lj + 2) // 2 + nfk = (lk + 1) * (lk + 2) // 2 + gout_size = nfi * nfj * nfk + gout_stride = (gout_size + GOUT_WIDTH-1) // GOUT_WIDTH + # Round up to the next 2^n + gout_stride = _nearest_power2(gout_stride, return_leq=False) + + # Align nksh*gout_stride to warp size + if gout_stride < 32: + nksh_per_block = 32 // gout_stride + nsp_per_block = min(THREADS // 32, nksp_max // nksh_per_block) + else: + nksh_per_block = THREADS // gout_stride + nsp_per_block = 1 + if nksp_max < nksh_per_block: + raise RuntimeError('GOUT_WIDTH too small or not enough shared memory') + + gout_stride = THREADS // (nksh_per_block*nsp_per_block) + return nksh_per_block, gout_stride, nsp_per_block + +def most_diffused_pgto(cell): + exps, cs = extract_pgto_params(cell, 'diffused') + ls = cell._bas[:,ANG_OF] + r2 = np.log(cs**2 / cell.precision * 10**ls) / exps + idx = r2.argmax() + return exps[idx], cs[idx], ls[idx] + +# This modified rcut estimation function will be available in pyscf-2.8 or newer +def estimate_rcut(cell, auxcell, omega): + '''Estimate rcut for 3c2e SR-integrals''' + if cell.nbas == 0 or auxcell.nbas == 0: + return np.zeros(1) + + if omega == 0: + # No SR integrals in int3c2e if omega=0 + assert cell.dimension == 0 + return np.zeros(1) + + precision = cell.precision + ak, ck, lk = most_diffused_pgto(auxcell) + + # the most diffused orbital basis + cell_exps, cs = extract_pgto_params(cell, 'diffused') + ls = cell._bas[:,ANG_OF] + r2_cell = np.log(cs**2 / precision * 10**ls) / cell_exps + ai_idx = 
r2_cell.argmax()
+    ai = cell_exps[ai_idx]
+    aj = cell_exps
+    li = ls[ai_idx]
+    lj = ls
+    ci = cs[ai_idx]
+    cj = cs
+
+    aij = ai + aj
+    lij = li + lj
+    l3 = lij + lk
+    theta = 1./(omega**-2 + 1./aij + 1./ak)
+    norm_ang = ((2*li+1)*(2*lj+1))**.5/(4*np.pi)
+    c1 = ci * cj * ck * norm_ang
+    sfac = aij*aj/(aij*aj + ai*theta)
+    fl = 2
+    fac = 2**li*np.pi**2.5*c1 * theta**(l3-.5)
+    rad = cell.vol**(-1./3) * cell.rcut + 1
+    surface = 4*np.pi * rad**2
+    lattice_sum_factor = 2*np.pi*cell.rcut/(cell.vol*theta) + surface
+    fac *= lattice_sum_factor
+    fac /= aij**(li+1.5) * ak**(lk+1.5) * aj**lj
+    fac *= fl / precision
+
+    r0 = cell.rcut  # initial guess
+    r0 = (np.log(fac * (sfac*r0)**(l3-1) + 1.) / (sfac*theta))**.5
+    r0 = (np.log(fac * (sfac*r0)**(l3-1) + 1.) / (sfac*theta))**.5
+    rcut = r0
+    return rcut
+
+def guess_bvk_kmesh(cell, bvk_kmesh, target_size=BVK_CELL_SHELLS):
+    '''Generate a sufficiently large bvk cell for the fill_int3c2e kernel to
+    achieve better load balance'''
+    if bvk_kmesh is None:
+        bvk_kmesh = np.ones(3, dtype=int)
+    else:
+        bvk_kmesh = bvk_kmesh.copy()
+    bvk_ncells = np.prod(bvk_kmesh)
+
+    # produce a bvk cell with roughly target_size (~2400) shells
+    replica = target_size / (bvk_ncells * cell.nbas)
+    if replica < 1:
+        return bvk_kmesh
+
+    mesh_max = cell.nimgs * 2 + 1
+    bvk_multiplier = mesh_max / bvk_kmesh
+    if cell.dimension == 2:
+        fac = (replica / np.prod(bvk_multiplier[:2]))**.5
+        fac = min(fac, 1)
+        bvk_kmesh[:2] *= (fac * bvk_multiplier[:2]).astype(int)
+    else:
+        # The number of replicas along each axis should be proportional to
+        # the number of images required in that direction.
+        fac = (replica / np.prod(bvk_multiplier))**(1./3)
+        # The replication need not exceed the required number of images.
+        fac = min(fac, 1)
+        bvk_kmesh *= (fac * bvk_multiplier).astype(int)
+
+    return bvk_kmesh
diff --git a/gpu4pyscf/pbc/df/rsdf_builder.py b/gpu4pyscf/pbc/df/rsdf_builder.py
new file mode 100644
index 00000000..9f892504
--- /dev/null
+++ b/gpu4pyscf/pbc/df/rsdf_builder.py
@@ -0,0 +1,427 @@
+# Copyright 2024-2025 The PySCF Developers. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+'''
+Build GDF tensor using the range-separation integral algorithm.
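+
+A minimal usage sketch (cell/auxcell construction not shown):
+
+    cderi, cderi_neg = build_cderi(cell, auxcell, kpts)
+
+build_cderi dispatches to the gamma-point, j-only, or k-point builders below
+and returns dictionaries of Cholesky-decomposed ERIs keyed by (ki, kj).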
+'''
+
+import os
+import ctypes
+import warnings
+import numpy as np
+import cupy as cp
+from cupyx.scipy.linalg import solve_triangular
+from pyscf import lib
+#from pyscf.pbc import gto as pbcgto
+#from pyscf.pbc.gto import pseudo
+from pyscf.pbc.tools import pbc as pbctools
+from pyscf.pbc.lib.kpts_helper import is_zero
+from pyscf.pbc.df.rsdf_builder import (
+    RCUT_THRESHOLD, estimate_ke_cutoff_for_omega)
+from pyscf.pbc.df import aft as aft_cpu
+from gpu4pyscf.lib import logger
+from gpu4pyscf.lib.cupy_helper import contract, get_avail_mem
+from gpu4pyscf.pbc.df import ft_ao
+from gpu4pyscf.pbc.lib.kpts_helper import kk_adapted_iter
+from gpu4pyscf.pbc.tools.k2gamma import kpts_to_kmesh
+from gpu4pyscf.pbc.gto.cell import extract_pgto_params
+from gpu4pyscf.pbc.df.int3c2e import sr_aux_e2, estimate_rcut
+
+OMEGA_MIN = 0.3
+
+# In the ED of the j2c2e metric, the default LINEAR_DEP_THR setting in pyscf-2.8
+# is too loose; the linear-dependency truncation often leads to serious errors.
+# PBC GDF differs greatly from the molecular GDF approximation, where diffuse
+# functions typically make insignificant contributions. Diffuse auxiliary
+# crystal orbitals have a large impact on the accuracy of the Coulomb
+# integrals. A tight linear-dependency threshold has to be applied to control
+# the error, even though this may cause more numerical stability issues.
+LINEAR_DEP_THR = 1e-11
+# Use eigenvalue decomposition in decompose_j2c
+PREFER_ED = False
+
+def build_cderi(cell, auxcell, kpts=None, j_only=False,
+                omega=None, linear_dep_threshold=LINEAR_DEP_THR):
+    assert cell.low_dim_ft_type != 'inf_vacuum'
+    assert cell.dimension >= 2
+    if cell.omega != 0:
+        assert cell.omega < 0
+        omega = abs(cell.omega)
+        with_long_range = False
+    else:
+        if omega is None:
+            cell_exps, cs = extract_pgto_params(cell, 'diffused')
+            omega = cell_exps.min()**.5
+            logger.debug(cell, 'omega guess in rsdf_builder = %g', omega)
+        omega = abs(omega)
+        with_long_range = True
+
+    if kpts is None or is_zero(kpts):
+        return build_cderi_gamma_point(
+            cell, auxcell, omega, with_long_range, linear_dep_threshold)
+    elif j_only:
+        return build_cderi_j_only(
+            cell, auxcell, kpts, omega, with_long_range, linear_dep_threshold)
+    else:
+        return build_cderi_kk(
+            cell, auxcell, kpts, omega, with_long_range, linear_dep_threshold)
+
+def build_cderi_kk(cell, auxcell, kpts, omega=OMEGA_MIN, with_long_range=True,
+                   linear_dep_threshold=LINEAR_DEP_THR):
+    log = logger.new_logger(cell)
+    t0 = log.init_timer()
+    if kpts is None:
+        kpts = np.zeros((1, 3))
+        bvk_kmesh = kmesh = np.ones(3, dtype=int)
+    else:
+        # Remote images may contribute for certain k-point meshes, affecting
+        # the finite-size behavior of HFX. For a sufficiently large number of
+        # kpts, the truncation radius cell.rcut may cause finite-size errors.
+        kpts = kpts.reshape(-1, 3)
+        rcut = estimate_rcut(cell, auxcell, omega).max()
+        bvk_kmesh = kmesh = kpts_to_kmesh(cell, kpts, rcut=rcut)
+        if len(kpts) != np.prod(kmesh):
+            # When targeting many kpts, the number of kpts can exceed the
+            # number of BvK images. Use a larger radius to regenerate the MP
+            # kmesh; the new MP kmesh should cover all kpts.
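+            # The factor of 20 below is a heuristic; any radius large enough
+            # to make the regenerated Monkhorst-Pack mesh a superset of the
+            # requested kpts would serve.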
+ kmesh = kpts_to_kmesh(cell, kpts, rcut=rcut*20) + j3c = sr_aux_e2(cell, auxcell, -omega, kpts, bvk_kmesh) + t1 = log.timer('pass1: int3c2e', *t0) + + kpt_iters = list(kk_adapted_iter(kmesh)) + uniq_kpts = kpts[[x[0] for x in kpt_iters]] + log.debug('Generate auxcell 2c2e integrals') + j2c = _get_2c2e(auxcell, uniq_kpts, omega, with_long_range) # on CPU + t1 = log.timer('int2c2e', *t1) + + if with_long_range: + ft_ao_iter = _ft_ao_iter_generator(cell, auxcell, bvk_kmesh, omega, log) + + prefer_ed = PREFER_ED + if cell.dimension == 2: + prefer_ed = True + cderi = {} + cderip = {} + for j2c_idx, (kp, kp_conj, ki_idx, kj_idx) in enumerate(kpt_iters): + log.debug1('make_cderi for k-point %d %s', kp, kpts[kp]) + log.debug1('ki_idx = %s', ki_idx) + log.debug1('kj_idx = %s', kj_idx) + + if with_long_range: + '''exp(-i*(G + k) dot r) * Coulomb_kernel''' + for pqG, auxG_conj in ft_ao_iter(kpts[kp], kpts[kj_idx]): + # \sum_G coulG * ints(ij * exp(-i G * r)) * ints(P * exp(i G * r)) + # = \sum_G FT(ij, G) conj(FT(aux, G)) , where aux + # functions |P> are assumed to be real + j3c[ki_idx,kj_idx] += contract('kpqG,Gr->kpqr', pqG, auxG_conj) + + j2c_k = j2c[j2c_idx] + if kp == kp_conj: # self conjugated + # DF metric for self-conjugated k-point should be real + j2c_k = j2c_k.real + cd_j2c, cd_j2c_negative, j2ctag = decompose_j2c( + j2c_k, prefer_ed, linear_dep_threshold) + if cd_j2c.dtype != j3c.dtype: + cd_j2c = cd_j2c.astype(j3c.dtype) + + for ki, kj in zip(ki_idx, kj_idx): + j3c_k = j3c[ki,kj] + cderi[ki,kj] = _solve_cderi(cd_j2c, j3c_k, j2ctag) + if cd_j2c_negative is not None: + assert cell.dimension == 2 + cderip[ki,kj] = _solve_cderi(cd_j2c_negative, j3c_k, j2ctag) + t1 = log.timer('pass2: solve cderi', *t1) + return cderi, cderip + +def build_cderi_gamma_point(cell, auxcell, omega=OMEGA_MIN, with_long_range=True, + linear_dep_threshold=LINEAR_DEP_THR): + log = logger.new_logger(cell) + t0 = log.init_timer() + kmesh = None + kpts = None + + j3c = sr_aux_e2(cell, auxcell, -omega) + t1 = log.timer('pass1: int3c2e', *t0) + + log.debug('Generate auxcell 2c2e integrals') + j2c = _get_2c2e(auxcell, kpts, omega, with_long_range) # on CPU + j2c = j2c[0].real + t1 = log.timer('int2c2e', *t1) + + cderi = {} + cderip = {} + if with_long_range: + ft_ao_iter = _ft_ao_iter_generator(cell, auxcell, kmesh, omega, log) + for pqG, auxG_conj in ft_ao_iter(): + # \sum_G coulG * ints(ij * exp(-i G * r)) * ints(P * exp(i G * r)) + # = \sum_G FT(ij, G) conj(FT(aux, G)) , where aux + # functions |P> are assumed to be real + j3c += contract('pqG,Gr->pqr', pqG[0], auxG_conj).real + + prefer_ed = PREFER_ED + if cell.dimension == 2: + prefer_ed = True + cd_j2c, cd_j2c_negative, j2ctag = decompose_j2c( + j2c, prefer_ed, linear_dep_threshold) + + cderi[0,0] = _solve_cderi(cd_j2c, j3c, j2ctag) + if cd_j2c_negative is not None: + assert cell.dimension == 2 + cderip[0,0] = _solve_cderi(cd_j2c_negative, j3c, j2ctag) + t1 = log.timer('pass2: solve cderi', *t1) + return cderi, cderip + +def build_cderi_j_only(cell, auxcell, kpts, omega=OMEGA_MIN, with_long_range=True, + linear_dep_threshold=LINEAR_DEP_THR): + log = logger.new_logger(cell) + t0 = log.init_timer() + if kpts is None: + kpts = np.zeros((1, 3)) + bvk_kmesh = np.ones(3, dtype=int) + else: + # Coulomb integrals requires smaller kmesh to converge finite-size effects. + # A relatively small bvk_kmesh can be used for Coulomb integrals. 
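+        # Unlike build_cderi_kk, no rcut-based enlargement of the kmesh is
+        # applied here; the default mesh from kpts_to_kmesh should be
+        # sufficient for the j-only (Coulomb) integrals.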
+        kpts = kpts.reshape(-1, 3)
+        bvk_kmesh = kpts_to_kmesh(cell, kpts)
+    # TODO: time-reversal symmetry in j3c, j2c
+    j3c = sr_aux_e2(cell, auxcell, -omega, kpts, bvk_kmesh, j_only=True)
+    t1 = log.timer('pass1: int3c2e', *t0)
+
+    log.debug('Generate auxcell 2c2e integrals')
+    j2c = _get_2c2e(auxcell, None, omega, with_long_range)  # on CPU
+    j2c = j2c[0].real
+    t1 = log.timer('int2c2e', *t1)
+
+    # TODO: consider time-reversal symmetry
+    cderi = {}
+    cderip = {}
+    if with_long_range:
+        ft_ao_iter = _ft_ao_iter_generator(cell, auxcell, bvk_kmesh, omega, log)
+        kpt = np.zeros(3)
+        for pqG, auxG_conj in ft_ao_iter(kpt, kpts):
+            # \sum_G coulG * ints(ij * exp(-i G * r)) * ints(P * exp(i G * r))
+            # = \sum_G FT(ij, G) conj(FT(aux, G)) , where aux
+            # functions |P> are assumed to be real
+            j3c += contract('kpqG,Gr->kpqr', pqG, auxG_conj)
+
+    prefer_ed = PREFER_ED
+    if cell.dimension == 2:
+        prefer_ed = True
+    cd_j2c, cd_j2c_negative, j2ctag = decompose_j2c(
+        j2c, prefer_ed, linear_dep_threshold)
+    if cd_j2c.dtype != j3c.dtype:
+        cd_j2c = cd_j2c.astype(j3c.dtype)
+
+    nkpts = len(kpts)
+    for k in range(nkpts):
+        cderi[k, k] = _solve_cderi(cd_j2c, j3c[k], j2ctag)
+        if cd_j2c_negative is not None:
+            assert cell.dimension == 2
+            cderip[k, k] = _solve_cderi(cd_j2c_negative, j3c[k], j2ctag)
+    t1 = log.timer('pass2: solve cderi', *t1)
+    return cderi, cderip
+
+def _weighted_coulG_LR(cell, Gv, omega, kws, kpt=np.zeros(3)):
+    coulG = pbctools.get_coulG(cell, kpt, exx=False, Gv=Gv, omega=abs(omega))
+    coulG *= kws
+    if is_zero(kpt):
+        assert Gv[0].dot(Gv[0]) == 0
+        coulG[0] -= np.pi / omega**2 / cell.vol
+    return cp.asarray(coulG)
+
+def _ft_ao_iter_generator(cell, auxcell, bvk_kmesh, omega, verbose=None):
+    ke_cutoff = estimate_ke_cutoff_for_omega(cell, omega)
+    mesh = cell.cutoff_to_mesh(ke_cutoff)
+    mesh = cell.symmetrize_mesh(mesh)
+    Gv, Gvbase, kws = cell.get_Gv_weights(mesh)
+    ngrids = len(Gv)
+    nao = cell.nao
+
+    ft_opt = ft_ao.FTOpt(cell, bvk_kmesh=bvk_kmesh)
+    ft_kern = ft_opt.gen_ft_kernel(verbose=verbose)
+    if bvk_kmesh is None:
+        bvk_ncells = 1
+    else:
+        bvk_ncells = np.prod(bvk_kmesh)
+    avail_mem = get_avail_mem() * .8
+    Gblksize = max(16, int(avail_mem/(2*16*nao**2*bvk_ncells))//8*8)
+    Gblksize = min(Gblksize, ngrids, 16384)
+    #logger.debug1(cell, 'Gblksize = %d', Gblksize)
+    def ft_ao_iter(kpt=np.zeros(3), kpts=None):
+        coulG = _weighted_coulG_LR(auxcell, Gv, omega, kws, kpt)
+        auxG_conj = cp.asarray(ft_ao.ft_ao(auxcell, Gv, kpt=kpt).conj(), order='C')
+        auxG_conj *= cp.asarray(coulG[:,None])
+        for p0, p1 in lib.prange(0, ngrids, Gblksize):
+            pqG = ft_kern(Gv[p0:p1], kpt, kpts).transpose(0,2,3,1)
+            yield pqG, auxG_conj[p0:p1]
+    return ft_ao_iter
+
+def decompose_j2c(j2c, prefer_ed=PREFER_ED, linear_dep_threshold=LINEAR_DEP_THR):
+    if prefer_ed:
+        return eigenvalue_decomposed_metric(j2c, linear_dep_threshold)
+    else:
+        return cholesky_decomposed_metric(j2c)
+
+def cholesky_decomposed_metric(j2c):
+    '''Return L for j2c = L L^T'''
+    j2c_negative = None
+    j2ctag = 'CD'
+    # CuPy's cholesky does not check positive definiteness; it appears to
+    # silently return NaN entries in the resulting Cholesky factor.
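+    # The isnan check on the last diagonal element after the factorization
+    # below therefore serves as a cheap positive-definiteness test.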
+ j2c = cp.asarray(j2c) + j2c = cp.linalg.cholesky(j2c) + if cp.isnan(j2c[-1,-1]): + raise RuntimeError('j2c is not positive definite') + return j2c, j2c_negative, j2ctag + +def eigenvalue_decomposed_metric(j2c, linear_dep_threshold=LINEAR_DEP_THR): + j2c = cp.asarray(j2c) + w, v = cp.linalg.eigh(j2c) + mask = w > linear_dep_threshold + v1 = v[:,mask].conj().T + v1 *= w[mask, None]**-.5 + j2c = v1 + idx = cp.where(w < -linear_dep_threshold)[0] + j2c_negative = None + if len(idx) > 0: + j2c_negative = (v[:,idx] * (-w[idx])**-.5).conj().T + j2ctag = 'ED' + return j2c, j2c_negative, j2ctag + +# Create 2c2e, store on CPU +def _get_2c2e(auxcell, uniq_kpts, omega, with_long_range=True): + # j2c ~ (-kpt_ji | kpt_ji) => hermi=1 + precision = auxcell.precision ** 1.5 + aux_exps, aux_cs = extract_pgto_params(auxcell, 'diffused') + aux_exp = aux_exps.min() + theta = 1./(2./aux_exp + omega**-2) + rad = auxcell.vol**(-1./3) * auxcell.rcut + 1 + surface = 4*np.pi * rad**2 + lattice_sum_factor = 2*np.pi*auxcell.rcut/(auxcell.vol*theta) + surface + rcut_sr = (np.log(lattice_sum_factor / precision + 1.) / theta)**.5 + logger.debug1(auxcell, 'auxcell rcut_sr = %g', rcut_sr) + auxcell_sr = auxcell.copy() + auxcell_sr.rcut = rcut_sr + with auxcell_sr.with_short_range_coulomb(omega): + j2c = auxcell_sr.pbc_intor('int2c2e', hermi=1, kpts=uniq_kpts) + + if not with_long_range: + return j2c + + ke = estimate_ke_cutoff_for_omega(auxcell, omega, precision) + mesh = auxcell.cutoff_to_mesh(ke) + mesh = auxcell.symmetrize_mesh(mesh) + logger.debug(auxcell, 'Set 2c2e integrals precision %g, mesh %s', precision, mesh) + + Gv, Gvbase, kws = auxcell.get_Gv_weights(mesh) + b = auxcell.reciprocal_vectors() + gxyz = lib.cartesian_prod([np.arange(len(x)) for x in Gvbase]) + ngrids = Gv.shape[0] + naux = auxcell.nao + max_memory = max(1000, auxcell.max_memory - lib.current_memory()[0]) + blksize = min(ngrids, int(max_memory*.4e6/16/naux), 200000) + logger.debug2(auxcell, 'max_memory %s (MB) blocksize %s', max_memory, blksize) + + if uniq_kpts is None: + j2c = cp.asarray(j2c) + coulG_LR = _weighted_coulG_LR(auxcell, Gv, omega, kws) + for p0, p1 in lib.prange(0, ngrids, blksize): + auxG = ft_ao.ft_ao(auxcell, Gv[p0:p1], None, b, gxyz[p0:p1], Gvbase).T + j2c += (auxG.conj() * coulG_LR[p0:p1]).dot(auxG.T).real + auxG = None + j2c = [j2c.real.get()] + else: + for k, kpt in enumerate(uniq_kpts): + j2c_k = cp.asarray(j2c[k]) + coulG_LR = _weighted_coulG_LR(auxcell, Gv, omega, kws, kpt) + gamma_point = is_zero(kpt) + + for p0, p1 in lib.prange(0, ngrids, blksize): + auxG = ft_ao.ft_ao(auxcell, Gv[p0:p1], None, b, gxyz[p0:p1], Gvbase, kpt).T + if gamma_point: + j2c_k += (auxG.conj() * coulG_LR[p0:p1]).dot(auxG.T).real + else: + j2c_k += (auxG.conj() * coulG_LR[p0:p1]).dot(auxG.T) + auxG = None + j2c[k] = j2c_k.get() + return j2c + +def _solve_cderi(cd_j2c, j3c, j2ctag): + if j2ctag == 'ED': + return contract('Lr,pqr->Lpq', cd_j2c, j3c) + else: + nao, naux = j3c.shape[1:3] + j3c = solve_triangular(cd_j2c, j3c.reshape(-1,naux).T, lower=True) + return j3c.reshape(naux,nao,nao) + +def get_pp_loc_part1(cell, kpts=None, with_pseudo=True, verbose=None): + fakenuc = aft_cpu._fake_nuc(cell, with_pseudo=with_pseudo) + cell_exps, cs = extract_pgto_params(cell, 'diffused') + omega = (2*cell_exps.min())**.5 + logger.debug(cell, 'omega guess in get_pp_loc_part1 = %g', omega) + + if kpts is None or is_zero(kpts): + kpts = None + bvk_kmesh = np.ones(3, dtype=int) + else: + bvk_kmesh = kpts_to_kmesh(cell, kpts) + nuc = sr_aux_e2(cell, fakenuc, -omega, 
kpts, bvk_kmesh, j_only=True) + charges = -cp.asarray(cell.atom_charges()) + if kpts is None: + nuc = contract('pqr,r->pq', nuc, charges) + else: + nuc = contract('kpqr,r->kpq', nuc, charges) + + # TODO: consider time-reversal symmetry + ft_ao_iter = _ft_ao_iter_generator(cell, fakenuc, bvk_kmesh, omega, verbose) + kpt = np.zeros(3) + for i, (pqG, auxG_conj) in enumerate(ft_ao_iter(kpt, kpts)): + ZG = auxG_conj.dot(charges) + # contributions due to pseudo.pp_int.get_gth_vlocG_part1 + if (with_pseudo and i == 0 and + (cell.dimension == 3 or + (cell.dimension == 2 and cell.low_dim_ft_type != 'inf_vacuum'))): + exps = cp.asarray(np.hstack(fakenuc.bas_exps())) + ZG[0] -= charges.dot(np.pi/exps) / cell.vol + if kpts is None: + nuc += contract('pqG,G->pq', pqG[0], ZG).real + else: + nuc += contract('kpqG,G->kpq', pqG, ZG) + return nuc + +def get_nuc(cell, kpts=None): + '''Get the periodic nuc-el AO matrix, with G=0 removed. + ''' + log = logger.new_logger(cell) + t0 = log.init_timer() + nuc = get_pp_loc_part1(cell, kpts, with_pseudo=False, verbose=log) + log.timer('get_nuc', *t0) + return nuc + +def get_pp(cell, kpts=None): + '''Get the periodic pseudopotential nuc-el ao matrix, with G=0 removed. + ''' + from pyscf.pbc.gto import pseudo + log = logger.new_logger(cell) + t0 = log.init_timer() + pp2builder = aft_cpu._IntPPBuilder(cell, kpts) + vpp = cp.asarray(pp2builder.get_pp_loc_part2()) + t1 = log.timer_debug1('get_pp_loc_part2', *t0) + vpp += cp.asarray(pseudo.pp_int.get_pp_nl(cell, kpts)) + t1 = log.timer_debug1('get_pp_nl', *t1) + + vpp += get_pp_loc_part1(cell, kpts, with_pseudo=True, verbose=log) + t1 = log.timer_debug1('get_pp_loc_part1', *t1) + log.timer('get_pp', *t0) + return vpp diff --git a/gpu4pyscf/pbc/df/tests/test_pbc_aft.py b/gpu4pyscf/pbc/df/tests/test_pbc_aft.py index 6ca1d627..98ddad61 100644 --- a/gpu4pyscf/pbc/df/tests/test_pbc_aft.py +++ b/gpu4pyscf/pbc/df/tests/test_pbc_aft.py @@ -1,4 +1,4 @@ -# Copyright 2021-2024 The PySCF Developers. All Rights Reserved. +# Copyright 2024-2025 The PySCF Developers. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -29,10 +29,9 @@ def setUpModule(): 'C' :[[0, [1., 1]]],} cell.pseudo = {'C':'gth-pade'} cell.a = np.eye(3) * 2.5 + cell.precision = 1e-8 cell.build() - np.random.seed(1) - kpts = np.random.random((4,3)) - kpts[3] = kpts[0]-kpts[1]+kpts[2] + kpts = cell.make_kpts([13,1,1])[4:8] cell1 = pgto.Cell() cell1.atom = 'He 1. 
.5 .5; He .1 1.3 2.1' @@ -49,22 +48,22 @@ class KnownValues(unittest.TestCase): def test_aft_get_pp(self): ref = aft_cpu.AFTDF(cell, kpts[0]).get_pp() v1 = aft.AFTDF(cell, kpts[0]).get_pp().get() - assert abs(v1 - ref).max() < 1e-12 + assert abs(v1 - ref).max() < 1e-9 kpts4 = cell.make_kpts([4,1,1]) ref = aft_cpu.AFTDF(cell, kpts4).get_pp() v1 = aft.AFTDF(cell, kpts4).get_pp().get() - assert abs(v1 - ref).max() < 1e-12 + assert abs(v1 - ref).max() < 1e-9 def test_aft_get_nuc(self): ref = aft_cpu.AFTDF(cell, kpts[0]).get_nuc() v1 = aft.AFTDF(cell, kpts[0]).get_nuc().get() - assert abs(v1 - ref).max() < 1e-12 + assert abs(v1 - ref).max() < 1e-9 kpts4 = cell.make_kpts([4,1,1]) ref = aft_cpu.AFTDF(cell, kpts4).get_nuc() v1 = aft.AFTDF(cell, kpts4).get_nuc().get() - assert abs(v1 - ref).max() < 1e-12 + assert abs(v1 - ref).max() < 1e-9 def test_jk(self): mesh = [11]*3 @@ -76,15 +75,15 @@ def test_jk(self): dm = np.random.random((nao,nao)) jref, kref = mydf0.get_jk(dm, hermi=0, exxdiv='ewald') vj, vk = mydf.get_jk(dm, hermi=0, exxdiv='ewald') - assert abs(vj.get() - jref).max() < 1e-12 - assert abs(vk.get() - kref).max() < 1e-12 + assert abs(vj.get() - jref).max() < 1e-9 + assert abs(vk.get() - kref).max() < 1e-9 dm = dm + np.random.random((nao,nao)) * 1j dm = dm + dm.conj().T jref, kref = mydf0.get_jk(dm, hermi=1, exxdiv='ewald') vj, vk = mydf.get_jk(dm, hermi=1, exxdiv='ewald') - assert abs(vj.get() - jref).max() < 1e-12 - assert abs(vk.get() - kref).max() < 1e-12 + assert abs(vj.get() - jref).max() < 1e-9 + assert abs(vk.get() - kref).max() < 1e-9 def test_jk_complex_dm(self): scaled_center = [0.3728,0.5524,0.7672] @@ -98,14 +97,14 @@ def test_jk_complex_dm(self): dm = np.random.random((nao,nao)) + np.random.random((nao,nao)) * 1j jref, kref = mydf0.get_jk(dm, hermi=0, kpts=kpt, exxdiv='ewald') vj, vk = mydf.get_jk(dm, hermi=0, kpts=kpt, exxdiv='ewald') - assert abs(vj.get() - jref).max() < 1e-12 - assert abs(vk.get() - kref).max() < 1e-12 + assert abs(vj.get() - jref).max() < 1e-9 + assert abs(vk.get() - kref).max() < 1e-9 dm = dm + dm.conj().T jref, kref = mydf0.get_jk(dm, hermi=1, kpts=kpt, exxdiv='ewald') vj, vk = mydf.get_jk(dm, hermi=1, kpts=kpt, exxdiv='ewald') - assert abs(vj.get() - jref).max() < 1e-12 - assert abs(vk.get() - kref).max() < 1e-12 + assert abs(vj.get() - jref).max() < 1e-9 + assert abs(vk.get() - kref).max() < 1e-9 def test_aft_j(self): kpts = np.random.random((4,3)) @@ -120,7 +119,7 @@ def test_aft_j(self): dm = dm + dm.transpose(0,2,1) jref = mydf0.get_jk(dm, with_k=False)[0] vj = mydf.get_jk(dm, with_k=False)[0] - assert abs(vj.get() - jref).max() < 1e-12 + assert abs(vj.get() - jref).max() < 1e-9 def test_aft_k(self): kpts = cell.get_abs_kpts([[-.25,-.25,-.25], @@ -141,7 +140,7 @@ def test_aft_k(self): dm = np.random.random((nkpts,nao,nao)) kref = mydf0.get_jk(dm, hermi=0, with_j=False)[1] vk = mydf.get_jk(dm, hermi=0, with_j=False)[1] - assert abs(vk.get() - kref).max() < 1e-12 + assert abs(vk.get() - kref).max() < 1e-9 def test_aft_k1(self): kpts = cell.get_abs_kpts([[-.25,-.25,-.25], @@ -163,7 +162,7 @@ def test_aft_k1(self): dm = dm + dm.transpose(0,2,1) kref = mydf0.get_jk(dm, hermi=1, with_j=False)[1] vk = mydf.get_jk(dm, hermi=1, with_j=False)[1] - assert abs(vk.get() - kref).max() < 1e-12 + assert abs(vk.get() - kref).max() < 1e-9 def test_aft_k2(self): kpts = cell.make_kpts([2,1,1]) @@ -183,7 +182,7 @@ def test_aft_k2(self): kref = mydf0.get_jk(dm, hermi=1, with_j=False)[1] vk = mydf.get_jk(dm, hermi=1, with_j=False)[1] - assert abs(vk.get() - kref).max() 
< 1e-12 + assert abs(vk.get() - kref).max() < 1e-9 def test_aft_k3(self): kpts = cell.make_kpts([6,1,1]) @@ -205,7 +204,7 @@ def test_aft_k3(self): kref = mydf0.get_jk(dm, hermi=1, with_j=False)[1] vk = mydf.get_jk(dm, hermi=1, with_j=False)[1] - assert abs(vk.get() - kref).max() < 1e-12 + assert abs(vk.get() - kref).max() < 1e-9 if __name__ == '__main__': print("Full Tests for aft") diff --git a/gpu4pyscf/pbc/df/tests/test_pbc_df.py b/gpu4pyscf/pbc/df/tests/test_pbc_df.py index e89cc8a0..fcc22837 100644 --- a/gpu4pyscf/pbc/df/tests/test_pbc_df.py +++ b/gpu4pyscf/pbc/df/tests/test_pbc_df.py @@ -1,4 +1,4 @@ -# Copyright 2021-2024 The PySCF Developers. All Rights Reserved. +# Copyright 2024-2025 The PySCF Developers. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -28,6 +28,7 @@ def setUpModule(): 'C' :[[0, [1., 1]]],} cell.pseudo = {'C':'gth-pade'} cell.a = np.eye(3) * 2.5 + cell.precision = 1e-8 cell.build() def tearDownModule(): @@ -37,15 +38,19 @@ def tearDownModule(): class KnownValues(unittest.TestCase): def test_get_pp(self): - kpt = cell.make_kpts([9,6,5])[107] - ref = df_cpu.GDF(cell, kpt).get_pp() - v1 = GDF(cell, kpt).get_pp().get() - assert abs(v1 - ref).max() < 1e-12 + #kpt = cell.make_kpts([9,6,5])[107] + #ref = df_cpu.GDF(cell, kpt).get_pp() + #v1 = GDF(cell, kpt).get_pp().get() + #assert abs(v1 - ref).max() < 1e-8 + + ref = df_cpu.GDF(cell).get_pp() + v1 = GDF(cell).get_pp().get() + assert abs(v1 - ref).max() < 1e-8 kpts4 = cell.make_kpts([4,1,1]) ref = df_cpu.GDF(cell, kpts4).get_pp() v1 = GDF(cell, kpts4).get_pp().get() - assert abs(v1 - ref).max() < 1e-12 + assert abs(v1 - ref).max() < 1e-8 def test_get_nuc(self): L = 5. @@ -56,18 +61,18 @@ def test_get_nuc(self): cell1.atom = '''He 3. 2. 3. He 1. 1. 
1.''' cell1.basis = 'ccpvdz' - cell1.precision=1e-12 + cell1.precision=1e-8 cell1.verbose = 0 cell1.max_memory = 1000 cell1.build(0,0) ref = df_cpu.GDF(cell1).get_nuc() v1 = GDF(cell1).get_nuc().get() - assert abs(v1 - ref).max() < 1e-12 + assert abs(v1 - ref).max() < 1e-8 kpts4 = cell1.make_kpts([4,1,1]) ref = df_cpu.GDF(cell1, kpts4).get_nuc() v1 = GDF(cell1, kpts4).get_nuc().get() - assert abs(v1 - ref).max() < 1e-12 + assert abs(v1 - ref).max() < 1e-8 def test_jk(self): mydf0 = df_cpu.GDF(cell) @@ -78,16 +83,38 @@ def test_jk(self): dm = np.random.random((nao,nao)) jref, kref = mydf0.get_jk(dm, hermi=0, exxdiv='ewald') vj, vk = mydf.get_jk(dm, hermi=0, exxdiv='ewald') - assert abs(vj.get() - jref).max() < 1e-12 - assert abs(vk.get() - kref).max() < 1e-12 + assert abs(vj.get() - jref).max() < 1e-8 + assert abs(vk.get() - kref).max() < 1e-8 dm = dm + np.random.random((nao,nao)) * 1j dm = dm + dm.conj().T jref, kref = mydf0.get_jk(dm, hermi=1, exxdiv='ewald') vj, vk = mydf.get_jk(dm, hermi=1, exxdiv='ewald') - assert abs(vj.get() - jref).max() < 1e-12 - assert abs(vk.get() - kref).max() < 1e-12 + assert abs(vj.get() - jref).max() < 1e-8 + assert abs(vk.get() - kref).max() < 1e-8 + + def test_jk1(self): + kpts = cell.make_kpts([1,6,1]) + nkpts = len(kpts) + mydf0 = df_cpu.GDF(cell, kpts) + mydf = GDF(cell, kpts) + + nao = cell.nao + np.random.seed(12) + dm = (np.random.random((nkpts, nao, nao)) + + np.random.random((nkpts, nao, nao))*1j) + jref, kref = mydf0.get_jk(dm, hermi=0, exxdiv='ewald') + vj, vk = mydf.get_jk(dm, hermi=0, exxdiv='ewald') + assert abs(vj.get() - jref).max() < 1e-8 + assert abs(vk.get() - kref).max() < 1e-8 + + dm = dm + dm.conj().transpose(0,2,1) + jref, kref = mydf0.get_jk(dm, hermi=1, exxdiv='ewald') + vj, vk = mydf.get_jk(dm, hermi=1, exxdiv='ewald') + assert abs(vj.get() - jref).max() < 1e-8 + assert abs(vk.get() - kref).max() < 1e-8 + @unittest.skip('pbc-gdf only supports Monkhorst-Pack k-mesh') def test_jk_complex_dm(self): scaled_center = [0.3728,0.5524,0.7672] kpt = cell.make_kpts([1,1,1], scaled_center=scaled_center)[0] @@ -99,15 +126,16 @@ def test_jk_complex_dm(self): dm = np.random.random((nao,nao)) + np.random.random((nao,nao)) * 1j jref, kref = mydf0.get_jk(dm, hermi=0, kpts=kpt, exxdiv='ewald') vj, vk = mydf.get_jk(dm, hermi=0, kpts=kpt, exxdiv='ewald') - assert abs(vj.get() - jref).max() < 1e-12 - assert abs(vk.get() - kref).max() < 1e-12 + assert abs(vj.get() - jref).max() < 1e-8 + assert abs(vk.get() - kref).max() < 1e-8 dm = dm + dm.conj().T jref, kref = mydf0.get_jk(dm, hermi=1, kpts=kpt, exxdiv='ewald') vj, vk = mydf.get_jk(dm, hermi=1, kpts=kpt, exxdiv='ewald') - assert abs(vj.get() - jref).max() < 1e-12 - assert abs(vk.get() - kref).max() < 1e-12 + assert abs(vj.get() - jref).max() < 1e-8 + assert abs(vk.get() - kref).max() < 1e-8 + @unittest.skip('pbc-gdf only supports Monkhorst-Pack k-mesh') def test_get_j(self): kpts = np.random.random((4,3)) nkpts = len(kpts) @@ -120,8 +148,9 @@ def test_get_j(self): dm = dm + dm.transpose(0,2,1) jref = mydf0.get_jk(dm, with_k=False)[0] vj = mydf.get_jk(dm, with_k=False)[0] - assert abs(vj.get() - jref).max() < 1e-12 + assert abs(vj.get() - jref).max() < 1e-8 + @unittest.skip('pbc-gdf only supports Monkhorst-Pack k-mesh') def test_get_k(self): kpts = cell.get_abs_kpts([[-.25,-.25,-.25], [-.25,-.25, .25], @@ -140,8 +169,9 @@ def test_get_k(self): dm = np.random.random((nkpts,nao,nao)) kref = mydf0.get_jk(dm, hermi=0, with_j=False)[1] vk = mydf.get_jk(dm, hermi=0, with_j=False)[1] - assert abs(vk.get() - 
kref).max() < 1e-12 + assert abs(vk.get() - kref).max() < 1e-8 + @unittest.skip('pbc-gdf only supports Monkhorst-Pack k-mesh') def test_get_k1(self): kpts = cell.get_abs_kpts([[-.25,-.25,-.25], [-.25,-.25, .25], @@ -161,11 +191,10 @@ def test_get_k1(self): dm = dm + dm.transpose(0,2,1) kref = mydf0.get_jk(dm, hermi=1, with_j=False)[1] vk = mydf.get_jk(dm, hermi=1, with_j=False)[1] - assert abs(vk.get() - kref).max() < 1e-12 + assert abs(vk.get() - kref).max() < 1e-8 - @unittest.skip('build_k from MO coefficients') def test_get_k2(self): - kpts = cell.make_kpts([2,1,1]) + kpts = cell.make_kpts([3,1,1]) nkpts = len(kpts) mydf0 = df_cpu.GDF(cell, kpts=kpts) mydf = GDF(cell, kpts=kpts) @@ -176,14 +205,13 @@ def test_get_k2(self): mo = (np.random.random((nkpts,nao,nocc)) + np.random.random((nkpts,nao,nocc))*1j) mo_occ = np.ones((nkpts,nocc)) - dm = np.random.rand(nkpts, nao, nao) + dm = np.einsum('kpi,kqi->kpq', mo, mo.conj()) dm = lib.tag_array(dm, mo_coeff=mo, mo_occ=mo_occ) kref = mydf0.get_jk(dm, hermi=1, with_j=False)[1] vk = mydf.get_jk(dm, hermi=1, with_j=False)[1] - assert abs(vk.get() - kref).max() < 1e-12 + assert abs(vk.get() - kref).max() < 1e-8 - @unittest.skip('build_k from MO coefficients') def test_get_k3(self): kpts = cell.make_kpts([6,1,1]) nkpts = len(kpts) @@ -197,12 +225,12 @@ def test_get_k3(self): mo = (np.random.random((nkpts,nao,nocc)) + np.random.random((nkpts,nao,nocc))*1j) mo_occ = np.ones((nkpts,nocc)) - dm = np.random.rand(nkpts, nao, nao) + dm = np.einsum('kpi,kqi->kpq', mo, mo.conj()) dm = lib.tag_array(dm, mo_coeff=mo, mo_occ=mo_occ) kref = mydf0.get_jk(dm, hermi=1, with_j=False)[1] vk = mydf.get_jk(dm, hermi=1, with_j=False)[1] - assert abs(vk.get() - kref).max() < 1e-12 + assert abs(vk.get() - kref).max() < 1e-8 if __name__ == '__main__': print("Full Tests for PBC DF") diff --git a/gpu4pyscf/pbc/df/tests/test_pbc_ft_ao.py b/gpu4pyscf/pbc/df/tests/test_pbc_ft_ao.py index 55646945..ee77c401 100644 --- a/gpu4pyscf/pbc/df/tests/test_pbc_ft_ao.py +++ b/gpu4pyscf/pbc/df/tests/test_pbc_ft_ao.py @@ -1,4 +1,4 @@ -# Copyright 2021-2024 The PySCF Developers. All Rights Reserved. +# Copyright 2024-2025 The PySCF Developers. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/gpu4pyscf/pbc/df/tests/test_pbc_int3c2e.py b/gpu4pyscf/pbc/df/tests/test_pbc_int3c2e.py new file mode 100644 index 00000000..3238806a --- /dev/null +++ b/gpu4pyscf/pbc/df/tests/test_pbc_int3c2e.py @@ -0,0 +1,153 @@ +# Copyright 2024-2025 The PySCF Developers. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
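+
+'''
+Tests for the short-range PBC 3c2e GPU integrals (sr_aux_e2), validated
+against reference integrals from the CPU _RSGDFBuilder in pyscf.
+'''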
+ +import unittest +import numpy as np +import pyscf +from pyscf import lib +from pyscf.pbc.df import rsdf_builder +from gpu4pyscf.pbc.df.int3c2e import sr_aux_e2 + + +def test_int3c2e_gamma_point(): + cell = pyscf.M( + atom='''C1 1.3 .2 .3 + C2 .19 .1 1.1 + ''', + basis={'C1': [[3, [1.1, 1.]], + [4, [2., 1.]]], + 'C2': 'ccpvdz'}, + precision = 1e-8, + a=np.diag([2.5, 1.9, 2.2])*3) + + auxcell = cell.copy() + auxcell.basis = { + 'C1':''' +C P + 102.9917624900 1.0000000000 +C P + 28.1325940100 1.0000000000 +C P + 9.8364318200 1.0000000000 +C P + 3.3490545000 1.0000000000 +C P + 1.4947618600 1.0000000000 +C P + 0.5769010900 1.0000000000 +C D + 0.1995412500 1.0000000000 ''', + 'C2':[[0, [.5, 1.]]], + } + auxcell.build() + omega = -0.2 + dat = sr_aux_e2(cell, auxcell, omega).get() + + cell.precision=1e-10 + cell.build() + df = rsdf_builder._RSGDFBuilder(cell, auxcell).build(omega=abs(omega)) + int3c = df.gen_int3c_kernel('int3c2e', aosym='s1', return_complex=True) + ref = int3c().reshape(dat.shape) + assert abs(dat - ref).max() < 1e-8 + +def test_int3c2e_kpoints(): + cell = pyscf.M( + atom='''H1 1.3 .2 .3 + H2 .19 .1 1.1 + ''', + basis='ccpvdz', + precision = 1e-8, + a=np.diag([2.5, 1.9, 2.2])*4) + auxcell = cell.copy() + auxcell.basis = [[0, [3.5, 1.]], + [0, [1.1, 1.]], + [1, [0.7, 1.]], + [2, [1.5, 1.]]] + auxcell.build() + kpts = cell.make_kpts([5,1,1]) + omega = -0.2 + dat = sr_aux_e2(cell, auxcell, omega, kpts).get() + + cell.precision=1e-10 + cell.build() + df = rsdf_builder._RSGDFBuilder(cell, auxcell, kpts).build(omega=abs(omega)) + int3c = df.gen_int3c_kernel('int3c2e', aosym='s1', return_complex=True) + ref = int3c().reshape(dat.shape) + assert abs(dat - ref).max() < 1e-8 + +def test_minor_diffused_basis(): + cell = pyscf.M( + atom='''H 1.3 .2 .3 + H .19 .1 1.1 + ''', + basis=''' +C S + 7.5 0.40 + 2.6 0.90 + 0.5 0.08''', + precision = 1e-8, + a=np.diag([2.5, 1.9, 2.2])*3) + auxcell = cell.copy() + auxcell.basis = ''' +C P + 1.4947618600 1.0000000000 +C P + 0.5769010900 1.0000000000 +C D + 0.1995412500 1.0000000000 ''' + auxcell.build() + omega = -0.2 + dat = sr_aux_e2(cell, auxcell, omega).get() + + cell.precision=1e-12 + cell.build() + df = rsdf_builder._RSGDFBuilder(cell, auxcell).build(omega=abs(omega)) + int3c = df.gen_int3c_kernel('int3c2e', aosym='s1', return_complex=True) + ref = int3c().reshape(dat.shape) + assert abs(dat - ref).max() < 1e-8 + +def test_ignorable_diffused_basis(): + cell = pyscf.M( + atom='''H 1.3 .2 .3 + H .19 .1 1.1 + ''', + basis=''' +C S + 7.5 0.4000000 + 2.6 0.9000000 + 0.5 0.0000002''', + precision = 1e-8, + a=np.diag([2.5, 1.9, 2.2])*3) + auxcell = cell.copy() + auxcell.basis = ''' +C P + 1.4947618600 1.0000000000 +C P + 0.5769010900 1.0000000000 +C D + 0.1995412500 1.0000000000 ''' + auxcell.build() + omega = -0.2 + cell.verbose = 6 + dat = sr_aux_e2(cell, auxcell, omega).get() + + cell.basis=''' +C S + 7.5 0.4000000 + 2.6 0.9000000''' + cell.build() + df = rsdf_builder._RSGDFBuilder(cell, auxcell).build(omega=abs(omega)) + int3c = df.gen_int3c_kernel('int3c2e', aosym='s1', return_complex=True) + ref = int3c().reshape(dat.shape) + assert abs(dat - ref).max() < 1e-6 diff --git a/gpu4pyscf/pbc/df/tests/test_rsdf_builder.py b/gpu4pyscf/pbc/df/tests/test_rsdf_builder.py new file mode 100644 index 00000000..0d77cfb0 --- /dev/null +++ b/gpu4pyscf/pbc/df/tests/test_rsdf_builder.py @@ -0,0 +1,177 @@ +# Copyright 2024-2025 The PySCF Developers. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import tempfile +import numpy as np +import pyscf +from pyscf.pbc.df.rsdf_builder import _RSGDFBuilder +from pyscf.pbc.df.df import _load3c +from gpu4pyscf.pbc.df.rsdf_builder import build_cderi + +def test_gamma_point(): + cell = pyscf.M( + atom='''C1 1.3 .2 .3 + C2 .19 .1 1.1 + ''', + basis={'C1': [[0, [1.1, 1.]], + [1, [2., 1.]]], + 'C2': 'ccpvdz'}, + a=np.diag([2.5, 1.9, 2.2])*3) + + auxcell = cell.copy() + auxcell.basis = { + 'C1':''' +C S + 12.9917624900 1.0000000000 +C S + 2.1325940100 1.0000000000 +C P + 9.8364318200 1.0000000000 +C P + 3.3490545000 1.0000000000 +C P + 1.4947618600 1.0000000000 +C P + 0.5769010900 1.0000000000 +C D + 0.1995412500 1.0000000000 ''', + 'C2':[[0, [.5, 1.]]], + } + auxcell.build() + omega = 0.3 + gpu_dat, dat_neg = build_cderi(cell, auxcell, kpts=None, omega=omega) + + cell.precision = 1e-10 + auxcell.precision = 1e-10 + kpts = cell.make_kpts([1,1,1]) + dfbuilder = _RSGDFBuilder(cell, auxcell, kpts) + dfbuilder.omega = omega + dfbuilder.j2c_eig_always = False + dfbuilder.fft_dd_block = True + dfbuilder.exclude_d_aux = True + naux = auxcell.nao + nao = cell.nao + with tempfile.NamedTemporaryFile() as tmpf: + dfbuilder.make_j3c(tmpf.name, aosym='s1') + with _load3c(tmpf.name, 'j3c', kpts[[0,0]]) as cderi: + ref = abs(cderi[:].reshape(naux,nao,nao)) + dat = abs(gpu_dat[0,0].get()) + assert abs(dat - ref).max() < 1e-8 + +def test_kpts(): + cell = pyscf.M( + atom='''C1 1.3 .2 .3 + C2 .19 .1 1.1 + ''', + basis={'C1': [[0, [1.1, 1.]], + [1, [2., 1.]]], + 'C2': 'ccpvdz'}, + a=np.diag([2.5, 1.9, 2.2])*3) + + auxcell = cell.copy() + auxcell.basis = { + 'C1':''' +C S + 12.9917624900 1.0000000000 +C S + 2.1325940100 1.0000000000 +C P + 9.8364318200 1.0000000000 +C P + 3.3490545000 1.0000000000 +C P + 1.4947618600 1.0000000000 +C P + 0.5769010900 1.0000000000 +C D + 0.1995412500 1.0000000000 ''', + 'C2':[[0, [.5, 1.]]], + } + auxcell.build() + omega = 0.3 + kmesh = [6,1,1] + kpts = cell.make_kpts(kmesh) + gpu_dat, dat_neg = build_cderi(cell, auxcell, kpts, omega=omega) + + cell.precision = 1e-10 + auxcell.precision = 1e-10 + dfbuilder = _RSGDFBuilder(cell, auxcell, kpts) + dfbuilder.omega = omega + dfbuilder.j2c_eig_always = False + dfbuilder.fft_dd_block = True + dfbuilder.exclude_d_aux = True + naux = auxcell.nao + nao = cell.nao + with tempfile.NamedTemporaryFile() as tmpf: + dfbuilder.make_j3c(tmpf.name, aosym='s1') + for ki, kj in gpu_dat: + with _load3c(tmpf.name, 'j3c', kpts[[ki,kj]]) as cderi: + ref = abs(cderi[:].reshape(naux,nao,nao)) + dat = abs(gpu_dat[ki,kj].get()) + print(ki,kj) + assert abs(dat - ref).max() < 1e-8 + +def test_kpts_j_only(): + cell = pyscf.M( + atom='''C1 1.3 .2 .3 + C2 .19 .1 1.1 + ''', + basis={'C1': [[0, [1.1, 1.]], + [1, [2., 1.]]], + 'C2': 'ccpvdz'}, + a=np.diag([2.5, 1.9, 2.2])*3) + + auxcell = cell.copy() + auxcell.basis = { + 'C1':''' +C S + 12.9917624900 1.0000000000 +C S + 2.1325940100 1.0000000000 +C P + 9.8364318200 1.0000000000 +C P + 3.3490545000 1.0000000000 +C P + 
1.4947618600 1.0000000000 +C P + 0.5769010900 1.0000000000 +C D + 0.1995412500 1.0000000000 ''', + 'C2':[[0, [.5, 1.]]], + } + auxcell.build() + omega = 0.3 + kmesh = [1,3,4] + kpts = cell.make_kpts(kmesh) + gpu_dat, dat_neg = build_cderi(cell, auxcell, kpts, omega=omega, j_only=True) + + cell.precision = 1e-10 + auxcell.precision = 1e-10 + dfbuilder = _RSGDFBuilder(cell, auxcell, kpts) + dfbuilder.j_only = True + dfbuilder.omega = omega + dfbuilder.j2c_eig_always = False + dfbuilder.fft_dd_block = True + dfbuilder.exclude_d_aux = True + naux = auxcell.nao + nao = cell.nao + with tempfile.NamedTemporaryFile() as tmpf: + dfbuilder.make_j3c(tmpf.name, aosym='s1', j_only=True) + for ki, kj in gpu_dat: + with _load3c(tmpf.name, 'j3c', kpts[[ki,kj]]) as cderi: + ref = abs(cderi[:].reshape(naux,nao,nao)) + dat = abs(gpu_dat[ki,kj].get()) + print(ki,kj) + assert abs(dat - ref).max() < 1e-8 diff --git a/gpu4pyscf/pbc/dft/gen_grid.py b/gpu4pyscf/pbc/dft/gen_grid.py index 8cac0d01..66b362d2 100644 --- a/gpu4pyscf/pbc/dft/gen_grid.py +++ b/gpu4pyscf/pbc/dft/gen_grid.py @@ -16,10 +16,14 @@ import numpy as np import cupy as cp from pyscf import lib -from pyscf.lib import logger from pyscf.pbc.dft import gen_grid as gen_grid_cpu from pyscf.pbc.gto.cell import get_uniform_grids -from gpu4pyscf.lib import utils +from gpu4pyscf.dft import Grids +from gpu4pyscf.lib import utils, logger + +__all__ = [ + 'UniformGrids', 'BeckeGrids', 'AtomicGrids' +] class UniformGrids(lib.StreamObject): '''Uniform Grid class.''' @@ -66,8 +70,31 @@ def size(self): kernel = gen_grid_cpu.UniformGrids.kernel to_gpu = utils.to_gpu - device = utils.device to_cpu = utils.to_cpu -class BeckeGrids: - pass + +class BeckeGrids(Grids): + '''Atomic grids for all-electron calculation.''' + def __init__(self, cell): + self.cell = cell + Grids.__init__(self, cell) + + def build(self, cell=None, with_non0tab=False): + if cell is None: cell = self.cell + coords, weights = gen_grid_cpu.get_becke_grids( + self.cell, self.atom_grid, radi_method=self.radi_method, + level=self.level, prune=self.prune) + self.coords = cp.asarray(coords) + self.weights = cp.asarray(weights) + if with_non0tab: + raise NotImplementedError + self.non0tab = None + logger.info(self, 'tot grids = %d', len(self.weights)) + logger.info(self, 'cell vol = %.9g sum(weights) = %.9g', + cell.vol, self.weights.sum()) + return self + + to_gpu = utils.to_gpu + to_cpu = utils.to_cpu + +AtomicGrids = BeckeGrids diff --git a/gpu4pyscf/pbc/dft/krks.py b/gpu4pyscf/pbc/dft/krks.py index c5fefb7f..c4fa0245 100644 --- a/gpu4pyscf/pbc/dft/krks.py +++ b/gpu4pyscf/pbc/dft/krks.py @@ -47,7 +47,7 @@ def get_veff(ks, cell=None, dm=None, dm_last=0, vhf_last=0, hermi=1, n, exc, vxc = multigrid.nr_rks(ks.with_df, ks.xc, dm, hermi, kpts, kpts_band, with_j=True, return_j=False) - log.info('nelec by numeric integration = %s', n) + log.debug('nelec by numeric integration = %s', n) t0 = log.timer('vxc', *t0) return vxc @@ -61,7 +61,7 @@ def get_veff(ks, cell=None, dm=None, dm_last=0, vhf_last=0, hermi=1, max_memory = ks.max_memory - lib.current_memory()[0] n, exc, vxc = ni.nr_rks(cell, ks.grids, ks.xc, dm, 0, hermi, kpts, kpts_band, max_memory=max_memory) - log.info('nelec by numeric integration = %s', n) + log.debug('nelec by numeric integration = %s', n) if ks.do_nlc(): if ni.libxc.is_nlc(ks.xc): xc = ks.xc @@ -72,7 +72,7 @@ def get_veff(ks, cell=None, dm=None, dm_last=0, vhf_last=0, hermi=1, max_memory=max_memory) exc += enlc vxc += vnlc - log.info('nelec with nlc grids = %s', n) + log.debug('nelec 
with nlc grids = %s', n) t0 = log.timer('vxc', *t0) nkpts = len(kpts) @@ -140,6 +140,14 @@ def energy_elec(mf, dm_kpts=None, h1e_kpts=None, vhf=None): ecoul.imag) return tot_e.real, ecoul.real + exc.real +def get_rho(mf, dm=None, grids=None, kpts=None): + if dm is None: dm = mf.make_rdm1() + if grids is None: grids = mf.grids + if kpts is None: kpts = mf.kpts + assert dm.ndim == 3 + assert kpts.ndim == 2 + return mf._numint.get_rho(mf.cell, dm, grids, kpts) + class KRKS(rks.KohnShamDFT, khf.KRHF): '''RKS class adapted for PBCs with k-point sampling. ''' @@ -151,7 +159,7 @@ def __init__(self, cell, kpts=np.zeros((1,3)), xc='LDA,VWN', exxdiv='ewald'): dump_flags = krks_cpu.KRKS.dump_flags get_veff = get_veff energy_elec = energy_elec - get_rho = return_cupy_array(krks_cpu.get_rho) + get_rho = get_rho nuc_grad_method = NotImplemented to_hf = NotImplemented diff --git a/gpu4pyscf/pbc/dft/kuks.py b/gpu4pyscf/pbc/dft/kuks.py index 363bfefd..fad45cbd 100644 --- a/gpu4pyscf/pbc/dft/kuks.py +++ b/gpu4pyscf/pbc/dft/kuks.py @@ -28,7 +28,7 @@ from gpu4pyscf.lib import logger, utils from gpu4pyscf.lib.cupy_helper import return_cupy_array, tag_array from gpu4pyscf.pbc.scf import khf, kuhf -from gpu4pyscf.pbc.dft import rks +from gpu4pyscf.pbc.dft import rks, krks def get_veff(ks, cell=None, dm=None, dm_last=0, vhf_last=0, hermi=1, kpts=None, kpts_band=None): @@ -47,7 +47,7 @@ def get_veff(ks, cell=None, dm=None, dm_last=0, vhf_last=0, hermi=1, n, exc, vxc = multigrid.nr_uks(ks.with_df, ks.xc, dm, hermi, kpts, kpts_band, with_j=True, return_j=False) - log.info('nelec by numeric integration = %s', n) + log.debug('nelec by numeric integration = %s', n) t0 = log.timer('vxc', *t0) return vxc @@ -71,7 +71,7 @@ def get_veff(ks, cell=None, dm=None, dm_last=0, vhf_last=0, hermi=1, 0, hermi, kpts, max_memory=max_memory) exc += enlc vxc += vnlc - log.info('nelec by numeric integration = %s', n) + log.debug('nelec by numeric integration = %s', n) t0 = log.timer('vxc', *t0) nkpts = len(kpts) @@ -150,7 +150,10 @@ def __init__(self, cell, kpts=np.zeros((1,3)), xc='LDA,VWN', exxdiv='ewald'): get_veff = get_veff energy_elec = energy_elec - get_rho = return_cupy_array(kuks_cpu.get_rho) + + def get_rho(self, dm=None, grids=None, kpts=None): + if dm is None: dm = self.make_rdm1() + return krks.get_rho(self, dm[0]+dm[1], grids, kpts) nuc_grad_method = NotImplemented to_hf = NotImplemented diff --git a/gpu4pyscf/pbc/dft/numint.py b/gpu4pyscf/pbc/dft/numint.py index ea9e83cd..f064f664 100644 --- a/gpu4pyscf/pbc/dft/numint.py +++ b/gpu4pyscf/pbc/dft/numint.py @@ -90,17 +90,17 @@ def eval_rho(cell, ao, dm, non0tab=None, xctype='LDA', hermi=0, with_lapl=False, pyscf.dft.numint.eval_rho ''' - if np.iscomplexobj(ao) or np.iscomplexobj(dm): + if cp.iscomplexobj(ao) or cp.iscomplexobj(dm): ngrids, nao = ao.shape[-2:] ao_loc = cell.ao_loc_nr() assert nao == ao_loc[-1] dm = cp.asarray(dm, dtype=np.complex128) + ao = cp.asarray(ao, dtype=np.complex128) if hermi == 1: def dot_bra(bra, aodm): - rho = contract('pi,pi->p', bra.real, aodm.real) - rho += contract('pi,pi->p', bra.imag, aodm.imag) - return rho + rho = contract('pi,pi->p', bra.conj(), aodm).real + return cp.asarray(rho, order='C') dtype = np.float64 else: def dot_bra(bra, aodm): @@ -147,6 +147,7 @@ def dot_bra(bra, aodm): ngrids, nao = ao.shape[-2:] ao_loc = cell.ao_loc_nr() assert nao == ao_loc[-1] + assert ao.dtype == dm.dtype def dot_bra(bra, aodm): return contract('pi,pi->p', bra, aodm) @@ -378,13 +379,12 @@ def _tau_dot(bra, ket, wv): return mat -#TODO: put NumInt and 
KNumInt into one class KNumInt(lib.StreamObject, numint.LibXCMixin): eval_ao = staticmethod(eval_ao_kpts) make_mask = NotImplemented - def get_rho(self, cell, dm, grids, kpts=np.zeros((1,3)), max_memory=2000): + def get_rho(self, cell, dm, grids, kpts=np.zeros((1,3))): '''Density in real space ''' kpts = kpts.reshape(-1, 3) @@ -445,7 +445,7 @@ def block_loop(self, cell, grids, deriv=0, kpts=None): for ip0, ip1 in lib.prange(0, ngrids, blksize): coords = grids_coords[ip0:ip1] weight = grids_weights[ip0:ip1] - ao_ks = eval_ao_kpts(cell, coords, kpts, deriv=deriv) + ao_ks = self.eval_ao(cell, coords, kpts, deriv=deriv) yield ao_ks, weight, coords ao_ks = None diff --git a/gpu4pyscf/pbc/dft/rks.py b/gpu4pyscf/pbc/dft/rks.py index c6c93b24..fbc35f51 100644 --- a/gpu4pyscf/pbc/dft/rks.py +++ b/gpu4pyscf/pbc/dft/rks.py @@ -73,7 +73,7 @@ def get_veff(ks, cell=None, dm=None, dm_last=0, vhf_last=0, hermi=1, else: n, exc, vxc = ni.nr_rks(cell, ks.grids, ks.xc, dm, 0, hermi, kpt, kpts_band) - log.info('nelec by numeric integration = %s', n) + log.debug('nelec by numeric integration = %s', n) if ks.do_nlc(): if ni.libxc.is_nlc(ks.xc): xc = ks.xc @@ -83,7 +83,7 @@ def get_veff(ks, cell=None, dm=None, dm_last=0, vhf_last=0, hermi=1, n, enlc, vnlc = ni.nr_nlc_vxc(cell, ks.nlcgrids, xc, dm, 0, hermi, kpt) exc += enlc vxc += vnlc - log.info('nelec with nlc grids = %s', n) + log.debug('nelec with nlc grids = %s', n) t0 = log.timer('vxc', *t0) if not hybrid: @@ -122,8 +122,18 @@ def get_veff(ks, cell=None, dm=None, dm_last=0, vhf_last=0, hermi=1, vxc = tag_array(vxc, ecoul=ecoul, exc=exc, vj=None, vk=None) return vxc -def prune_small_rho_grids_(ks, cell, dm, grids, kpts): - raise NotImplementedError +NELEC_ERROR_TOL = getattr(__config__, 'pbc_dft_rks_prune_error_tol', 0.02) +def prune_small_rho_grids_(mf, cell, dm, grids, kpts): + rho = mf.get_rho(dm, grids, kpts) + n = rho.dot(grids.weights) + if abs(n-cell.nelectron) < NELEC_ERROR_TOL*n: + rho *= grids.weights + size0 = grids.weights.size + idx = abs(rho) > mf.small_rho_cutoff / size0 + grids.coords = grids.coords [idx] + grids.weights = grids.weights[idx] + logger.debug(mf, 'Drop grids %d', size0 - grids.weights.size) + return grids class KohnShamDFT(mol_ks.KohnShamDFT): '''PBC-KS''' @@ -148,9 +158,21 @@ def __init__(self, xc='LDA,VWN'): dump_flags = rks_cpu.KohnShamDFT.dump_flags get_veff = NotImplemented - get_rho = return_cupy_array(rks_cpu.get_rho) + get_rho = NotImplemented + + def density_fit(self, auxbasis=None, with_df=None): + from gpu4pyscf.pbc.df.df_jk import density_fit + cell = self.cell + mf = density_fit(self, auxbasis, with_df) + mf.with_df._j_only = not self._numint.libxc.is_hybrid_xc(self.xc) + mf.grids = gen_grid.BeckeGrids(cell) + mf.grids.level = getattr( + __config__, 'dft_rks_RKS_grids_level', mf.grids.level) + mf.nlcgrids = gen_grid.BeckeGrids(cell) + mf.nlcgrids.level = getattr( + __config__, 'dft_rks_RKS_nlcgrids_level', mf.nlcgrids.level) + return mf - density_fit = NotImplemented rs_density_fit = NotImplemented jk_method = NotImplemented @@ -164,7 +186,7 @@ def initialize_grids(self, cell, dm, kpts, ground_state=True): '''Initialize self.grids the first time call get_veff''' if self.grids.coords is None: t0 = (logger.process_clock(), logger.perf_counter()) - self.grids.build(with_non0tab=True) + self.grids.build() if (isinstance(self.grids, gen_grid.BeckeGrids) and self.small_rho_cutoff > 1e-20 and ground_state): self.grids = prune_small_rho_grids_( @@ -173,7 +195,7 @@ def initialize_grids(self, cell, dm, kpts, 
ground_state=True): is_nlc = self.do_nlc() if is_nlc and self.nlcgrids.coords is None: t0 = (logger.process_clock(), logger.perf_counter()) - self.nlcgrids.build(with_non0tab=True) + self.nlcgrids.build() if (isinstance(self.grids, gen_grid.BeckeGrids) and self.small_rho_cutoff > 1e-20 and ground_state): self.nlcgrids = prune_small_rho_grids_( @@ -185,6 +207,14 @@ def initialize_grids(self, cell, dm, kpts, ground_state=True): pbchf.KohnShamDFT = KohnShamDFT +def get_rho(mf, dm=None, grids=None, kpt=None): + if dm is None: dm = mf.make_rdm1() + if grids is None: grids = mf.grids + if kpt is None: kpt = mf.kpt + assert dm.ndim == 2 + assert kpt.ndim == 1 + return mf._numint.get_rho(mf.cell, dm[None], grids, kpt[None]) + class RKS(KohnShamDFT, pbchf.RHF): '''RKS class adapted for PBCs. @@ -203,6 +233,7 @@ def dump_flags(self, verbose=None): get_veff = get_veff energy_elec = mol_ks.energy_elec + get_rho = get_rho to_gpu = utils.to_gpu device = utils.device diff --git a/gpu4pyscf/pbc/dft/tests/test_pbc_rks.py b/gpu4pyscf/pbc/dft/tests/test_pbc_rks.py index cc60be8f..17e1451f 100644 --- a/gpu4pyscf/pbc/dft/tests/test_pbc_rks.py +++ b/gpu4pyscf/pbc/dft/tests/test_pbc_rks.py @@ -154,6 +154,31 @@ def test_kpts_rsh_fft(self): mf_ref = kmf.to_cpu().run() self.assertAlmostEqual(kmf.e_tot, mf_ref.e_tot, 7) + def test_kpts_gga_gdf(self): + from gpu4pyscf.pbc.df.df import GDF + L = 4. + cell = pbcgto.Cell() + cell.a = np.eye(3)*L + cell.atom =[['H' , ( L/2+0., L/2+0. , L/2+1.)], + ['H' , ( L/2+1., L/2+0. , L/2+1.)]] + cell.basis = [[0, (4.0, 1.0)], [0, (1.0, 1.0)]] + cell.build() + + mf = cell.RKS(xc='pbe0').to_gpu().density_fit().run() + self.assertTrue(isinstance(mf.with_df, GDF)) + self.assertAlmostEqual(mf.e_tot, -0.44834992009430463, 7) + mf_ref = mf.to_cpu().run() + self.assertAlmostEqual(mf.e_tot, mf_ref.e_tot, 7) + + nk = [2, 1, 1] + kpts = cell.make_kpts(nk) + kmf = pbcdft.KRKS(cell, xc='pbe0', kpts=kpts).density_fit().run() + self.assertTrue(isinstance(kmf.with_df, GDF)) + self.assertAlmostEqual(kmf.e_tot, -0.44429306, 7) + mf_ref = kmf.to_cpu() + mf_ref.run() + self.assertAlmostEqual(kmf.e_tot, mf_ref.e_tot, 7) + if __name__ == '__main__': print("Full Tests for pbc.dft.rks") unittest.main() diff --git a/gpu4pyscf/pbc/dft/tests/test_pbc_uks.py b/gpu4pyscf/pbc/dft/tests/test_pbc_uks.py index 5848038c..2b73dfb2 100644 --- a/gpu4pyscf/pbc/dft/tests/test_pbc_uks.py +++ b/gpu4pyscf/pbc/dft/tests/test_pbc_uks.py @@ -68,6 +68,7 @@ def test_gga_fft(self): def test_rsh_fft(self): mf = pbcdft.UKS(cell, xc='camb3lyp').run(conv_tol=1e-9) + self.assertAlmostEqual(mf.e_tot, -4.350842690091271, 7) mf_ref = mf.to_cpu().run() self.assertAlmostEqual(mf.e_tot, mf_ref.e_tot, 7) @@ -153,6 +154,32 @@ def test_kpts_rsh_fft(self): mf_ref = kmf.to_cpu().run() self.assertAlmostEqual(kmf.e_tot, mf_ref.e_tot, 7) + def test_kpts_gga_gdf(self): + from gpu4pyscf.pbc.df.df import GDF + L = 4. + cell = pbcgto.Cell() + cell.a = np.eye(3)*L + cell.atom =[['H' , ( L/2+0., L/2+0. , L/2+1.)], + ['H' , ( L/2+1., L/2+0. 
, L/2+1.)]] + cell.basis = [[0, (4.0, 1.0)], [0, (1.0, 1.0)]] + cell.spin = 2 + cell.build() + + mf = cell.UKS(xc='pbe0').to_gpu().density_fit().run() + self.assertTrue(isinstance(mf.with_df, GDF)) + self.assertAlmostEqual(mf.e_tot, -0.10443638, 7) + mf_ref = mf.to_cpu().run() + self.assertAlmostEqual(mf.e_tot, mf_ref.e_tot, 7) + + nk = [2, 1, 1] + kpts = cell.make_kpts(nk) + kmf = pbcdft.KUKS(cell, xc='pbe0', kpts=kpts).density_fit().run() + self.assertTrue(isinstance(kmf.with_df, GDF)) + self.assertAlmostEqual(kmf.e_tot, -0.19581151, 7) + mf_ref = kmf.to_cpu() + mf_ref.run() + self.assertAlmostEqual(kmf.e_tot, mf_ref.e_tot, 7) + if __name__ == '__main__': print("Full Tests for pbc.dft.uks") unittest.main() diff --git a/gpu4pyscf/pbc/dft/uks.py b/gpu4pyscf/pbc/dft/uks.py index 8ce4466e..1cd2f976 100644 --- a/gpu4pyscf/pbc/dft/uks.py +++ b/gpu4pyscf/pbc/dft/uks.py @@ -52,7 +52,7 @@ def get_veff(ks, cell=None, dm=None, dm_last=0, vhf_last=0, hermi=1, n, exc, vxc = multigrid.nr_uks(ks.with_df, ks.xc, dm, hermi, kpt.reshape(1,3), kpts_band, with_j=True, return_j=False) - log.info('nelec by numeric integration = %s', n) + log.debug('nelec by numeric integration = %s', n) t0 = log.timer('vxc', *t0) return vxc @@ -79,7 +79,7 @@ def get_veff(ks, cell=None, dm=None, dm_last=0, vhf_last=0, hermi=1, 0, hermi, kpt, max_memory=max_memory) exc += enlc vxc += vnlc - log.info('nelec by numeric integration = %s', n) + log.debug('nelec by numeric integration = %s', n) t0 = log.timer('vxc', *t0) if not hybrid: @@ -134,10 +134,13 @@ def __init__(self, cell, kpt=np.zeros(3), xc='LDA,VWN', exxdiv='ewald'): dump_flags = uks_cpu.UKS.dump_flags - get_rho = return_cupy_array(uks_cpu.get_rho) get_veff = get_veff energy_elec = mol_uks.energy_elec + def get_rho(self, dm=None, grids=None, kpt=None): + if dm is None: dm = self.make_rdm1() + return rks.get_rho(self, dm[0]+dm[1], grids, kpt) + nuc_grad_method = NotImplemented to_hf = NotImplemented diff --git a/gpu4pyscf/pbc/gto/__init__.py b/gpu4pyscf/pbc/gto/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/gpu4pyscf/pbc/gto/cell.py b/gpu4pyscf/pbc/gto/cell.py new file mode 100644 index 00000000..14df0ff9 --- /dev/null +++ b/gpu4pyscf/pbc/gto/cell.py @@ -0,0 +1,49 @@ +# Copyright 2025 The PySCF Developers. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import numpy as np + +# This function is only available in pyscf-2.8 or later +def extract_pgto_params(cell, op='diffused'): + '''A helper function to extract exponents and contraction coefficients for + estimate_xxx function + ''' + es = [] + cs = [] + if op == 'diffused': + precision = cell.precision + for i in range(cell.nbas): + e = cell.bas_exp(i) + c = abs(cell._libcint_ctr_coeff(i)).max(axis=1) + l = cell.bas_angular(i) + # A quick estimation for the radius that each primitive GTO vanishes + r2 = np.log(c**2 / precision * 10**l) / e + idx = r2.argmax() + es.append(e[idx]) + cs.append(c[idx].max()) + elif op == 'compact': + precision = cell.precision + for i in range(cell.nbas): + e = cell.bas_exp(i) + c = abs(cell._libcint_ctr_coeff(i)).max(axis=1) + l = cell.bas_angular(i) + # A quick estimation for the resolution of planewaves that each + # primitive GTO requires + ke = np.log(c**2 / precision * 50**l) * e + idx = ke.argmax() + es.append(e[idx]) + cs.append(c[idx].max()) + else: + raise RuntimeError(f'Unsupported operation {op}') + return np.array(es), np.array(cs) diff --git a/gpu4pyscf/pbc/lib/kpts_helper.py b/gpu4pyscf/pbc/lib/kpts_helper.py index 9b85184b..6a3d0334 100644 --- a/gpu4pyscf/pbc/lib/kpts_helper.py +++ b/gpu4pyscf/pbc/lib/kpts_helper.py @@ -13,6 +13,7 @@ # limitations under the License. import numpy as np +from pyscf import lib def conj_images_in_bvk_cell(kmesh, return_pair=False): ''' @@ -42,3 +43,40 @@ def conj_images_in_bvk_cell(kmesh, return_pair=False): mask = Ls_idx <= Ls_idx_conj return np.column_stack((Ls_idx[mask], Ls_idx_conj[mask])) +def kk_adapted_iter(kmesh): + '''Generates kpt which is adapted to the kpt_p in (ij|p) + + This function provides the similar functionality as the + pyscf.pbc.lib.kpts_helper.kk_adapted_iter . 
+ ''' + kmesh = np.asarray(kmesh) + nkpts = np.prod(kmesh) + nx, ny, nz = kmesh + kx = np.fft.fftfreq(nx, 1./nx).astype(int) + ky = np.fft.fftfreq(ny, 1./ny).astype(int) + kz = np.fft.fftfreq(nz, 1./nz).astype(int) + + kxyz = lib.cartesian_prod([kx, ky, kz]) + dk = (kxyz[None,:,:] - kxyz[:,None,:]).reshape(-1, 3) + + dk %= kmesh + wrap_around_mask = dk >= (kmesh+1)//2 + dk[wrap_around_mask[:,0],0] -= nx + dk[wrap_around_mask[:,1],1] -= ny + dk[wrap_around_mask[:,2],2] -= nz + uniq_ks, uniq_index, uniq_inverse = np.unique( + dk, axis=0, return_index=True, return_inverse=True) + + ks_conj = -uniq_ks + strides = np.array((ny*nz, nz, 1)) + ks_idx = (uniq_ks % kmesh).dot(strides) + ks_idx_conj = (ks_conj % kmesh).dot(strides) + + independent_idx = np.sort(np.nonzero(ks_idx <= ks_idx_conj)[0]) + for x in independent_idx: + kp = ks_idx[x] + kp_conj = ks_idx_conj[x] + kpt_ij_idx = np.where(uniq_inverse == x)[0] + kpti_idx = kpt_ij_idx // nkpts + kptj_idx = kpt_ij_idx % nkpts + yield kp, kp_conj, kpti_idx, kptj_idx diff --git a/gpu4pyscf/pbc/scf/hf.py b/gpu4pyscf/pbc/scf/hf.py index 740f76a5..3aec403d 100644 --- a/gpu4pyscf/pbc/scf/hf.py +++ b/gpu4pyscf/pbc/scf/hf.py @@ -240,11 +240,16 @@ class RHF(SCF): to_gpu = utils.to_gpu device = utils.device + def density_fit(self, auxbasis=None, with_df=None): + from gpu4pyscf.pbc.df.df_jk import density_fit + return density_fit(self, auxbasis, with_df) + def to_cpu(self): mf = hf_cpu.RHF(self.cell) utils.to_cpu(self, out=mf) return mf + def _format_jks(vj, dm, kpts_band): if kpts_band is None: vj = vj.reshape(dm.shape) diff --git a/gpu4pyscf/pbc/scf/khf.py b/gpu4pyscf/pbc/scf/khf.py index d4c7855e..4ec72d98 100644 --- a/gpu4pyscf/pbc/scf/khf.py +++ b/gpu4pyscf/pbc/scf/khf.py @@ -399,6 +399,8 @@ def get_init_guess(self, cell=None, key='minao', s1e=None): dm_kpts *= (nelectron / ne).reshape(-1,1,1) return dm_kpts + density_fit = pbchf.RHF.density_fit + to_gpu = utils.to_gpu device = utils.device diff --git a/gpu4pyscf/pbc/scf/kuhf.py b/gpu4pyscf/pbc/scf/kuhf.py index 7e82d932..d63396c7 100644 --- a/gpu4pyscf/pbc/scf/kuhf.py +++ b/gpu4pyscf/pbc/scf/kuhf.py @@ -38,8 +38,9 @@ def make_rdm1(mo_coeff_kpts, mo_occ_kpts, **kwargs): Returns: dm_kpts : (2, nkpts, nao, nao) ndarray ''' - assert isinstance(mo_occ_kpts, cp.ndarray) - assert isinstance(mo_coeff_kpts, cp.ndarray) + mo_occ_kpts = cp.asarray(mo_occ_kpts) + mo_coeff_kpts = cp.asarray(mo_coeff_kpts) + assert mo_occ_kpts.dtype == np.float64 c = mo_coeff_kpts * mo_occ_kpts[:,:,None,:] dm = contract('nkpi,nkqi->nkpq', mo_coeff_kpts, c.conj()) return tag_array(dm, mo_coeff=mo_coeff_kpts, mo_occ=mo_occ_kpts) @@ -312,6 +313,8 @@ def get_bands(self, kpts_band, cell=None, dm_kpts=None, kpts=None): to_ks = NotImplemented convert_from_ = NotImplemented + density_fit = khf.KRHF.density_fit + to_gpu = utils.to_gpu device = utils.device diff --git a/gpu4pyscf/pbc/scf/tests/test_pbc_scf_hf.py b/gpu4pyscf/pbc/scf/tests/test_pbc_scf_hf.py index ca0810c5..71ae0ef1 100644 --- a/gpu4pyscf/pbc/scf/tests/test_pbc_scf_hf.py +++ b/gpu4pyscf/pbc/scf/tests/test_pbc_scf_hf.py @@ -132,6 +132,28 @@ def test_krhf_bands(self): e_ref = kmf_cpu.get_bands(kpts_bands)[0] self.assertAlmostEqual(abs(e.get()-e_ref).max(), 0, 7) + def test_density_fit(self): + from gpu4pyscf.pbc.df.df import GDF + L = 4. + cell = pbcgto.Cell() + cell.a = np.eye(3)*L + cell.atom =[['H' , ( L/2+0., L/2+0. , L/2+1.)], + ['H' , ( L/2+1., L/2+0. 
, L/2+1.)]] + cell.basis = [[0, (4.0, 1.0)], [0, (1.0, 1.0)]] + cell.build() + + ref = cell.RHF().density_fit().run() + mf = ref.to_gpu().run(conv_tol=1e-8) + self.assertTrue(isinstance(mf.with_df, GDF)) + self.assertAlmostEqual(ref.e_tot, -0.3740002917376214, 8) + self.assertAlmostEqual(mf.e_tot, ref.e_tot, 8) + + ref = cell.KRHF().density_fit().run() + mf = ref.to_gpu().run(conv_tol=1e-8) + self.assertTrue(isinstance(mf.with_df, GDF)) + self.assertAlmostEqual(ref.e_tot, -0.3740002917376214, 8) + self.assertAlmostEqual(mf.e_tot, ref.e_tot, 8) + if __name__ == '__main__': print("Full Tests for pbc.scf.hf") unittest.main() diff --git a/gpu4pyscf/pbc/scf/tests/test_pbc_scf_uhf.py b/gpu4pyscf/pbc/scf/tests/test_pbc_scf_uhf.py index 2f888bdb..b9665f06 100644 --- a/gpu4pyscf/pbc/scf/tests/test_pbc_scf_uhf.py +++ b/gpu4pyscf/pbc/scf/tests/test_pbc_scf_uhf.py @@ -90,6 +90,28 @@ def test_small_system(self): mf = pscf.KUHF(mol,kpts=[[0., 0., 0.]]).run() self.assertAlmostEqual(mf.e_tot, -2.2719576422665635, 8) + def test_density_fit(self): + from gpu4pyscf.pbc.df.df import GDF + L = 4. + cell = pbcgto.Cell() + cell.a = np.eye(3)*L + cell.atom =[['H' , ( L/2+0., L/2+0. , L/2+1.)], + ['H' , ( L/2+1., L/2+0. , L/2+1.)]] + cell.basis = [[0, (4.0, 1.0)], [0, (1.0, 1.0)]] + cell.spin = 2 + cell.build() + + ref = cell.UHF().density_fit().run() + mf = ref.to_gpu().run(conv_tol=1e-8) + self.assertTrue(isinstance(mf.with_df, GDF)) + self.assertAlmostEqual(ref.e_tot, -0.11995733902879813, 8) + self.assertAlmostEqual(mf.e_tot, ref.e_tot, 8) + + ref = cell.UHF().density_fit().run() + mf = ref.to_gpu().run(conv_tol=1e-8) + self.assertTrue(isinstance(mf.with_df, GDF)) + self.assertAlmostEqual(ref.e_tot, -0.11995733902879813, 8) + self.assertAlmostEqual(mf.e_tot, ref.e_tot, 8) if __name__ == '__main__': print("Tests for PBC UHF and PBC KUHF") diff --git a/gpu4pyscf/pbc/scf/uhf.py b/gpu4pyscf/pbc/scf/uhf.py index 5abe6398..65e02ef2 100644 --- a/gpu4pyscf/pbc/scf/uhf.py +++ b/gpu4pyscf/pbc/scf/uhf.py @@ -124,6 +124,8 @@ def get_init_guess(self, cell=None, key='minao', s1e=None): to_ks = NotImplemented convert_from_ = NotImplemented + density_fit = pbchf.RHF.density_fit + to_gpu = utils.to_gpu device = utils.device diff --git a/gpu4pyscf/pbc/tools/k2gamma.py b/gpu4pyscf/pbc/tools/k2gamma.py index 5e0041cf..2de30399 100644 --- a/gpu4pyscf/pbc/tools/k2gamma.py +++ b/gpu4pyscf/pbc/tools/k2gamma.py @@ -18,20 +18,23 @@ import numpy as np from pyscf.lib import logger -# This version of kpts_to_kmesh will be available in PySCF-2.8 -def kpts_to_kmesh(cell, kpts, precision=None, max_images=10000): - '''Find the minimal k-points mesh to include all input kpts''' +# This version of kpts_to_kmesh may become available in PySCF-2.9 +def kpts_to_kmesh(cell, kpts, precision=None, rcut=None): + '''Search the minimal BvK mesh or Monkhorst-Pack k-point mesh''' + assert kpts.ndim == 2 scaled_kpts = cell.get_scaled_kpts(kpts) logger.debug3(cell, ' scaled_kpts kpts %s', scaled_kpts) - # cell.nimgs are the upper limits for kmesh - kmesh = np.asarray(cell.nimgs) * 2 + 1 + if rcut is None: + kmesh = np.asarray(cell.nimgs) * 2 + 1 + else: + nimgs = cell.get_bounding_sphere(rcut) + kmesh = nimgs * 2 + 1 if precision is None: precision = cell.precision * 1e2 for i in range(3): floats = scaled_kpts[:,i] uniq_floats_idx = np.unique(floats.round(6), return_index=True)[1] uniq_floats = floats[uniq_floats_idx] - # Limit the number of images to 30 in each direction fracs = [Fraction(x).limit_denominator(int(kmesh[i])) for x in uniq_floats] 
denominators = np.unique([x.denominator for x in fracs]) common_denominator = reduce(np.lcm, denominators) @@ -43,14 +46,4 @@ def kpts_to_kmesh(cell, kpts, precision=None, max_images=10000): i, common_denominator, abs(fs - np.rint(fs)).max()) logger.debug3(cell, ' unique kpts %s', uniq_floats) logger.debug3(cell, ' frac kpts %s', fracs) - - assert max_images > 0 - if np.prod(kmesh) > max_images: - kmesh_raw = kmesh.copy() - for i in itertools.cycle(np.argsort(kmesh)[::-1]): - kmesh[i] = int(kmesh[i] * .8) - if np.prod(kmesh) < max_images: - break - logger.warn(cell, 'kmesh (%s) exceeds max_images (%d); reduced to %s', - kmesh_raw, max_images, kmesh) return kmesh diff --git a/gpu4pyscf/pop/esp.py b/gpu4pyscf/pop/esp.py index 8406ac06..e6d41e5f 100644 --- a/gpu4pyscf/pop/esp.py +++ b/gpu4pyscf/pop/esp.py @@ -88,7 +88,7 @@ def vdw_surface(mol, scales=[1.0], density=1.0*radii.BOHR**2, rad=R_VDW): Generate vdw surface of molecules, in Bohr ''' coords = mol.atom_coords(unit='B') - charges = mol.atom_charges() + charges = [gto.charge(sym) for sym in mol.elements] atom_radii = rad[charges] surface_points = [] @@ -196,7 +196,7 @@ def resp_solve(mol, dm, grid_density=1.0*radii.BOHR**2, q[u] = q[v] = q[w] ''' - charges = mol.atom_charges() + charges = np.asarray([gto.charge(sym) for sym in mol.elements]) natm = mol.natm is_restraint = charges > 1 is_restraint[charges == 1] = not hfree diff --git a/gpu4pyscf/properties/polarizability.py b/gpu4pyscf/properties/polarizability.py index 8face371..7949b4f5 100644 --- a/gpu4pyscf/properties/polarizability.py +++ b/gpu4pyscf/properties/polarizability.py @@ -13,11 +13,10 @@ # limitations under the License. import numpy as np -from gpu4pyscf.scf import cphf import cupy +from gpu4pyscf.scf import hf, cphf, _response_functions from gpu4pyscf.lib.cupy_helper import contract - def gen_vind(mf, mo_coeff, mo_occ): """get the induced potential. This is the same as contract the mo1 with the kernel. @@ -59,6 +58,7 @@ def eval_polarizability(mf): Returns: polarizability (numpy.array): polarizability in au """ + assert isinstance(mf, hf.RHF), "Unrestricted mf object is not supported." 
polarizability = np.empty((3, 3)) diff --git a/gpu4pyscf/properties/tests/test_polarizability.py b/gpu4pyscf/properties/tests/test_polarizability.py index e9aebe48..7c02c718 100644 --- a/gpu4pyscf/properties/tests/test_polarizability.py +++ b/gpu4pyscf/properties/tests/test_polarizability.py @@ -17,6 +17,7 @@ import pyscf from pyscf import lib from pyscf.dft import rks as rks_cpu +from pyscf.dft import uks as uks_cpu from gpu4pyscf.dft import rks, uks from gpu4pyscf.properties import polarizability @@ -62,7 +63,7 @@ def run_dft_df_polarizability(xc): polar = polarizability.eval_polarizability(mf) return e_dft, polar -def _vs_cpu(xc): +def _vs_cpu_rks(xc): mf = rks.RKS(mol, xc=xc) mf.grids.level = grids_level e_gpu = mf.kernel() @@ -76,6 +77,20 @@ def _vs_cpu(xc): assert np.abs(e_gpu - e_cpu) < 1e-5 assert np.linalg.norm(polar_cpu - polar_gpu) < 1e-3 +def _vs_cpu_uks(xc): + mf = uks.UKS(mol, xc=xc) + mf.grids.level = grids_level + e_gpu = mf.kernel() + polar_gpu = polarizability.eval_polarizability(mf) + + mf_cpu = uks_cpu.UKS(mol, xc=xc) + mf_cpu.conv_tol = 1e-12 + e_cpu = mf_cpu.kernel() + polar_cpu = polar.rhf.Polarizability(mf_cpu).polarizability() + + assert np.abs(e_gpu - e_cpu) < 1e-5 + assert np.linalg.norm(polar_cpu - polar_gpu) < 1e-3 + class KnownValues(unittest.TestCase): ''' known values are obtained by Q-Chem @@ -140,9 +155,16 @@ def test_rks_b3lyp_df(self): assert np.allclose(polar, qchem_polar) @unittest.skipIf(polar is None, "Skipping test if pyscf.properties is not installed") - def test_cpu(self): - _vs_cpu('b3lyp') + def test_cpu_rks(self): + _vs_cpu_rks('b3lyp') + """ + # UKS is not supported yet + @unittest.skipIf(polar is None, "Skipping test if pyscf.properties is not installed") + def test_cpu_uks(self): + _vs_cpu_uks('b3lyp') + """ + if __name__ == "__main__": print("Full Tests for polarizabillity") unittest.main() diff --git a/gpu4pyscf/scf/hf.py b/gpu4pyscf/scf/hf.py index 3a0497ff..09523d4a 100644 --- a/gpu4pyscf/scf/hf.py +++ b/gpu4pyscf/scf/hf.py @@ -51,15 +51,13 @@ def _get_jk(mf, mol=None, dm=None, hermi=1, with_j=True, with_k=True, vj, vk = get_jk(mol, dm, hermi, vhfopt, with_j, with_k, omega) return vj, vk -def make_rdm1(mf, mo_coeff=None, mo_occ=None, **kwargs): - if mo_occ is None: mo_occ = mf.mo_occ - if mo_coeff is None: mo_coeff = mf.mo_coeff +def make_rdm1(mo_coeff, mo_occ): mo_coeff = cupy.asarray(mo_coeff) mo_occ = cupy.asarray(mo_occ) is_occ = mo_occ > 0 mocc = mo_coeff[:, is_occ] dm = cupy.dot(mocc*mo_occ[is_occ], mocc.conj().T) - occ_coeff = mo_coeff[:, mo_occ>1.0] + occ_coeff = mo_coeff[:, is_occ] return tag_array(dm, occ_coeff=occ_coeff, mo_occ=mo_occ, mo_coeff=mo_coeff) def get_occ(mf, mo_energy=None, mo_coeff=None): @@ -422,7 +420,6 @@ def check_sanity(self): init_guess_by_chkfile = hf_cpu.SCF.init_guess_by_chkfile from_chk = hf_cpu.SCF.from_chk get_init_guess = return_cupy_array(hf_cpu.SCF.get_init_guess) - make_rdm1 = make_rdm1 make_rdm2 = NotImplemented energy_elec = energy_elec energy_tot = energy_tot @@ -461,6 +458,11 @@ def check_sanity(self): mulliken_pop = NotImplemented mulliken_meta = NotImplemented + def make_rdm1(self, mo_coeff=None, mo_occ=None, **kwargs): + if mo_occ is None: mo_occ = self.mo_occ + if mo_coeff is None: mo_coeff = self.mo_coeff + return make_rdm1(mo_coeff, mo_occ) + def dip_moment(self, mol=None, dm=None, unit='Debye', origin=None, verbose=logger.NOTE): if mol is None: mol = self.mol diff --git a/gpu4pyscf/scf/j_engine.py b/gpu4pyscf/scf/j_engine.py index 3d98ae5f..715eef45 100644 --- a/gpu4pyscf/scf/j_engine.py 
+++ b/gpu4pyscf/scf/j_engine.py @@ -26,7 +26,7 @@ from pyscf import __config__ from gpu4pyscf.lib.cupy_helper import load_library, condense, sandwich_dot, transpose_sum from gpu4pyscf.__config__ import props as gpu_specs -from gpu4pyscf.__config__ import _num_devices +from gpu4pyscf.__config__ import num_devices from gpu4pyscf.lib import logger from gpu4pyscf.scf import jk from gpu4pyscf.scf.jk import _make_j_engine_pair_locs, RysIntEnvVars, _scale_sp_ctr_coeff @@ -53,7 +53,7 @@ def get_j(mol, dm, hermi=1, vhfopt=None, omega=None, verbose=None): if vhfopt is None: with mol.with_range_coulomb(omega): groupsize = None - if _num_devices > 1: + if num_devices > 1: groupsize = jk.GROUP_SIZE vhfopt = _VHFOpt(mol).build(group_size=groupsize) if omega is None: diff --git a/gpu4pyscf/scf/jk.py b/gpu4pyscf/scf/jk.py index 0e328204..a0048bf5 100644 --- a/gpu4pyscf/scf/jk.py +++ b/gpu4pyscf/scf/jk.py @@ -26,12 +26,12 @@ from pyscf.gto import ANG_OF, ATOM_OF, NPRIM_OF, NCTR_OF, PTR_COORD, PTR_COEFF from pyscf import lib from pyscf.scf import _vhf -from pyscf import __config__ from gpu4pyscf.lib.cupy_helper import (load_library, condense, sandwich_dot, transpose_sum, reduce_to_device) +from gpu4pyscf.__config__ import _streams, num_devices, shm_size from gpu4pyscf.__config__ import props as gpu_specs -from gpu4pyscf.__config__ import _streams, _num_devices from gpu4pyscf.lib import logger +from gpu4pyscf.lib import multi_gpu from gpu4pyscf.gto.mole import group_basis __all__ = [ @@ -54,34 +54,68 @@ UNROLL_NFMAX = ctypes.c_int.in_dll(libvhf_rys, 'rys_jk_unrolled_max_nf').value UNROLL_J_LMAX = ctypes.c_int.in_dll(libvhf_rys, 'rys_j_unrolled_lmax').value UNROLL_J_MAX_ORDER = ctypes.c_int.in_dll(libvhf_rys, 'rys_j_unrolled_max_order').value +SHM_SIZE = shm_size - 1024 +del shm_size GOUT_WIDTH = 42 -SHM_SIZE = getattr(__config__, 'GPU_SHM_SIZE', - int(gpu_specs['sharedMemPerBlockOptin']//9)*8) THREADS = 256 GROUP_SIZE = 256 -def _jk_task(mol, dms, vhfopt, task_list, hermi=0, - device_id=0, with_j=True, with_k=True, verbose=None): - n_dm = dms.shape[0] - nao, _ = vhfopt.coeff.shape +def get_jk(mol, dm, hermi=0, vhfopt=None, with_j=True, with_k=True, verbose=None): + '''Compute J, K matrices + ''' + assert with_j or with_k + log = logger.new_logger(mol, verbose) + cput0 = log.init_timer() + + if vhfopt is None: + vhfopt = _VHFOpt(mol).build() + + mol = vhfopt.sorted_mol + nao, nao_orig = vhfopt.coeff.shape + + dm = cp.asarray(dm, order='C') + dms = dm.reshape(-1,nao_orig,nao_orig) + #:dms = cp.einsum('pi,nij,qj->npq', vhfopt.coeff, dms, vhfopt.coeff) + dms = sandwich_dot(dms, vhfopt.coeff.T) + dms = cp.asarray(dms, order='C') + + ao_loc = mol.ao_loc + nao = ao_loc[-1] uniq_l_ctr = vhfopt.uniq_l_ctr uniq_l = uniq_l_ctr[:,0] l_ctr_bas_loc = vhfopt.l_ctr_offsets l_symb = [lib.param.ANGULAR[i] for i in uniq_l] - kern = libvhf_rys.RYS_build_jk + n_groups = np.count_nonzero(uniq_l <= LMAX) - timing_counter = Counter() - kern_counts = 0 - with cp.cuda.Device(device_id), _streams[device_id]: + dm_cond = condense('absmax', dms, ao_loc) + if hermi == 0: + # Wrap the triu contribution to tril + dm_cond = dm_cond + dm_cond.T + dm_cond = cp.log(dm_cond + 1e-300).astype(np.float32) + log_max_dm = float(dm_cond.max()) + log_cutoff = math.log(vhfopt.direct_scf_tol) + + tasks = [(i,j,k,l) + for i in range(n_groups) + for j in range(i+1) + for k in range(i+1) + for l in range(k+1)] + schemes = {t: quartets_scheme(mol, uniq_l_ctr[list(t)]) for t in tasks} + + def proc(dms, dm_cond): + device_id = cp.cuda.device.get_device_id() + 
stream = cp.cuda.stream.get_current_stream() log = logger.new_logger(mol, verbose) - cput0 = log.init_timer() - dms = cp.asarray(dms) + t0 = log.init_timer() + dms = cp.asarray(dms) # transfer to current device + dm_cond = cp.asarray(dm_cond) if hermi == 0: # Contract the tril and triu parts separately dms = cp.vstack([dms, dms.transpose(0,2,1)]) n_dm = dms.shape[0] - tile_q_ptr = ctypes.cast(vhfopt.tile_q_cond.data.ptr, ctypes.c_void_p) + tile_q_cond = vhfopt.tile_q_cond + tile_q_ptr = ctypes.cast(tile_q_cond.data.ptr, ctypes.c_void_p) q_ptr = ctypes.cast(vhfopt.q_cond.data.ptr, ctypes.c_void_p) s_ptr = lib.c_null_ptr() if mol.omega < 0: @@ -97,31 +131,34 @@ def _jk_task(mol, dms, vhfopt, task_list, hermi=0, vj = cp.zeros(dms.shape) vj_ptr = ctypes.cast(vj.data.ptr, ctypes.c_void_p) - ao_loc = mol.ao_loc - dm_cond = cp.log(condense('absmax', dms, ao_loc) + 1e-300).astype(np.float32) - log_max_dm = dm_cond.max() - log_cutoff = math.log(vhfopt.direct_scf_tol) - tile_mappings = _make_tril_tile_mappings(l_ctr_bas_loc, vhfopt.tile_q_cond, + tile_mappings = _make_tril_tile_mappings(l_ctr_bas_loc, tile_q_cond, log_cutoff-log_max_dm) workers = gpu_specs['multiProcessorCount'] pool = cp.empty((workers, QUEUE_DEPTH*4), dtype=np.uint16) info = cp.empty(2, dtype=np.uint32) - t1 = log.timer_debug1(f'q_cond and dm_cond on Device {device_id}', *cput0) + t1 = log.timer_debug1(f'q_cond and dm_cond on Device {device_id}', *t0) - for i, j, k, l in task_list: - ij_shls = (l_ctr_bas_loc[i], l_ctr_bas_loc[i+1], - l_ctr_bas_loc[j], l_ctr_bas_loc[j+1]) + init_constant(mol) + timing_counter = Counter() + kern_counts = 0 + kern = libvhf_rys.RYS_build_jk + + while tasks: + try: + task = tasks.pop() + except IndexError: + break + + i, j, k, l = task + shls_slice = l_ctr_bas_loc[[i, i+1, j, j+1, k, k+1, l, l+1]] tile_ij_mapping = tile_mappings[i,j] - llll = f'({l_symb[i]}{l_symb[j]}|{l_symb[k]}{l_symb[l]})' - kl_shls = (l_ctr_bas_loc[k], l_ctr_bas_loc[k+1], - l_ctr_bas_loc[l], l_ctr_bas_loc[l+1]) tile_kl_mapping = tile_mappings[k,l] - scheme = quartets_scheme(mol, uniq_l_ctr[[i, j, k, l]]) + scheme = schemes[task] err = kern( vj_ptr, vk_ptr, ctypes.cast(dms.data.ptr, ctypes.c_void_p), ctypes.c_int(n_dm), ctypes.c_int(nao), vhfopt.rys_envs, (ctypes.c_int*2)(*scheme), - (ctypes.c_int*8)(*ij_shls, *kl_shls), + (ctypes.c_int*8)(*shls_slice), ctypes.c_int(tile_ij_mapping.size), ctypes.c_int(tile_kl_mapping.size), ctypes.cast(tile_ij_mapping.data.ptr, ctypes.c_void_p), @@ -135,12 +172,17 @@ def _jk_task(mol, dms, vhfopt, task_list, hermi=0, mol._atm.ctypes, ctypes.c_int(mol.natm), mol._bas.ctypes, ctypes.c_int(mol.nbas), mol._env.ctypes) if err != 0: + llll = f'({l_symb[i]}{l_symb[j]}|{l_symb[k]}{l_symb[l]})' raise RuntimeError(f'RYS_build_jk kernel for {llll} failed') if log.verbose >= logger.DEBUG1: + llll = f'({l_symb[i]}{l_symb[j]}|{l_symb[k]}{l_symb[l]})' msg = f'processing {llll}, tasks = {info[1].get()} on Device {device_id}' t1, t1p = log.timer_debug1(msg, *t1), t1 timing_counter[llll] += t1[1] - t1p[1] kern_counts += 1 + if num_devices > 1: + stream.synchronize() + if with_j: if hermi == 1: vj *= 2. 
@@ -153,67 +195,16 @@ def _jk_task(mol, dms, vhfopt, task_list, hermi=0, else: vk, vkT = vk[:n_dm//2], vk[n_dm//2:] vk += vkT.transpose(0,2,1) - return vj, vk, kern_counts, timing_counter - -def get_jk(mol, dm, hermi=0, vhfopt=None, with_j=True, with_k=True, verbose=None): - '''Compute J, K matrices - ''' - log = logger.new_logger(mol, verbose) - cput0 = log.init_timer() - - if vhfopt is None: - vhfopt = _VHFOpt(mol).build() - - mol = vhfopt.sorted_mol - nao, nao_orig = vhfopt.coeff.shape - - dm = cp.asarray(dm, order='C') - dms = dm.reshape(-1,nao_orig,nao_orig) - #:dms = cp.einsum('pi,nij,qj->npq', vhfopt.coeff, dms, vhfopt.coeff) - dms = sandwich_dot(dms, vhfopt.coeff.T) - dms = cp.asarray(dms, order='C') - - n_dm = dms.shape[0] - - assert with_j or with_k - - init_constant(mol) - - uniq_l_ctr = vhfopt.uniq_l_ctr - uniq_l = uniq_l_ctr[:,0] - l_symb = [lib.param.ANGULAR[i] for i in uniq_l] - n_groups = np.count_nonzero(uniq_l <= LMAX) - - tasks = [] - for i in range(n_groups): - for j in range(i+1): - for k in range(i+1): - for l in range(k+1): - tasks.append((i,j,k,l)) - tasks = np.array(tasks) - task_list = [] - for device_id in range(_num_devices): - task_list.append(tasks[device_id::_num_devices]) - - cp.cuda.get_current_stream().synchronize() - futures = [] - with ThreadPoolExecutor(max_workers=_num_devices) as executor: - for device_id in range(_num_devices): - future = executor.submit( - _jk_task, - mol, dms, vhfopt, task_list[device_id], hermi=hermi, - with_j=with_j, with_k=with_k, verbose=verbose, - device_id=device_id) - futures.append(future) + return vj, vk, kern_counts, timing_counter + results = multi_gpu.run(proc, args=(dms, dm_cond), non_blocking=True) kern_counts = 0 timing_collection = Counter() vj_dist = [] vk_dist = [] - for future in futures: - vj, vk, counts, counter = future.result() + for vj, vk, counts, t_counter in results: kern_counts += counts - timing_collection += counter + timing_collection += t_counter vj_dist.append(vj) vk_dist.append(vk) @@ -222,17 +213,14 @@ def get_jk(mol, dm, hermi=0, vhfopt=None, with_j=True, with_k=True, verbose=None for llll, t in timing_collection.items(): log.debug1('%s wall time %.2f', llll, t) - for s in _streams: - s.synchronize() - cp.cuda.get_current_stream().synchronize() vj = vk = None if with_k: - vk = reduce_to_device(vk_dist, inplace=True) + vk = multi_gpu.array_reduce(vk_dist, inplace=True) #:vk = cp.einsum('pi,npq,qj->nij', vhfopt.coeff, vk, vhfopt.coeff) vk = sandwich_dot(vk, vhfopt.coeff) - + if with_j: - vj = reduce_to_device(vj_dist, inplace=True) + vj = multi_gpu.array_reduce(vj_dist, inplace=True) vj = transpose_sum(vj) #:vj = cp.einsum('pi,npq,qj->nij', vhfopt.coeff, vj, vhfopt.coeff) vj = sandwich_dot(vj, vhfopt.coeff) @@ -251,10 +239,7 @@ def get_jk(mol, dm, hermi=0, vhfopt=None, with_j=True, with_k=True, verbose=None else: scripts.append('jk->s1il') shls_excludes = [0, h_shls[0]] * 4 - if hermi == 1: - dms = dms.get() - else: - dms = dms[:n_dm//2].get() + dms = dms.get() vs_h = _vhf.direct_mapdm('int2e_cart', 's8', scripts, dms, 1, mol._atm, mol._bas, mol._env, shls_excludes=shls_excludes) @@ -310,121 +295,148 @@ def get_j(mol, dm, hermi=0, vhfopt=None, verbose=None): ao_loc = mol.ao_loc dm_cond = cp.log(condense('absmax', dms, ao_loc) + 1e-300).astype(np.float32) - log_max_dm = dm_cond.max() + log_max_dm = float(dm_cond.max()) log_cutoff = math.log(vhfopt.direct_scf_tol) + uniq_l_ctr = vhfopt.uniq_l_ctr + uniq_l = uniq_l_ctr[:,0] + l_ctr_bas_loc = vhfopt.l_ctr_offsets + l_symb = [lib.param.ANGULAR[i] for i 
in uniq_l] + n_groups = np.count_nonzero(uniq_l <= LMAX) + ntiles = mol.nbas // TILE + dms = dms.get() pair_loc = _make_j_engine_pair_locs(mol) dm_xyz = np.empty(pair_loc[-1]) libvhf_rys.transform_cart_to_xyz( dm_xyz.ctypes, dms.ctypes, ao_loc.ctypes, pair_loc.ctypes, mol._bas.ctypes, ctypes.c_int(mol.nbas), mol._env.ctypes) - dm_xyz = cp.asarray(dm_xyz) - vj_xyz = cp.zeros_like(dm_xyz) - - pair_loc_on_gpu = cp.asarray(pair_loc) - rys_envs = RysIntEnvVars( - mol.natm, mol.nbas, - vhfopt.rys_envs.atm, vhfopt.rys_envs.bas, vhfopt.rys_envs.env, - pair_loc_on_gpu.data.ptr, - ) - err = libvhf_rys.RYS_init_rysj_constant(ctypes.c_int(SHM_SIZE)) - if err != 0: - raise RuntimeError('CUDA kernel initialization') + tasks = [(i,j,k,l) + for i in range(n_groups) + for j in range(i+1) + for k in range(i+1) + for l in range(k+1)] + schemes = {t: _j_engine_quartets_scheme(mol, uniq_l_ctr[list(t)]) for t in tasks} - uniq_l_ctr = vhfopt.uniq_l_ctr - uniq_l = uniq_l_ctr[:,0] - l_ctr_bas_loc = vhfopt.l_ctr_offsets - l_symb = [lib.param.ANGULAR[i] for i in uniq_l] - n_groups = np.count_nonzero(uniq_l <= LMAX) - ntiles = mol.nbas // TILE - tile_mappings = {} - workers = gpu_specs['multiProcessorCount'] - pool = cp.empty((workers, QUEUE_DEPTH*4), dtype=np.uint16) - info = cp.empty(2, dtype=np.uint32) + def proc(dm_xyz, dm_cond): + device_id = cp.cuda.device.get_device_id() + stream = cp.cuda.stream.get_current_stream() + log = logger.new_logger(mol, verbose) + t0 = log.init_timer() + dm_xyz = cp.asarray(dm_xyz) # transfer to current device + dm_cond = cp.asarray(dm_cond) + vj_xyz = cp.zeros_like(dm_xyz) + pair_loc_on_gpu = cp.asarray(pair_loc) + _atm, _bas, _env, _ = vhfopt.rys_envs._env_ref_holder + rys_envs = RysIntEnvVars( + mol.natm, mol.nbas, + _atm.data.ptr, _bas.data.ptr, _env.data.ptr, + pair_loc_on_gpu.data.ptr, + ) + tile_q_cond = vhfopt.tile_q_cond + q_cond = vhfopt.q_cond + + err = libvhf_rys.RYS_init_rysj_constant(ctypes.c_int(SHM_SIZE)) + if err != 0: + raise RuntimeError('CUDA kernel initialization') + + tile_mappings = {} + workers = gpu_specs['multiProcessorCount'] + pool = cp.empty((workers, QUEUE_DEPTH*4), dtype=np.uint16) + info = cp.empty(2, dtype=np.uint32) - for i in range(n_groups): - for j in range(i+1): - ish0, ish1 = l_ctr_bas_loc[i], l_ctr_bas_loc[i+1] - jsh0, jsh1 = l_ctr_bas_loc[j], l_ctr_bas_loc[j+1] - ij_shls = (ish0, ish1, jsh0, jsh1) - i0 = ish0 // TILE - i1 = ish1 // TILE - j0 = jsh0 // TILE - j1 = jsh1 // TILE - sub_tile_q = vhfopt.tile_q_cond[i0:i1,j0:j1] - mask = sub_tile_q > log_cutoff - log_max_dm - if i == j: - mask = cp.tril(mask) - t_ij = (cp.arange(i0, i1, dtype=np.int32)[:,None] * ntiles + - cp.arange(j0, j1, dtype=np.int32)) - idx = cp.argsort(sub_tile_q[mask])[::-1] - tile_mappings[i,j] = t_ij[mask][idx] - t1 = t2 = log.timer_debug1('q_cond and dm_cond', *cput0) + for i in range(n_groups): + for j in range(i+1): + ish0, ish1 = l_ctr_bas_loc[i], l_ctr_bas_loc[i+1] + jsh0, jsh1 = l_ctr_bas_loc[j], l_ctr_bas_loc[j+1] + i0 = ish0 // TILE + i1 = ish1 // TILE + j0 = jsh0 // TILE + j1 = jsh1 // TILE + sub_tile_q = tile_q_cond[i0:i1,j0:j1] + mask = sub_tile_q > log_cutoff - log_max_dm + if i == j: + mask = cp.tril(mask) + t_ij = (cp.arange(i0, i1, dtype=np.int32)[:,None] * ntiles + + cp.arange(j0, j1, dtype=np.int32)) + idx = cp.argsort(sub_tile_q[mask])[::-1] + tile_mappings[i,j] = t_ij[mask][idx] + t1 = log.timer_debug1(f'q_cond and dm_cond on Device {device_id}', *t0) + + timing_collection = {} + kern_counts = 0 + kern = libvhf_rys.RYS_build_j + + while tasks: + try: + 
task = tasks.pop() + except IndexError: + break + + i, j, k, l = task + shls_slice = l_ctr_bas_loc[[i, i+1, j, j+1, k, k+1, l, l+1]] + tile_ij_mapping = tile_mappings[i,j] + tile_kl_mapping = tile_mappings[k,l] + scheme = schemes[task] + err = kern( + ctypes.cast(vj_xyz.data.ptr, ctypes.c_void_p), + ctypes.cast(dm_xyz.data.ptr, ctypes.c_void_p), + ctypes.c_int(n_dm), ctypes.c_int(nao), + rys_envs, (ctypes.c_int*3)(*scheme), + (ctypes.c_int*8)(*shls_slice), + ctypes.c_int(tile_ij_mapping.size), + ctypes.c_int(tile_kl_mapping.size), + ctypes.cast(tile_ij_mapping.data.ptr, ctypes.c_void_p), + ctypes.cast(tile_kl_mapping.data.ptr, ctypes.c_void_p), + ctypes.cast(tile_q_cond.data.ptr, ctypes.c_void_p), + ctypes.cast(q_cond.data.ptr, ctypes.c_void_p), + lib.c_null_ptr(), + ctypes.cast(dm_cond.data.ptr, ctypes.c_void_p), + ctypes.c_float(log_cutoff), + ctypes.cast(pool.data.ptr, ctypes.c_void_p), + ctypes.cast(info.data.ptr, ctypes.c_void_p), + ctypes.c_int(workers), + mol._atm.ctypes, ctypes.c_int(mol.natm), + mol._bas.ctypes, ctypes.c_int(mol.nbas), mol._env.ctypes) + if err != 0: + llll = f'({l_symb[i]}{l_symb[j]}|{l_symb[k]}{l_symb[l]})' + raise RuntimeError(f'RYS_build_jk kernel for {llll} failed') + if log.verbose >= logger.DEBUG1: + llll = f'({l_symb[i]}{l_symb[j]}|{l_symb[k]}{l_symb[l]})' + t1, t1p = log.timer_debug1(f'processing {llll}, tasks = {info[1]}', *t1), t1 + if llll not in timing_collection: + timing_collection[llll] = 0 + timing_collection[llll] += t1[1] - t1p[1] + kern_counts += 1 + if num_devices > 1: + stream.synchronize() + return vj_xyz, kern_counts, timing_collection - timing_collection = {} + results = multi_gpu.run(proc, args=(dm_xyz, dm_cond), non_blocking=True) kern_counts = 0 - kern = libvhf_rys.RYS_build_j - - for i in range(n_groups): - for j in range(i+1): - ij_shls = (l_ctr_bas_loc[i], l_ctr_bas_loc[i+1], - l_ctr_bas_loc[j], l_ctr_bas_loc[j+1]) - tile_ij_mapping = tile_mappings[i,j] - for k in range(i+1): - for l in range(k+1): - llll = f'({l_symb[i]}{l_symb[j]}|{l_symb[k]}{l_symb[l]})' - kl_shls = (l_ctr_bas_loc[k], l_ctr_bas_loc[k+1], - l_ctr_bas_loc[l], l_ctr_bas_loc[l+1]) - tile_kl_mapping = tile_mappings[k,l] - scheme = _j_engine_quartets_scheme(mol, uniq_l_ctr[[i, j, k, l]]) - err = kern( - ctypes.cast(vj_xyz.data.ptr, ctypes.c_void_p), - ctypes.cast(dm_xyz.data.ptr, ctypes.c_void_p), - ctypes.c_int(n_dm), ctypes.c_int(nao), - rys_envs, (ctypes.c_int*3)(*scheme), - (ctypes.c_int*8)(*ij_shls, *kl_shls), - ctypes.c_int(tile_ij_mapping.size), - ctypes.c_int(tile_kl_mapping.size), - ctypes.cast(tile_ij_mapping.data.ptr, ctypes.c_void_p), - ctypes.cast(tile_kl_mapping.data.ptr, ctypes.c_void_p), - ctypes.cast(vhfopt.tile_q_cond.data.ptr, ctypes.c_void_p), - ctypes.cast(vhfopt.q_cond.data.ptr, ctypes.c_void_p), - lib.c_null_ptr(), - ctypes.cast(dm_cond.data.ptr, ctypes.c_void_p), - ctypes.c_float(log_cutoff), - ctypes.cast(pool.data.ptr, ctypes.c_void_p), - ctypes.cast(info.data.ptr, ctypes.c_void_p), - ctypes.c_int(workers), - mol._atm.ctypes, ctypes.c_int(mol.natm), - mol._bas.ctypes, ctypes.c_int(mol.nbas), mol._env.ctypes) - if err != 0: - raise RuntimeError(f'RYS_build_jk kernel for {llll} failed') - if log.verbose >= logger.DEBUG1: - t1, t1p = log.timer_debug1(f'processing {llll}, tasks = {info[1]}', *t1), t1 - if llll not in timing_collection: - timing_collection[llll] = 0 - timing_collection[llll] += t1[1] - t1p[1] - kern_counts += 1 + timing_collection = Counter() + vj_dist = [] + for vj, counts, t_counter in results: + kern_counts += counts + 
timing_collection += t_counter + vj_dist.append(vj) if log.verbose >= logger.DEBUG1: log.debug1('kernel launches %d', kern_counts) for llll, t in timing_collection.items(): log.debug1('%s wall time %.2f', llll, t) - cp.cuda.Stream.null.synchronize() - log.timer_debug1('cuda kernel', *t2) + vj_xyz = multi_gpu.array_reduce(vj_dist, inplace=True) vj_xyz = vj_xyz.get() vj = np.empty_like(dms) libvhf_rys.transform_xyz_to_cart( vj.ctypes, vj_xyz.ctypes, ao_loc.ctypes, pair_loc.ctypes, mol._bas.ctypes, ctypes.c_int(mol.nbas), mol._env.ctypes) #:vj = cp.einsum('pi,npq,qj->nij', vhfopt.coeff, cp.asarray(vj), vhfopt.coeff) - vj = sandwich_dot(vj, vhfopt.coeff) + vj = sandwich_dot(cp.asarray(vj), vhfopt.coeff) vj = transpose_sum(vj) vj *= 2. - vj = vj.reshape(dm.shape) h_shls = vhfopt.h_shls if h_shls: @@ -433,7 +445,7 @@ def get_j(mol, dm, hermi=0, vhfopt=None, verbose=None): scripts = ['ji->s2kl'] shls_excludes = [0, h_shls[0]] * 4 vs_h = _vhf.direct_mapdm('int2e_cart', 's8', scripts, - dms.get(), 1, mol._atm, mol._bas, mol._env, + dms, 1, mol._atm, mol._bas, mol._env, shls_excludes=shls_excludes) vj1 = vs_h[0].reshape(n_dm,nao,nao) coeff = vhfopt.coeff @@ -443,6 +455,7 @@ def get_j(mol, dm, hermi=0, vhfopt=None, verbose=None): vj[i] += coeff.T.dot(cp.asarray(v)).dot(coeff) log.timer_debug1('get_j pass 2 for h functions on cpu', *cput1) + vj = vj.reshape(dm.shape) log.timer('vj', *cput0) return vj @@ -457,7 +470,6 @@ def __init__(self, mol, cutoff=1e-13): # Hold cache on GPU devices self._rys_envs = {} - self._mol_gpu = {} self._q_cond = {} self._tile_q_cond = {} self._s_estimator = {} @@ -550,11 +562,11 @@ def rys_envs(self): _bas = cp.array(mol._bas) _env = cp.array(_scale_sp_ctr_coeff(mol)) ao_loc = cp.array(mol.ao_loc) - self._mol_gpu[device_id] = (_atm, _bas, _env, ao_loc) - self._rys_envs[device_id] = RysIntEnvVars( + self._rys_envs[device_id] = rys_envs = RysIntEnvVars( mol.natm, mol.nbas, _atm.data.ptr, _bas.data.ptr, _env.data.ptr, ao_loc.data.ptr) + rys_envs._env_ref_holder = (_atm, _bas, _env, ao_loc) return self._rys_envs[device_id] class RysIntEnvVars(ctypes.Structure): @@ -600,13 +612,12 @@ def g_pair_idx(ij_inc=None): def init_constant(mol): g_idx, offsets = g_pair_idx() - for device_id in range(_num_devices): - with cp.cuda.Device(device_id), _streams[device_id]: - err = libvhf_rys.RYS_init_constant( - g_idx.ctypes, offsets.ctypes, mol._env.ctypes, - ctypes.c_int(mol._env.size), ctypes.c_int(SHM_SIZE)) - if err != 0: - raise RuntimeError(f'CUDA kernel initialization on device {device_id}') + err = libvhf_rys.RYS_init_constant( + g_idx.ctypes, offsets.ctypes, mol._env.ctypes, + ctypes.c_int(mol._env.size), ctypes.c_int(SHM_SIZE)) + if err != 0: + device_id = cp.cuda.device.get_device_id() + raise RuntimeError(f'CUDA kernel initialization on device {device_id}') def _make_tril_tile_mappings(l_ctr_bas_loc, tile_q_cond, cutoff, tile=TILE): n_groups = len(l_ctr_bas_loc) - 1 diff --git a/gpu4pyscf/scf/tests/test_scf_jk.py b/gpu4pyscf/scf/tests/test_scf_jk.py index 78ae68eb..e311482f 100644 --- a/gpu4pyscf/scf/tests/test_scf_jk.py +++ b/gpu4pyscf/scf/tests/test_scf_jk.py @@ -125,3 +125,32 @@ def test_jk_hermi0(): assert abs(vj2+vj3 - vj1).max() < 1e-9 assert abs(vk2+vk3 - vk1).max() < 1e-9 + +def test_jk_hermi0_l5(): + mol = pyscf.M( + atom = ''' + O 0.000 -0. 0.1174 + H -0.757 4. -0.4696 + H 0.757 4. -0.4696 + C 1. 1. 0. + H 4. 0. 3. + H 0. 1. 
.6 + ''', + basis={'default': 'def2-tzvp', 'O': [[5, [1., 1.]]]}, + unit='B',) + + np.random.seed(9) + nao = mol.nao + dm = np.random.rand(nao, nao) + vj, vk = jk.get_jk(mol, dm, hermi=0) + vj = vj.get() + vk = vk.get() + ref = get_jk(mol, dm, hermi=0) + assert abs(vj - ref[0]).max() < 1e-9 + assert abs(vk - ref[1]).max() < 1e-9 + assert abs(lib.fp(vj) - -61.28856847097108) < 1e-9 + assert abs(lib.fp(vk) - -76.38373664249241) < 1e-9 + + vj = jk.get_j(mol, dm, hermi=0).get() + assert abs(vj - ref[0]).max() < 1e-9 + assert abs(lib.fp(vj) - -61.28856847097108) < 1e-9 diff --git a/gpu4pyscf/scf/uhf.py b/gpu4pyscf/scf/uhf.py index 12a01d57..1107cbd2 100644 --- a/gpu4pyscf/scf/uhf.py +++ b/gpu4pyscf/scf/uhf.py @@ -38,10 +38,6 @@ def make_rdm1(mo_coeff, mo_occ, **kwargs): mo_b = mo_coeff[1] dm_a = cupy.dot(mo_a*mo_occ[0], mo_a.conj().T) dm_b = cupy.dot(mo_b*mo_occ[1], mo_b.conj().T) -# DO NOT make tag_array for DM here because the DM arrays may be modified and -# passed to functions like get_jk, get_vxc. These functions may take the tags -# (mo_coeff, mo_occ) to compute the potential if tags were found in the DM -# arrays and modifications to DM arrays may be ignored. return tag_array((dm_a, dm_b), mo_coeff=mo_coeff, mo_occ=mo_occ) diff --git a/gpu4pyscf/solvent/grad/pcm.py b/gpu4pyscf/solvent/grad/pcm.py index 3fe7cb6c..28711f77 100644 --- a/gpu4pyscf/solvent/grad/pcm.py +++ b/gpu4pyscf/solvent/grad/pcm.py @@ -40,13 +40,6 @@ def grad_switch_h(x): dy[x>1] = 0.0 return dy -def gradgrad_switch_h(x): - ''' 2nd derivative of h(x) ''' - ddy = 60.0*x - 180.0*x**2 + 120*x**3 - ddy[x<0] = 0.0 - ddy[x>1] = 0.0 - return ddy - def get_dF_dA(surface): ''' J. Chem. Phys. 133, 244111 (2010), Appendix C @@ -63,10 +56,9 @@ def get_dF_dA(surface): dF = cupy.zeros([ngrids, natom, 3]) dA = cupy.zeros([ngrids, natom, 3]) - for ia in range(atom_coords.shape[0]): + for ia in range(natom): p0,p1 = surface['gslice_by_atom'][ia] coords = grid_coords[p0:p1] - p1 = p0 + coords.shape[0] ri_rJ = cupy.expand_dims(coords, axis=1) - atom_coords riJ = cupy.linalg.norm(ri_rJ, axis=-1) diJ = (riJ - R_in_J) / R_sw_J @@ -145,9 +137,7 @@ def get_dD_dS(surface, with_S=True, with_D=False, stream=None): ''' charge_exp = surface['charge_exp'] grid_coords = surface['grid_coords'] - switch_fun = surface['switch_fun'] norm_vec = surface['norm_vec'] - R_vdw = surface['R_vdw'] n = charge_exp.shape[0] dS = cupy.empty([3,n,n]) dD = None @@ -163,9 +153,7 @@ def get_dD_dS(surface, with_S=True, with_D=False, stream=None): dD_ptr, dS_ptr, ctypes.cast(grid_coords.data.ptr, ctypes.c_void_p), ctypes.cast(norm_vec.data.ptr, ctypes.c_void_p), - ctypes.cast(R_vdw.data.ptr, ctypes.c_void_p), ctypes.cast(charge_exp.data.ptr, ctypes.c_void_p), - ctypes.cast(switch_fun.data.ptr, ctypes.c_void_p), ctypes.c_int(n) ) if err != 0: @@ -181,7 +169,7 @@ def get_dSii(surface, dF): dSii = dSii_dF[:,None] * dF return dSii -def grad_nuc(pcmobj, dm): +def grad_nuc(pcmobj, dm, q_sym = None): mol = pcmobj.mol log = logger.new_logger(mol, mol.verbose) t1 = log.init_timer() @@ -194,7 +182,8 @@ def grad_nuc(pcmobj, dm): pcmobj._get_vind(dm) mol = pcmobj.mol - q_sym = pcmobj._intermediates['q_sym'].get() + if q_sym is None: + q_sym = pcmobj._intermediates['q_sym'].get() gridslice = pcmobj.surface['gslice_by_atom'] grid_coords = pcmobj.surface['grid_coords'].get() exponents = pcmobj.surface['charge_exp'].get() @@ -220,7 +209,7 @@ def grad_nuc(pcmobj, dm): t1 = log.timer_debug1('grad nuc', *t1) return de -def grad_qv(pcmobj, dm): +def grad_qv(pcmobj, dm, q_sym = None): ''' 
contributions due to integrals ''' @@ -237,7 +226,8 @@ def grad_qv(pcmobj, dm): gridslice = pcmobj.surface['gslice_by_atom'] charge_exp = pcmobj.surface['charge_exp'] grid_coords = pcmobj.surface['grid_coords'] - q_sym = pcmobj._intermediates['q_sym'] + if q_sym is None: + q_sym = pcmobj._intermediates['q_sym'] intopt = int3c1e.VHFOpt(mol) intopt.build(1e-14, aosym=False) @@ -282,12 +272,23 @@ def grad_solver(pcmobj, dm): vK_1 = cupy.linalg.solve(K.T, v_grids) epsilon = pcmobj.eps + def contract_bra(a, B, c): + ''' i,xij,j->jx ''' + tmp = a.dot(B) + return (tmp*c).T + + def contract_ket(a, B, c): + ''' i,xij,j->ix ''' + tmp = B.dot(c) + return (a*tmp).T + de = cupy.zeros([pcmobj.mol.natm,3]) if pcmobj.method.upper() in ['C-PCM', 'CPCM', 'COSMO']: dD, dS = get_dD_dS(pcmobj.surface, with_D=False, with_S=True) # dR = 0, dK = dS - de_dS = (vK_1 * dS.dot(q)).T # cupy.einsum('i,xij,j->ix', vK_1, dS, q) + de_dS = 0.5 * contract_ket(vK_1, dS, q) + de_dS -= 0.5 * contract_bra(vK_1, dS, q) de -= cupy.asarray([cupy.sum(de_dS[p0:p1], axis=0) for p0,p1 in gridslice]) dD = dS = None @@ -295,24 +296,13 @@ def grad_solver(pcmobj, dm): dSii = get_dSii(pcmobj.surface, dF) de -= 0.5*contract('i,xij->jx', vK_1*q, dSii) # 0.5*cupy.einsum('i,xij,i->jx', vK_1, dSii, q) - elif pcmobj.method.upper() in ['IEF-PCM', 'IEFPCM', 'SS(V)PE', 'SMD']: + elif pcmobj.method.upper() in ['IEF-PCM', 'IEFPCM', 'SMD']: dF, dA = get_dF_dA(pcmobj.surface) dSii = get_dSii(pcmobj.surface, dF) dF = None dD, dS = get_dD_dS(pcmobj.surface, with_D=True, with_S=True) - def contract_bra(a, B, c): - ''' i,xij,j->jx ''' - tmp = a.dot(B) - return (tmp*c).T - - def contract_ket(a, B, c): - ''' i,xij,j->ix ''' - tmp = B.dot(c) - return (a*tmp).T - - # IEF-PCM and SS(V)PE formally are the same in gradient calculation # dR = f_eps/(2*pi) * (dD*A + D*dA), # dK = dS - f_eps/(2*pi) * (dD*A*S + D*dA*S + D*A*dS) f_epsilon = (epsilon - 1.0)/(epsilon + 1.0) @@ -352,6 +342,67 @@ def contract_ket(a, B, c): de_dK = de_dS0 - fac * (de_dD + de_dA + de_dS1) de += de_dR - de_dK + elif pcmobj.method.upper() in [ 'SS(V)PE' ]: + dF, dA = get_dF_dA(pcmobj.surface) + dSii = get_dSii(pcmobj.surface, dF) + dF = None + + dD, dS = get_dD_dS(pcmobj.surface, with_D=True, with_S=True) + + # dR = f_eps/(2*pi) * (dD*A + D*dA), + # dK = dS - f_eps/(2*pi) * (dD*A*S + D*dA*S + D*A*dS) + f_epsilon = (epsilon - 1.0)/(epsilon + 1.0) + fac = f_epsilon/(2.0*PI) + + Av = A*v_grids + de_dR = 0.5*fac * contract_ket(vK_1, dD, Av) + de_dR -= 0.5*fac * contract_bra(vK_1, dD, Av) + de_dR = cupy.asarray([cupy.sum(de_dR[p0:p1], axis=0) for p0,p1 in gridslice]) + + vK_1_D = vK_1.dot(D) + vK_1_Dv = vK_1_D * v_grids + de_dR += 0.5*fac * contract('j,xjn->nx', vK_1_Dv, dA) + + de_dS0 = 0.5*contract_ket(vK_1, dS, q) + de_dS0 -= 0.5*contract_bra(vK_1, dS, q) + de_dS0 = cupy.asarray([cupy.sum(de_dS0[p0:p1], axis=0) for p0,p1 in gridslice]) + + vK_1_q = vK_1 * q + de_dS0 += 0.5*contract('i,xin->nx', vK_1_q, dSii) + + vK_1_DA = vK_1_D*A + de_dS1 = 0.5*contract_ket(vK_1_DA, dS, q) + de_dS1 -= 0.5*contract_bra(vK_1_DA, dS, q) + de_dS1 = cupy.asarray([cupy.sum(de_dS1[p0:p1], axis=0) for p0,p1 in gridslice]) + vK_1_DAq = vK_1_DA*q + de_dS1 += 0.5*contract('j,xjn->nx', vK_1_DAq, dSii) + + DT_q = cupy.dot(D.T, q) + ADT_q = A * DT_q + de_dS1_T = 0.5*contract_ket(vK_1, dS, ADT_q) + de_dS1_T -= 0.5*contract_bra(vK_1, dS, ADT_q) + de_dS1_T = cupy.asarray([cupy.sum(de_dS1_T[p0:p1], axis=0) for p0,p1 in gridslice]) + vK_1_ADT_q = vK_1 * ADT_q + de_dS1_T += 0.5*contract('j,xjn->nx', vK_1_ADT_q, dSii) + + Sq = 
cupy.dot(S,q) + ASq = A*Sq + de_dD = 0.5*contract_ket(vK_1, dD, ASq) + de_dD -= 0.5*contract_bra(vK_1, dD, ASq) + de_dD = cupy.asarray([cupy.sum(de_dD[p0:p1], axis=0) for p0,p1 in gridslice]) + + vK_1_S = cupy.dot(vK_1, S) + vK_1_SA = vK_1_S * A + de_dD_T = 0.5*contract_ket(vK_1_SA, -dD.transpose(0,2,1), q) + de_dD_T -= 0.5*contract_bra(vK_1_SA, -dD.transpose(0,2,1), q) + de_dD_T = cupy.asarray([cupy.sum(de_dD_T[p0:p1], axis=0) for p0,p1 in gridslice]) + + de_dA = 0.5*contract('j,xjn->nx', vK_1_D*Sq, dA) # 0.5*cupy.einsum('j,xjn,j->nx', vK_1_D, dA, Sq) + + de_dA_T = 0.5*contract('j,xjn->nx', vK_1_S*DT_q, dA) + + de_dK = de_dS0 - 0.5 * fac * (de_dD + de_dA + de_dS1 + de_dD_T + de_dA_T + de_dS1_T) + de += de_dR - de_dK else: raise RuntimeError(f"Unknown implicit solvent model: {pcmobj.method}") t1 = log.timer_debug1('grad solver', *t1) diff --git a/gpu4pyscf/solvent/hessian/pcm.py b/gpu4pyscf/solvent/hessian/pcm.py index 538cb859..11c3e1df 100644 --- a/gpu4pyscf/solvent/hessian/pcm.py +++ b/gpu4pyscf/solvent/hessian/pcm.py @@ -19,141 +19,685 @@ import numpy import cupy +import ctypes from pyscf import lib, gto from gpu4pyscf import scf -from gpu4pyscf.solvent.pcm import PI -from gpu4pyscf.solvent.grad.pcm import grad_qv, grad_solver, grad_nuc, get_dD_dS, get_dF_dA, get_dSii +from gpu4pyscf.solvent.pcm import PI, switch_h, libsolvent +from gpu4pyscf.solvent.grad.pcm import grad_qv, grad_solver, grad_nuc, get_dD_dS, get_dF_dA, get_dSii, grad_switch_h from gpu4pyscf.df import int3c2e from gpu4pyscf.lib import logger from gpu4pyscf.hessian.jk import _ao2mo from gpu4pyscf.gto.int3c1e_ip import int1e_grids_ip1, int1e_grids_ip2 +from gpu4pyscf.gto.int3c1e_ipip import int1e_grids_ipip1, int1e_grids_ipvip1, int1e_grids_ipip2, int1e_grids_ip1ip2 from gpu4pyscf.gto import int3c1e from gpu4pyscf.gto.int3c1e import int1e_grids +from pyscf import lib as pyscf_lib -def hess_nuc(pcmobj): - raise NotImplementedError("Not tested") +def gradgrad_switch_h(x): + ''' 2nd derivative of h(x) ''' + ddy = 60.0*x - 180.0*x**2 + 120.0*x**3 + ddy[x<0] = 0.0 + ddy[x>1] = 0.0 + return ddy + +def get_d2F_d2A(surface): + ''' + Notations adopted from + J. Chem. Phys. 
133, 244111 (2010), Appendix C + ''' + atom_coords = surface['atom_coords'] + grid_coords = surface['grid_coords'] + switch_fun = surface['switch_fun'] + area = surface['area'] + R_in_J = surface['R_in_J'] + R_sw_J = surface['R_sw_J'] + + ngrids = grid_coords.shape[0] + natom = atom_coords.shape[0] + d2F = cupy.zeros([ngrids, natom, natom, 3, 3]) + d2A = cupy.zeros([ngrids, natom, natom, 3, 3]) + + for i_grid_atom in range(natom): + p0,p1 = surface['gslice_by_atom'][i_grid_atom] + coords = grid_coords[p0:p1] + si_rJ = cupy.expand_dims(coords, axis=1) - atom_coords + norm_si_rJ = cupy.linalg.norm(si_rJ, axis=-1) + diJ = (norm_si_rJ - R_in_J) / R_sw_J + diJ[:,i_grid_atom] = 1.0 + diJ[diJ < 1e-8] = 0.0 + si_rJ[:,i_grid_atom,:] = 0.0 + si_rJ[diJ < 1e-8] = 0.0 + + fiJ = switch_h(diJ) + dfiJ = grad_switch_h(diJ) + + fiJK = fiJ[:, :, cupy.newaxis] * fiJ[:, cupy.newaxis, :] + dfiJK = dfiJ[:, :, cupy.newaxis] * dfiJ[:, cupy.newaxis, :] + R_sw_JK = R_sw_J[:, cupy.newaxis] * R_sw_J[cupy.newaxis, :] + norm_si_rJK = norm_si_rJ[:, :, cupy.newaxis] * norm_si_rJ[:, cupy.newaxis, :] + terms_size_ngrids_natm_natm = dfiJK / (fiJK * norm_si_rJK * R_sw_JK) + si_rJK = si_rJ[:, :, cupy.newaxis, :, cupy.newaxis] * si_rJ[:, cupy.newaxis, :, cupy.newaxis, :] + d2fiJK_offdiagonal = terms_size_ngrids_natm_natm[:, :, :, cupy.newaxis, cupy.newaxis] * si_rJK + + d2fiJ = gradgrad_switch_h(diJ) + terms_size_ngrids_natm = d2fiJ / (norm_si_rJ**2 * R_sw_J) - dfiJ / (norm_si_rJ**3) + si_rJJ = si_rJ[:, :, :, cupy.newaxis] * si_rJ[:, :, cupy.newaxis, :] + d2fiJK_diagonal = cupy.einsum('qA,qAdD->qAdD', terms_size_ngrids_natm, si_rJJ) + d2fiJK_diagonal += cupy.einsum('qA,dD->qAdD', dfiJ / norm_si_rJ, cupy.eye(3)) + d2fiJK_diagonal /= (fiJ * R_sw_J)[:, :, cupy.newaxis, cupy.newaxis] + + d2fiJK = d2fiJK_offdiagonal + for i_atom in range(natom): + d2fiJK[:, i_atom, i_atom, :, :] = d2fiJK_diagonal[:, i_atom, :, :] + + Fi = switch_fun[p0:p1] + Ai = area[p0:p1] + + d2F[p0:p1, :, :, :, :] += cupy.einsum('q,qABdD->qABdD', Fi, d2fiJK) + d2A[p0:p1, :, :, :, :] += cupy.einsum('q,qABdD->qABdD', Ai, d2fiJK) + + d2fiJK_grid_atom_offdiagonal = -cupy.einsum('qABdD->qAdD', d2fiJK) + d2F[p0:p1, i_grid_atom, :, :, :] = cupy.einsum('q,qAdD->qAdD', Fi, d2fiJK_grid_atom_offdiagonal.transpose(0,1,3,2)) + d2F[p0:p1, :, i_grid_atom, :, :] = cupy.einsum('q,qAdD->qAdD', Fi, d2fiJK_grid_atom_offdiagonal) + d2A[p0:p1, i_grid_atom, :, :, :] = cupy.einsum('q,qAdD->qAdD', Ai, d2fiJK_grid_atom_offdiagonal.transpose(0,1,3,2)) + d2A[p0:p1, :, i_grid_atom, :, :] = cupy.einsum('q,qAdD->qAdD', Ai, d2fiJK_grid_atom_offdiagonal) + + d2fiJK_grid_atom_diagonal = -cupy.einsum('qAdD->qdD', d2fiJK_grid_atom_offdiagonal) + d2F[p0:p1, i_grid_atom, i_grid_atom, :, :] = cupy.einsum('q,qdD->qdD', Fi, d2fiJK_grid_atom_diagonal) + d2A[p0:p1, i_grid_atom, i_grid_atom, :, :] = cupy.einsum('q,qdD->qdD', Ai, d2fiJK_grid_atom_diagonal) + + d2F = d2F.transpose(1,2,3,4,0) + d2A = d2A.transpose(1,2,3,4,0) + return d2F, d2A + +def get_d2Sii(surface, dF, d2F, stream=None): + ''' Second derivative of S matrix (diagonal only) + ''' + charge_exp = surface['charge_exp'] + switch_fun = surface['switch_fun'] + ngrids = switch_fun.shape[0] + dF = dF.transpose(2,0,1) + natm = dF.shape[0] + assert dF.shape == (natm, 3, ngrids) + + # dF_dF = dF[:, cupy.newaxis, :, cupy.newaxis, :] * dF[cupy.newaxis, :, cupy.newaxis, :, :] + # dF_dF_over_F3 = dF_dF * (1.0/(switch_fun**3)) + # d2F_over_F2 = d2F * (1.0/(switch_fun**2)) + # d2Sii = 2 * dF_dF_over_F3 - d2F_over_F2 + # d2Sii = (2.0/PI)**0.5 * (d2Sii * 
charge_exp) + + dF = dF.flatten() # Make sure the underlying data order is the same as shape shows + d2F = d2F.flatten() # Make sure the underlying data order is the same as shape shows + d2Sii = cupy.empty((natm, natm, 3, 3, ngrids), dtype=cupy.float64) + if stream is None: + stream = cupy.cuda.get_current_stream() + err = libsolvent.pcm_d2f_to_d2sii( + ctypes.cast(stream.ptr, ctypes.c_void_p), + ctypes.cast(switch_fun.data.ptr, ctypes.c_void_p), + ctypes.cast(dF.data.ptr, ctypes.c_void_p), + ctypes.cast(d2F.data.ptr, ctypes.c_void_p), + ctypes.cast(charge_exp.data.ptr, ctypes.c_void_p), + ctypes.cast(d2Sii.data.ptr, ctypes.c_void_p), + ctypes.c_int(natm), + ctypes.c_int(ngrids), + ) + if err != 0: + raise RuntimeError('Failed in converting PCM d2F to d2Sii.') + return d2Sii + +def get_d2D_d2S(surface, with_S=True, with_D=False, stream=None): + ''' Second derivatives of D matrix and S matrix (offdiagonals only) + ''' + charge_exp = surface['charge_exp'] + grid_coords = surface['grid_coords'] + norm_vec = surface['norm_vec'] + n = charge_exp.shape[0] + d2S = cupy.empty([3,3,n,n]) + d2D = None + d2S_ptr = ctypes.cast(d2S.data.ptr, ctypes.c_void_p) + d2D_ptr = pyscf_lib.c_null_ptr() + if with_D: + d2D = cupy.empty([3,3,n,n]) + d2D_ptr = ctypes.cast(d2D.data.ptr, ctypes.c_void_p) + if stream is None: + stream = cupy.cuda.get_current_stream() + err = libsolvent.pcm_d2d_d2s( + ctypes.cast(stream.ptr, ctypes.c_void_p), + d2D_ptr, d2S_ptr, + ctypes.cast(grid_coords.data.ptr, ctypes.c_void_p), + ctypes.cast(norm_vec.data.ptr, ctypes.c_void_p), + ctypes.cast(charge_exp.data.ptr, ctypes.c_void_p), + ctypes.c_int(n) + ) + if err != 0: + raise RuntimeError('Failed in generating PCM d2D and d2S matrices.') + return d2D, d2S + +def analytical_hess_nuc(pcmobj, dm, verbose=None): if not pcmobj._intermediates: pcmobj.build() + dm_cache = pcmobj._intermediates.get('dm', None) + if dm_cache is not None and cupy.linalg.norm(dm_cache - dm) < 1e-10: + pass + else: + pcmobj._get_vind(dm) mol = pcmobj.mol + log = logger.new_logger(pcmobj, verbose) + t1 = log.init_timer() + q_sym = pcmobj._intermediates['q_sym'].get() gridslice = pcmobj.surface['gslice_by_atom'] grid_coords = pcmobj.surface['grid_coords'].get() exponents = pcmobj.surface['charge_exp'].get() + ngrids = q_sym.shape[0] + atom_coords = mol.atom_coords(unit='B') atom_charges = numpy.asarray(mol.atom_charges(), dtype=numpy.float64) fakemol_nuc = gto.fakemol_for_charges(atom_coords) fakemol = gto.fakemol_for_charges(grid_coords, expnt=exponents**2) - # nuclei potential response + d2e_from_d2I = numpy.zeros([mol.natm, mol.natm, 3, 3]) + int2c2e_ip1ip2 = mol._add_suffix('int2c2e_ip1ip2') - v_ng_ip1ip2 = gto.mole.intor_cross(int2c2e_ip1ip2, fakemol_nuc, fakemol).reshape([3,3,mol.natm,-1]) - dv_g = numpy.einsum('n,xyng->ngxy', atom_charges, v_ng_ip1ip2) - dv_g = numpy.einsum('ngxy,g->ngxy', dv_g, q_sym) + d2I_dAdC = gto.mole.intor_cross(int2c2e_ip1ip2, fakemol_nuc, fakemol) + d2I_dAdC = d2I_dAdC.reshape(3, 3, mol.natm, ngrids) + for i_atom in range(mol.natm): + g0,g1 = gridslice[i_atom] + d2e_from_d2I[:, i_atom, :, :] += numpy.einsum('A,dDAq,q->AdD', atom_charges, d2I_dAdC[:, :, :, g0:g1], q_sym[g0:g1]) + d2e_from_d2I[i_atom, :, :, :] += numpy.einsum('A,dDAq,q->AdD', atom_charges, d2I_dAdC[:, :, :, g0:g1], q_sym[g0:g1]) - de = numpy.zeros([mol.natm, mol.natm, 3, 3]) - for ia in range(mol.natm): - p0, p1 = gridslice[ia] - de_tmp = numpy.sum(dv_g[:,p0:p1], axis=1) - de[:,ia] -= de_tmp - #de[ia,:] -= de_tmp.transpose([0,2,1]) + int2c2e_ipip1 = 
+    int2c2e_ipip1 = mol._add_suffix('int2c2e_ipip1')
+    # # Some explanations here:
+    # # Why can we use ip1ip2 here? Because of the translational invariance:
+    # # $\frac{\partial^2 I_{AC}}{\partial A^2} + \frac{\partial^2 I_{AC}}{\partial A \partial C} = 0$
+    # # Why not use ipip1 here? Because each nucleus, a point charge, is handled as a
+    # # Gaussian charge with exponent = 1e16. This causes severe numerical problems in
+    # # int2c2e_ipip1 and makes the main diagonal of the Hessian garbage.
+    # d2I_dA2 = gto.mole.intor_cross(int2c2e_ipip1, fakemol_nuc, fakemol)
+    d2I_dA2 = -gto.mole.intor_cross(int2c2e_ip1ip2, fakemol_nuc, fakemol)
+    d2I_dA2 = d2I_dA2 @ q_sym
+    d2I_dA2 = d2I_dA2.reshape(3, 3, mol.natm)
+    for i_atom in range(mol.natm):
+        d2e_from_d2I[i_atom, i_atom, :, :] += atom_charges[i_atom] * d2I_dA2[:, :, i_atom]
+
+    d2I_dC2 = gto.mole.intor_cross(int2c2e_ipip1, fakemol, fakemol_nuc)
+    d2I_dC2 = d2I_dC2 @ atom_charges
+    d2I_dC2 = d2I_dC2.reshape(3, 3, ngrids)
+    for i_atom in range(mol.natm):
+        g0,g1 = gridslice[i_atom]
+        d2e_from_d2I[i_atom, i_atom, :, :] += d2I_dC2[:, :, g0:g1] @ q_sym[g0:g1]
+    intopt_derivative = int3c1e.VHFOpt(mol)
+    intopt_derivative.build(cutoff = 1e-14, aosym = False)
-    int2c2e_ip1ip2 = mol._add_suffix('int2c2e_ip1ip2')
-    v_ng_ip1ip2 = gto.mole.intor_cross(int2c2e_ip1ip2, fakemol, fakemol_nuc).reshape([3,3,-1,mol.natm])
-    dv_g = numpy.einsum('n,xygn->gnxy', atom_charges, v_ng_ip1ip2)
-    dv_g = numpy.einsum('gnxy,g->gnxy', dv_g, q_sym)
+    dqdx = get_dqsym_dx(pcmobj, dm, range(mol.natm), intopt_derivative)
+    dqdx = dqdx.get()
-    for ia in range(mol.natm):
-        p0, p1 = gridslice[ia]
-        de_tmp = numpy.sum(dv_g[p0:p1], axis=0)
-        de[ia,:] -= de_tmp
-        #de[ia,:] -= de_tmp.transpose([0,2,1])
+    d2e_from_dIdq = numpy.zeros([mol.natm, mol.natm, 3, 3])
+    for i_atom in range(mol.natm):
+        for i_xyz in range(3):
+            d2e_from_dIdq[i_atom, :, i_xyz, :] = grad_nuc(pcmobj, dm, q_sym = dqdx[i_atom, i_xyz, :])
-    int2c2e_ipip1 = mol._add_suffix('int2c2e_ipip1')
-    v_ng_ipip1 = gto.mole.intor_cross(int2c2e_ipip1, fakemol_nuc, fakemol).reshape([3,3,mol.natm,-1])
-    dv_g = numpy.einsum('g,xyng->nxy', q_sym, v_ng_ipip1)
-    for ia in range(mol.natm):
-        de[ia,ia] -= dv_g[ia] * atom_charges[ia]
-
-    v_ng_ipip1 = gto.mole.intor_cross(int2c2e_ipip1, fakemol, fakemol_nuc).reshape([3,3,-1,mol.natm])
-    dv_g = numpy.einsum('n,xygn->gxy', atom_charges, v_ng_ipip1)
-    dv_g = numpy.einsum('g,gxy->gxy', q_sym, dv_g)
-    for ia in range(mol.natm):
-        p0, p1 = gridslice[ia]
-        de[ia,ia] -= numpy.sum(dv_g[p0:p1], axis=0)
-
-    return de
-
-def hess_qv(pcmobj, dm, verbose=None):
-    raise NotImplementedError("PCM analytical hessian is not tested")
-    if not pcmobj._intermediates or 'q_sym' not in pcmobj._intermediates:
-        pcmobj._get_vind(dm)
-    gridslice = pcmobj.surface['gslice_by_atom']
-    q_sym = pcmobj._intermediates['q_sym']
+    d2e = d2e_from_d2I - d2e_from_dIdq
-    intopt = pcmobj.intopt
-    intopt.clear()
-    # rebuild with aosym
-    intopt.build(1e-14, diag_block_with_triu=True, aosym=False)
-    coeff = intopt.coeff
-    dm_cart = coeff @ dm @ coeff.T
-    #dm_cart = cupy.einsum('pi,ij,qj->pq', coeff, dm, coeff)
-
-    dvj, _ = int3c2e.get_int3c2e_ipip1_hjk(intopt, q_sym, None, dm_cart, with_k=False)
-    dq, _ = int3c2e.get_int3c2e_ipvip1_hjk(intopt, q_sym, None, dm_cart, with_k=False)
-    dvj, _ = int3c2e.get_int3c2e_ip1ip2_hjk(intopt, q_sym, None, dm_cart, with_k=False)
-    dq, _ = int3c2e.get_int3c2e_ipip2_hjk(intopt, q_sym, None, dm_cart, with_k=False)
-
-    cart_ao_idx = intopt.cart_ao_idx
-    rev_cart_ao_idx = numpy.argsort(cart_ao_idx)
-    dvj = 
dvj[:,rev_cart_ao_idx] - - aoslice = intopt.mol.aoslice_by_atom() - dq = cupy.asarray([cupy.sum(dq[:,p0:p1], axis=1) for p0,p1 in gridslice]) - dvj= 2.0 * cupy.asarray([cupy.sum(dvj[:,p0:p1], axis=1) for p0,p1 in aoslice[:,2:]]) - de = dq + dvj - return de.get() - -def hess_elec(pcmobj, dm, verbose=None): - ''' - slow version with finite difference - TODO: use analytical hess_nuc - ''' + t1 = log.timer_debug1('solvent hessian d(dVnuc/dx * q)/dx contribution', *t1) + return d2e + +def analytical_hess_qv(pcmobj, dm, verbose=None): + if not pcmobj._intermediates: + pcmobj.build() + dm_cache = pcmobj._intermediates.get('dm', None) + if dm_cache is not None and cupy.linalg.norm(dm_cache - dm) < 1e-10: + pass + else: + pcmobj._get_vind(dm) + mol = pcmobj.mol log = logger.new_logger(pcmobj, verbose) t1 = log.init_timer() - pmol = pcmobj.mol.copy() - mol = pmol.copy() - coords = mol.atom_coords(unit='Bohr') - - def pcm_grad_scanner(mol): - # TODO: use more analytical forms - pcmobj.reset(mol) - e, v = pcmobj._get_vind(dm) - #return grad_elec(pcmobj, dm) - pcm_grad = grad_nuc(pcmobj, dm) - pcm_grad+= grad_solver(pcmobj, dm) - pcm_grad+= grad_qv(pcmobj, dm) - return pcm_grad - - mol.verbose = 0 - de = numpy.zeros([mol.natm, mol.natm, 3, 3]) - eps = 1e-3 - for ia in range(mol.natm): - for ix in range(3): - dv = numpy.zeros_like(coords) - dv[ia,ix] = eps - mol.set_geom_(coords + dv, unit='Bohr') - g0 = pcm_grad_scanner(mol) - - mol.set_geom_(coords - dv, unit='Bohr') - g1 = pcm_grad_scanner(mol) - de[ia,:,ix] = (g0 - g1)/2.0/eps - t1 = log.timer_debug1('solvent energy', *t1) - pcmobj.reset(pmol) - return de - -def get_dqsym_dx_fix_vgrids(pcmobj, atmlst, inverse_K): + + gridslice = pcmobj.surface['gslice_by_atom'] + charge_exp = pcmobj.surface['charge_exp'] + grid_coords = pcmobj.surface['grid_coords'] + q_sym = pcmobj._intermediates['q_sym'] + + aoslice = mol.aoslice_by_atom() + aoslice = numpy.array(aoslice) + + intopt_derivative = int3c1e.VHFOpt(mol) + intopt_derivative.build(cutoff = 1e-14, aosym = False) + + # fakemol = gto.fakemol_for_charges(grid_coords.get(), expnt=charge_exp.get()**2) + # intopt = int3c2e.VHFOpt(mol, fakemol, 'int2e') + # intopt.build(1e-14, diag_block_with_triu=True, aosym=False) + + d2e_from_d2I = cupy.zeros([mol.natm, mol.natm, 3, 3]) + + # d2I_dA2 = int3c2e.get_int3c2e_general(mol, fakemol, ip_type='ipip1', direct_scf_tol=1e-14) + # d2I_dA2 = cupy.einsum('dijq,q->dij', d2I_dA2, q_sym) + # d2I_dA2 = d2I_dA2.reshape([3, 3, nao, nao]) + d2I_dA2 = int1e_grids_ipip1(mol, grid_coords, charges = q_sym, intopt = intopt_derivative, charge_exponents = charge_exp**2) + for i_atom in range(mol.natm): + p0,p1 = aoslice[i_atom, 2:] + d2e_from_d2I[i_atom, i_atom, :, :] += cupy.einsum('ij,dDij->dD', dm[p0:p1, :], d2I_dA2[:, :, p0:p1, :]) + d2e_from_d2I[i_atom, i_atom, :, :] += cupy.einsum('ij,dDij->dD', dm[:, p0:p1], d2I_dA2[:, :, p0:p1, :].transpose(0,1,3,2)) + d2I_dA2 = None + + # d2I_dAdB = int3c2e.get_int3c2e_general(mol, fakemol, ip_type='ipvip1', direct_scf_tol=1e-14) + # d2I_dAdB = cupy.einsum('dijq,q->dij', d2I_dAdB, q_sym) + # d2I_dAdB = d2I_dAdB.reshape([3, 3, nao, nao]) + d2I_dAdB = int1e_grids_ipvip1(mol, grid_coords, charges = q_sym, intopt = intopt_derivative, charge_exponents = charge_exp**2) + for i_atom in range(mol.natm): + pi0,pi1 = aoslice[i_atom, 2:] + for j_atom in range(mol.natm): + pj0,pj1 = aoslice[j_atom, 2:] + d2e_from_d2I[i_atom, j_atom, :, :] += cupy.einsum('ij,dDij->dD', dm[pi0:pi1, pj0:pj1], d2I_dAdB[:, :, pi0:pi1, pj0:pj1]) + d2e_from_d2I[i_atom, j_atom, :, 
:] += cupy.einsum('ij,dDij->dD', dm[pj0:pj1, pi0:pi1], d2I_dAdB[:, :, pi0:pi1, pj0:pj1].transpose(0,1,3,2)) + d2I_dAdB = None + + for j_atom in range(mol.natm): + g0,g1 = gridslice[j_atom] + # d2I_dAdC = int3c2e.get_int3c2e_general(mol, fakemol, ip_type='ip1ip2', direct_scf_tol=1e-14) + # d2I_dAdC = cupy.einsum('dijq,q->dij', d2I_dAdC[:, :, :, g0:g1], q_sym[g0:g1]) + # d2I_dAdC = d2I_dAdC.reshape([3, 3, nao, nao]) + d2I_dAdC = int1e_grids_ip1ip2(mol, grid_coords[g0:g1, :], charges = q_sym[g0:g1], intopt = intopt_derivative, charge_exponents = charge_exp[g0:g1]**2) + + for i_atom in range(mol.natm): + p0,p1 = aoslice[i_atom, 2:] + d2e_from_d2I[i_atom, j_atom, :, :] += cupy.einsum('ij,dDij->dD', dm[p0:p1, :], d2I_dAdC[:, :, p0:p1, :]) + d2e_from_d2I[i_atom, j_atom, :, :] += cupy.einsum('ij,dDij->dD', dm[:, p0:p1], d2I_dAdC[:, :, p0:p1, :].transpose(0,1,3,2)) + + d2e_from_d2I[j_atom, i_atom, :, :] += cupy.einsum('ij,dDij->dD', dm[p0:p1, :], d2I_dAdC[:, :, p0:p1, :].transpose(1,0,2,3)) + d2e_from_d2I[j_atom, i_atom, :, :] += cupy.einsum('ij,dDij->dD', dm[:, p0:p1], d2I_dAdC[:, :, p0:p1, :].transpose(1,0,3,2)) + d2I_dAdC = None + + # d2I_dC2 = int3c2e.get_int3c2e_general(mol, fakemol, ip_type='ipip2', direct_scf_tol=1e-14) + # d2I_dC2 = cupy.einsum('dijq,ij->dq', d2I_dC2, dm) + # d2I_dC2 = d2I_dC2.reshape([3, 3, ngrids]) + d2I_dC2 = int1e_grids_ipip2(mol, grid_coords, dm = dm, intopt = intopt_derivative, charge_exponents = charge_exp**2) + for i_atom in range(mol.natm): + g0,g1 = gridslice[i_atom] + d2e_from_d2I[i_atom, i_atom, :, :] += d2I_dC2[:, :, g0:g1] @ q_sym[g0:g1] + d2I_dC2 = None + + dqdx = get_dqsym_dx(pcmobj, dm, range(mol.natm), intopt_derivative) + + d2e_from_dIdq = numpy.zeros([mol.natm, mol.natm, 3, 3]) + for i_atom in range(mol.natm): + for i_xyz in range(3): + d2e_from_dIdq[i_atom, :, i_xyz, :] = grad_qv(pcmobj, dm, q_sym = dqdx[i_atom, i_xyz, :]) + + d2e_from_d2I = d2e_from_d2I.get() + d2e = d2e_from_d2I + d2e_from_dIdq + d2e *= -1 + + t1 = log.timer_debug1('solvent hessian d(dI/dx * q)/dx contribution', *t1) + return d2e + +def einsum_ij_Adj_Adi_inverseK(K, Adj_term): + nA, nd, nj = Adj_term.shape + # return cupy.einsum('ij,Adj->Adi', cupy.linalg.inv(K), Adj_term) + return cupy.linalg.solve(K, Adj_term.reshape(nA * nd, nj).T).T.reshape(nA, nd, nj) +def einsum_Adi_ij_Adj_inverseK(Adi_term, K): + nA, nd, nj = Adi_term.shape + # return cupy.einsum('Adi,ij->Adj', Adi_term, cupy.linalg.inv(K)) + return cupy.linalg.solve(K.T, Adi_term.reshape(nA * nd, nj).T).T.reshape(nA, nd, nj) + +def get_dS_dot_q(dS, dSii, q, atmlst, gridslice): + output = cupy.einsum('diA,i->Adi', dSii[:,:,atmlst], q) + for i_atom in atmlst: + g0,g1 = gridslice[i_atom] + output[i_atom, :, g0:g1] += dS[:,g0:g1,:] @ q + output[i_atom, :, :] -= dS[:,:,g0:g1] @ q[g0:g1] + return output +def get_dST_dot_q(dS, dSii, q, atmlst, gridslice): + # S is symmetric + return get_dS_dot_q(dS, dSii, q, atmlst, gridslice) + +def get_dA_dot_q(dA, q, atmlst): + return cupy.einsum('diA,i->Adi', dA[:,:,atmlst], q) + +def get_dD_dot_q(dD, q, atmlst, gridslice, ngrids): + output = cupy.zeros([len(atmlst), 3, ngrids]) + for i_atom in atmlst: + g0,g1 = gridslice[i_atom] + output[i_atom, :, g0:g1] += dD[:,g0:g1,:] @ q + output[i_atom, :, :] -= dD[:,:,g0:g1] @ q[g0:g1] + return output +def get_dDT_dot_q(dD, q, atmlst, gridslice, ngrids): + return get_dD_dot_q(-dD.transpose(0,2,1), q, atmlst, gridslice, ngrids) + +def get_v_dot_d2S_dot_q(d2S, d2Sii, v_left, q_right, natom, gridslice): + output = d2Sii @ (v_left * q_right) + for i_atom in 
range(natom): + gi0,gi1 = gridslice[i_atom] + for j_atom in range(natom): + gj0,gj1 = gridslice[j_atom] + d2S_atom_ij = cupy.einsum('q,dDq->dD', v_left[gi0:gi1], d2S[:,:,gi0:gi1,gj0:gj1] @ q_right[gj0:gj1]) + output[i_atom, i_atom, :, :] += d2S_atom_ij + output[j_atom, j_atom, :, :] += d2S_atom_ij + output[i_atom, j_atom, :, :] -= d2S_atom_ij + output[j_atom, i_atom, :, :] -= d2S_atom_ij + return output +def get_v_dot_d2ST_dot_q(d2S, d2Sii, v_left, q_right, natom, gridslice): + # S is symmetric + return get_v_dot_d2S_dot_q(d2S, d2Sii, v_left, q_right, natom, gridslice) + +def get_v_dot_d2A_dot_q(d2A, v_left, q_right): + return d2A @ (v_left * q_right) + +def get_v_dot_d2D_dot_q(d2D, v_left, q_right, natom, gridslice): + output = cupy.zeros([natom, natom, 3, 3]) + for i_atom in range(natom): + gi0,gi1 = gridslice[i_atom] + for j_atom in range(natom): + gj0,gj1 = gridslice[j_atom] + d2D_atom_ij = cupy.einsum('q,dDq->dD', v_left[gi0:gi1], d2D[:,:,gi0:gi1,gj0:gj1] @ q_right[gj0:gj1]) + output[i_atom, i_atom, :, :] += d2D_atom_ij + output[j_atom, j_atom, :, :] += d2D_atom_ij + output[i_atom, j_atom, :, :] -= d2D_atom_ij + output[j_atom, i_atom, :, :] -= d2D_atom_ij + return output +def get_v_dot_d2DT_dot_q(d2D, v_left, q_right, natom, gridslice): + return get_v_dot_d2D_dot_q(d2D.transpose(0,1,3,2), v_left, q_right, natom, gridslice) + +def analytical_hess_solver(pcmobj, dm, verbose=None): + if not pcmobj._intermediates: + pcmobj.build() + dm_cache = pcmobj._intermediates.get('dm', None) + if dm_cache is not None and cupy.linalg.norm(dm_cache - dm) < 1e-10: + pass + else: + pcmobj._get_vind(dm) + mol = pcmobj.mol + log = logger.new_logger(mol, verbose) + t1 = log.init_timer() + + natom = mol.natm + atmlst = range(natom) # Attention: This cannot be split + + gridslice = pcmobj.surface['gslice_by_atom'] + v_grids = pcmobj._intermediates['v_grids'] + A = pcmobj._intermediates['A'] + D = pcmobj._intermediates['D'] + S = pcmobj._intermediates['S'] + K = pcmobj._intermediates['K'] + R = pcmobj._intermediates['R'] + q = pcmobj._intermediates['q'] + f_epsilon = pcmobj._intermediates['f_epsilon'] + + ngrids = q.shape[0] + + vK_1 = cupy.linalg.solve(K.T, v_grids) + + if pcmobj.method.upper() in ['C-PCM', 'CPCM', 'COSMO']: + _, dS = get_dD_dS(pcmobj.surface, with_D=False, with_S=True) + dF, _ = get_dF_dA(pcmobj.surface) + dSii = get_dSii(pcmobj.surface, dF) + + # dR = 0, dK = dS + # d(S-1 R) = - S-1 dS S-1 R + # d2(S-1 R) = (S-1 dS S-1 dS S-1 R) + (S-1 dS S-1 dS S-1 R) - (S-1 d2S S-1 R) + dSdx_dot_q = get_dS_dot_q(dS, dSii, q, atmlst, gridslice) + S_1_dSdx_dot_q = einsum_ij_Adj_Adi_inverseK(K, dSdx_dot_q) + dSdx_dot_q = None + VS_1_dot_dSdx = get_dST_dot_q(dS, dSii, vK_1, atmlst, gridslice) + dS = None + dSii = None + d2e_from_d2KR = cupy.einsum('Adi,BDi->ABdD', VS_1_dot_dSdx, S_1_dSdx_dot_q) * 2 + + _, d2S = get_d2D_d2S(pcmobj.surface, with_D=False, with_S=True) + d2F, _ = get_d2F_d2A(pcmobj.surface) + d2Sii = get_d2Sii(pcmobj.surface, dF, d2F) + dF = None + d2F = None + d2e_from_d2KR -= get_v_dot_d2S_dot_q(d2S, d2Sii, vK_1, q, natom, gridslice) + d2S = None + d2Sii = None + + dK_1Rv = -S_1_dSdx_dot_q + dvK_1R = -einsum_Adi_ij_Adj_inverseK(VS_1_dot_dSdx, K) @ R + + elif pcmobj.method.upper() in ['IEF-PCM', 'IEFPCM', 'SMD']: + dD, dS = get_dD_dS(pcmobj.surface, with_D=True, with_S=True) + dF, dA = get_dF_dA(pcmobj.surface) + dSii = get_dSii(pcmobj.surface, dF) + + # dR = f_eps/(2*pi) * (dD*A + D*dA) + # dK = dS - f_eps/(2*pi) * (dD*A*S + D*dA*S + D*A*dS) + + # d2R = f_eps/(2*pi) * (d2D*A + dD*dA + dD*dA + 
D*d2A)
+        # d2K = d2S - f_eps/(2*pi) * (d2D*A*S + D*d2A*S + D*A*d2S + dD*dA*S + dD*dA*S + dD*A*dS + dD*A*dS + D*dA*dS + D*dA*dS)
+        # The terms that show up twice in the equation above (dD*dA + dD*dA, for example)
+        # refer to dD/dx * dA/dy + dD/dy * dA/dx; since D is not symmetric, the two are not the same.
+
+        # d(K-1 R) = - K-1 dK K-1 R + K-1 dR
+        # d2(K-1 R) = (K-1 dK K-1 dK K-1 R) + (K-1 dK K-1 dK K-1 R) - (K-1 d2K K-1 R) - (K-1 dK K-1 dR)
+        #             - (K-1 dK K-1 dR) + (K-1 d2R)
+        f_eps_over_2pi = f_epsilon/(2.0*PI)
+
+        dSdx_dot_q = get_dS_dot_q(dS, dSii, q, atmlst, gridslice)
+        DA = D*A
+        dKdx_dot_q = dSdx_dot_q - f_eps_over_2pi * cupy.einsum('ij,Adj->Adi', DA, dSdx_dot_q)
+        dAdx_dot_Sq = get_dA_dot_q(dA, S @ q, atmlst)
+        dKdx_dot_q -= f_eps_over_2pi * cupy.einsum('ij,Adj->Adi', D, dAdx_dot_Sq)
+        AS = (A * S.T).T # It's just diag(A) @ S
+        ASq = AS @ q
+        dDdx_dot_ASq = get_dD_dot_q(dD, ASq, atmlst, gridslice, ngrids)
+        dKdx_dot_q -= f_eps_over_2pi * dDdx_dot_ASq
+        dDdx_dot_ASq = None
+
+        K_1_dot_dKdx_dot_q = einsum_ij_Adj_Adi_inverseK(K, dKdx_dot_q)
+        dKdx_dot_q = None
+
+        vK_1_dot_dSdx = get_dST_dot_q(dS, dSii, vK_1, atmlst, gridslice)
+        vK_1_dot_dKdx = vK_1_dot_dSdx
+        vK_1_dot_dSdx = None
+        vK_1_dot_dDdx = get_dDT_dot_q(dD, vK_1, atmlst, gridslice, ngrids)
+        vK_1_dot_dKdx -= f_eps_over_2pi * cupy.einsum('ij,Adj->Adi', AS.T, vK_1_dot_dDdx)
+        AS = None
+        vK_1D = D.T @ vK_1
+        vK_1D_dot_dAdx = get_dA_dot_q(dA, vK_1D, atmlst)
+        vK_1_dot_dKdx -= f_eps_over_2pi * cupy.einsum('ij,Adj->Adi', S.T, vK_1D_dot_dAdx)
+        vK_1DA = DA.T @ vK_1
+        DA = None
+        vK_1DA_dot_dSdx = get_dST_dot_q(dS, dSii, vK_1DA, atmlst, gridslice)
+        dS = None
+        dSii = None
+        vK_1_dot_dKdx -= f_eps_over_2pi * vK_1DA_dot_dSdx
+        vK_1DA_dot_dSdx = None
+
+        d2e_from_d2KR = cupy.einsum('Adi,BDi->ABdD', vK_1_dot_dKdx, K_1_dot_dKdx_dot_q)
+        d2e_from_d2KR += cupy.einsum('Adi,BDi->BADd', vK_1_dot_dKdx, K_1_dot_dKdx_dot_q)
+
+        d2F, d2A = get_d2F_d2A(pcmobj.surface)
+        vK_1_d2K_q = get_v_dot_d2A_dot_q(d2A, vK_1D, S @ q)
+        vK_1_d2R_V = get_v_dot_d2A_dot_q(d2A, vK_1D, v_grids)
+        d2A = None
+        d2Sii = get_d2Sii(pcmobj.surface, dF, d2F)
+        dF = None
+        d2F = None
+        d2D, d2S = get_d2D_d2S(pcmobj.surface, with_D=True, with_S=True)
+        vK_1_d2K_q += get_v_dot_d2D_dot_q(d2D, vK_1, ASq, natom, gridslice)
+        vK_1_d2R_V += get_v_dot_d2D_dot_q(d2D, vK_1, A * v_grids, natom, gridslice)
+        d2D = None
+        vK_1_d2K_q += get_v_dot_d2S_dot_q(d2S, d2Sii, vK_1DA, q, natom, gridslice)
+        vK_1_d2K_q += cupy.einsum('Adi,BDi->ABdD', vK_1_dot_dDdx, dAdx_dot_Sq)
+        vK_1_d2K_q += cupy.einsum('Adi,BDi->ABdD', vK_1_dot_dDdx * A, dSdx_dot_q)
+        vK_1_d2K_q += cupy.einsum('Adi,BDi->ABdD', vK_1D_dot_dAdx, dSdx_dot_q)
+        vK_1_d2K_q += cupy.einsum('Adi,BDi->BADd', vK_1_dot_dDdx, dAdx_dot_Sq)
+        vK_1_d2K_q += cupy.einsum('Adi,BDi->BADd', vK_1_dot_dDdx * A, dSdx_dot_q)
+        vK_1_d2K_q += cupy.einsum('Adi,BDi->BADd', vK_1D_dot_dAdx, dSdx_dot_q)
+        vK_1_d2K_q *= -f_eps_over_2pi
+        vK_1_d2K_q += get_v_dot_d2S_dot_q(d2S, d2Sii, vK_1, q, natom, gridslice)
+        d2S = None
+        d2Sii = None
+
+        d2e_from_d2KR -= vK_1_d2K_q
+
+        dAdx_dot_V = get_dA_dot_q(dA, v_grids, atmlst)
+        dDdx_dot_AV = get_dD_dot_q(dD, A * v_grids, atmlst, gridslice, ngrids)
+        dRdx_dot_V = f_eps_over_2pi * (dDdx_dot_AV + cupy.einsum('ij,Adj->Adi', D, dAdx_dot_V))
+        dDdx_dot_AV = None
+
+        K_1_dot_dRdx_dot_V = einsum_ij_Adj_Adi_inverseK(K, dRdx_dot_V)
+        dRdx_dot_V = None
+
+        d2e_from_d2KR -= cupy.einsum('Adi,BDi->ABdD', vK_1_dot_dKdx, K_1_dot_dRdx_dot_V)
+        d2e_from_d2KR -= cupy.einsum('Adi,BDi->BADd', vK_1_dot_dKdx, K_1_dot_dRdx_dot_V)
+
+        vK_1_d2R_V += 
cupy.einsum('Adi,BDi->ABdD', vK_1_dot_dDdx, dAdx_dot_V) + vK_1_d2R_V += cupy.einsum('Adi,BDi->BADd', vK_1_dot_dDdx, dAdx_dot_V) + vK_1_d2R_V *= f_eps_over_2pi + + d2e_from_d2KR += vK_1_d2R_V + + dK_1Rv = -K_1_dot_dKdx_dot_q + K_1_dot_dRdx_dot_V + + VK_1D_dot_dAdx = get_dA_dot_q(dA, (D.T @ vK_1).T, atmlst) + VK_1_dot_dDdx = get_dDT_dot_q(dD, vK_1, atmlst, gridslice, ngrids) + VK_1_dot_dRdx = f_eps_over_2pi * (VK_1D_dot_dAdx + VK_1_dot_dDdx * A) + + dvK_1R = -einsum_Adi_ij_Adj_inverseK(vK_1_dot_dKdx, K) @ R + VK_1_dot_dRdx + + elif pcmobj.method.upper() in ['SS(V)PE']: + dD, dS = get_dD_dS(pcmobj.surface, with_D=True, with_S=True) + dF, dA = get_dF_dA(pcmobj.surface) + dSii = get_dSii(pcmobj.surface, dF) + + # dR = f_eps/(2*pi) * (dD*A + D*dA) + # dK = dS - f_eps/(4*pi) * (dD*A*S + D*dA*S + D*A*dS + dST*AT*DT + ST*dAT*DT + ST*AT*dDT) + + # d2R = f_eps/(2*pi) * (d2D*A + dD*dA + dD*dA + D*d2A) + # d2K = d2S - f_eps/(4*pi) * (d2D*A*S + D*d2A*S + D*A*d2S + dD*dA*S + dD*dA*S + dD*A*dS + dD*A*dS + D*dA*dS + D*dA*dS + # + d2ST*AT*DT + ST*d2AT*DT + ST*AT*d2DT + dST*dAT*DT + dST*dAT*DT + dST*AT*dDT + dST*AT*dDT + ST*dAT*dDT + ST*dAT*dDT) + f_eps_over_2pi = f_epsilon/(2.0*PI) + f_eps_over_4pi = f_epsilon/(4.0*PI) + + dSdx_dot_q = get_dS_dot_q(dS, dSii, q, atmlst, gridslice) + DA = D*A + dKdx_dot_q = dSdx_dot_q - f_eps_over_4pi * cupy.einsum('ij,Adj->Adi', DA, dSdx_dot_q) + dAdx_dot_Sq = get_dA_dot_q(dA, S @ q, atmlst) + dKdx_dot_q -= f_eps_over_4pi * cupy.einsum('ij,Adj->Adi', D, dAdx_dot_Sq) + AS = (A * S.T).T # It's just diag(A) @ S + ASq = AS @ q + dDdx_dot_ASq = get_dD_dot_q(dD, ASq, atmlst, gridslice, ngrids) + dKdx_dot_q -= f_eps_over_4pi * dDdx_dot_ASq + dDdx_dot_ASq = None + dDdxT_dot_q = get_dDT_dot_q(dD, q, atmlst, gridslice, ngrids) + dKdx_dot_q -= f_eps_over_4pi * cupy.einsum('ij,Adj->Adi', AS.T, dDdxT_dot_q) + dAdxT_dot_DT_q = get_dA_dot_q(dA, D.T @ q, atmlst) + dKdx_dot_q -= f_eps_over_4pi * cupy.einsum('ij,Adj->Adi', S.T, dAdxT_dot_DT_q) + AT_DT_q = DA.T @ q + dSdxT_dot_AT_DT_q = get_dS_dot_q(dS, dSii, AT_DT_q, atmlst, gridslice) + dKdx_dot_q -= f_eps_over_4pi * dSdxT_dot_AT_DT_q + dSdxT_dot_AT_DT_q = None + + K_1_dot_dKdx_dot_q = einsum_ij_Adj_Adi_inverseK(K, dKdx_dot_q) + dKdx_dot_q = None + + vK_1_dot_dSdx = get_dST_dot_q(dS, dSii, vK_1, atmlst, gridslice) + vK_1_dot_dKdx = vK_1_dot_dSdx + vK_1_dot_dSdx = None + vK_1_dot_dDdx = get_dDT_dot_q(dD, vK_1, atmlst, gridslice, ngrids) + vK_1_dot_dKdx -= f_eps_over_4pi * cupy.einsum('ij,Adj->Adi', AS.T, vK_1_dot_dDdx) + vK_1D_dot_dAdx = get_dA_dot_q(dA, D.T @ vK_1, atmlst) + vK_1_dot_dKdx -= f_eps_over_4pi * cupy.einsum('ij,Adj->Adi', S.T, vK_1D_dot_dAdx) + vK_1DA = DA.T @ vK_1 + vK_1DA_dot_dSdx = get_dST_dot_q(dS, dSii, vK_1DA, atmlst, gridslice) + vK_1_dot_dKdx -= f_eps_over_4pi * vK_1DA_dot_dSdx + vK_1DA_dot_dSdx = None + vK_1_dot_dSdxT = get_dS_dot_q(dS, dSii, vK_1, atmlst, gridslice) + dS = None + dSii = None + vK_1_dot_dKdx -= f_eps_over_4pi * cupy.einsum('ij,Adj->Adi', DA, vK_1_dot_dSdxT) + DA = None + vK_1_ST_dot_dAdxT = get_dA_dot_q(dA, (S @ vK_1).T, atmlst) + vK_1_dot_dKdx -= f_eps_over_4pi * cupy.einsum('ij,Adj->Adi', D, vK_1_ST_dot_dAdxT) + vK_1_ST_AT = AS @ vK_1 + AS = None + vK_1_ST_AT_dot_dDdxT = get_dD_dot_q(dD, vK_1_ST_AT, atmlst, gridslice, ngrids) + vK_1_dot_dKdx -= f_eps_over_4pi * vK_1_ST_AT_dot_dDdxT + vK_1_ST_AT_dot_dDdxT = None + + d2e_from_d2KR = cupy.einsum('Adi,BDi->ABdD', vK_1_dot_dKdx, K_1_dot_dKdx_dot_q) + d2e_from_d2KR += cupy.einsum('Adi,BDi->BADd', vK_1_dot_dKdx, K_1_dot_dKdx_dot_q) + + d2F, d2A = 
get_d2F_d2A(pcmobj.surface) + vK_1_d2K_q = get_v_dot_d2A_dot_q(d2A, (D.T @ vK_1).T, S @ q) + vK_1_d2K_q += get_v_dot_d2A_dot_q(d2A, (S @ vK_1).T, D.T @ q) + vK_1_d2R_V = get_v_dot_d2A_dot_q(d2A, (D.T @ vK_1).T, v_grids) + d2A = None + d2Sii = get_d2Sii(pcmobj.surface, dF, d2F) + dF = None + d2F = None + d2D, d2S = get_d2D_d2S(pcmobj.surface, with_D=True, with_S=True) + vK_1_d2K_q += get_v_dot_d2D_dot_q(d2D, vK_1, ASq, natom, gridslice) + vK_1_d2K_q += get_v_dot_d2DT_dot_q(d2D, vK_1_ST_AT, q, natom, gridslice) + vK_1_d2R_V += get_v_dot_d2D_dot_q(d2D, vK_1, A * v_grids, natom, gridslice) + d2D = None + vK_1_d2K_q += get_v_dot_d2S_dot_q(d2S, d2Sii, vK_1DA, q, natom, gridslice) + vK_1_d2K_q += get_v_dot_d2ST_dot_q(d2S, d2Sii, vK_1, AT_DT_q, natom, gridslice) + vK_1_d2K_q += cupy.einsum('Adi,BDi->ABdD', vK_1_dot_dDdx, dAdx_dot_Sq) + vK_1_d2K_q += cupy.einsum('Adi,BDi->ABdD', vK_1_dot_dDdx * A, dSdx_dot_q) + vK_1_d2K_q += cupy.einsum('Adi,BDi->ABdD', vK_1D_dot_dAdx, dSdx_dot_q) + vK_1_d2K_q += cupy.einsum('Adi,BDi->ABdD', vK_1_dot_dSdxT, dAdxT_dot_DT_q) + vK_1_d2K_q += cupy.einsum('Adi,BDi->ABdD', vK_1_dot_dSdxT * A, dDdxT_dot_q) + vK_1_d2K_q += cupy.einsum('Adi,BDi->ABdD', vK_1_ST_dot_dAdxT, dDdxT_dot_q) + vK_1_d2K_q += cupy.einsum('Adi,BDi->BADd', vK_1_dot_dDdx, dAdx_dot_Sq) + vK_1_d2K_q += cupy.einsum('Adi,BDi->BADd', vK_1_dot_dDdx * A, dSdx_dot_q) + vK_1_d2K_q += cupy.einsum('Adi,BDi->BADd', vK_1D_dot_dAdx, dSdx_dot_q) + vK_1_d2K_q += cupy.einsum('Adi,BDi->BADd', vK_1_dot_dSdxT, dAdxT_dot_DT_q) + vK_1_d2K_q += cupy.einsum('Adi,BDi->BADd', vK_1_dot_dSdxT * A, dDdxT_dot_q) + vK_1_d2K_q += cupy.einsum('Adi,BDi->BADd', vK_1_ST_dot_dAdxT, dDdxT_dot_q) + vK_1_d2K_q *= -f_eps_over_4pi + vK_1_d2K_q += get_v_dot_d2S_dot_q(d2S, d2Sii, vK_1, q, natom, gridslice) + d2S = None + d2Sii = None + + d2e_from_d2KR -= vK_1_d2K_q + + dAdx_dot_V = get_dA_dot_q(dA, v_grids, atmlst) + dDdx_dot_AV = get_dD_dot_q(dD, A * v_grids, atmlst, gridslice, ngrids) + dRdx_dot_V = f_eps_over_2pi * (dDdx_dot_AV + cupy.einsum('ij,Adj->Adi', D, dAdx_dot_V)) + dDdx_dot_AV = None + + K_1_dot_dRdx_dot_V = einsum_ij_Adj_Adi_inverseK(K, dRdx_dot_V) + + d2e_from_d2KR -= cupy.einsum('Adi,BDi->ABdD', vK_1_dot_dKdx, K_1_dot_dRdx_dot_V) + d2e_from_d2KR -= cupy.einsum('Adi,BDi->BADd', vK_1_dot_dKdx, K_1_dot_dRdx_dot_V) + + vK_1_d2R_V += cupy.einsum('Adi,BDi->ABdD', vK_1_dot_dDdx, dAdx_dot_V) + vK_1_d2R_V += cupy.einsum('Adi,BDi->BADd', vK_1_dot_dDdx, dAdx_dot_V) + vK_1_d2R_V *= f_eps_over_2pi + + d2e_from_d2KR += vK_1_d2R_V + + dK_1Rv = -K_1_dot_dKdx_dot_q + K_1_dot_dRdx_dot_V + + VK_1D_dot_dAdx = get_dA_dot_q(dA, (D.T @ vK_1).T, atmlst) + VK_1_dot_dDdx = get_dDT_dot_q(dD, vK_1, atmlst, gridslice, ngrids) + VK_1_dot_dRdx = f_eps_over_2pi * (VK_1D_dot_dAdx + VK_1_dot_dDdx * A) + + dvK_1R = -einsum_Adi_ij_Adj_inverseK(vK_1_dot_dKdx, K) @ R + VK_1_dot_dRdx + + else: + raise RuntimeError(f"Unknown implicit solvent model: {pcmobj.method}") + + d2e = d2e_from_d2KR + + intopt_derivative = int3c1e.VHFOpt(mol) + intopt_derivative.build(cutoff = 1e-14, aosym = False) + + dVdx = get_dvgrids(pcmobj, dm, range(mol.natm), intopt_derivative) + d2e -= cupy.einsum('Adi,BDi->BADd', dvK_1R, dVdx) + d2e -= cupy.einsum('Adi,BDi->ABdD', dVdx, dK_1Rv) + + d2e *= 0.5 + d2e = d2e.get() + t1 = log.timer_debug1('solvent hessian d(V * dK-1R/dx * V)/dx contribution', *t1) + return d2e + +def get_dqsym_dx_fix_vgrids(pcmobj, atmlst): assert pcmobj._intermediates is not None gridslice = pcmobj.surface['gslice_by_atom'] @@ -161,35 +705,14 @@ def 
get_dqsym_dx_fix_vgrids(pcmobj, atmlst, inverse_K): A = pcmobj._intermediates['A'] D = pcmobj._intermediates['D'] S = pcmobj._intermediates['S'] + K = pcmobj._intermediates['K'] R = pcmobj._intermediates['R'] + q = pcmobj._intermediates['q'] q_sym = pcmobj._intermediates['q_sym'] f_epsilon = pcmobj._intermediates['f_epsilon'] ngrids = q_sym.shape[0] - def get_dS_dot_q(dS, dSii, q, atmlst, gridslice): - output = cupy.einsum('diA,i->Adi', dSii[:,:,atmlst], q) - for i_atom in atmlst: - g0,g1 = gridslice[i_atom] - output[i_atom, :, g0:g1] += cupy.einsum('dij,j->di', dS[:,g0:g1,:], q) - output[i_atom, :, :] -= cupy.einsum('dij,j->di', dS[:,:,g0:g1], q[g0:g1]) - return output - def get_dST_dot_q(dS, dSii, q, atmlst, gridslice): - return get_dS_dot_q(-dS.transpose(0,2,1), dSii, q, atmlst, gridslice) - - def get_dA_dot_q(dA, q, atmlst, gridslice): - return cupy.einsum('diA,i->Adi', dA[:,:,atmlst], q) - - def get_dD_dot_q(dD, q, atmlst, gridslice): - output = cupy.zeros([len(atmlst), 3, ngrids]) - for i_atom in atmlst: - g0,g1 = gridslice[i_atom] - output[i_atom, :, g0:g1] += cupy.einsum('dij,j->di', dD[:,g0:g1,:], q) - output[i_atom, :, :] -= cupy.einsum('dij,j->di', dD[:,:,g0:g1], q[g0:g1]) - return output - def get_dDT_dot_q(dD, q, atmlst, gridslice): - return get_dD_dot_q(-dD.transpose(0,2,1), q, atmlst, gridslice) - if pcmobj.method.upper() in ['C-PCM', 'CPCM', 'COSMO']: _, dS = get_dD_dS(pcmobj.surface, with_D=False, with_S=True) dF, _ = get_dF_dA(pcmobj.surface) @@ -199,7 +722,7 @@ def get_dDT_dot_q(dD, q, atmlst, gridslice): # dR = 0, dK = dS dSdx_dot_q = get_dS_dot_q(dS, dSii, q_sym, atmlst, gridslice) - dqdx_fix_Vq = cupy.einsum('ij,Adj->Adi', inverse_K, dSdx_dot_q) + dqdx_fix_Vq = einsum_ij_Adj_Adi_inverseK(K, dSdx_dot_q) elif pcmobj.method.upper() in ['IEF-PCM', 'IEFPCM', 'SMD']: dF, dA = get_dF_dA(pcmobj.surface) @@ -212,33 +735,32 @@ def get_dDT_dot_q(dD, q, atmlst, gridslice): # dK = dS - f_eps/(2*pi) * (dD*A*S + D*dA*S + D*A*dS) f_eps_over_2pi = f_epsilon/(2.0*PI) - q = inverse_K @ R @ v_grids dSdx_dot_q = get_dS_dot_q(dS, dSii, q, atmlst, gridslice) DA = D*A dKdx_dot_q = dSdx_dot_q - f_eps_over_2pi * cupy.einsum('ij,Adj->Adi', DA, dSdx_dot_q) - dAdx_dot_Sq = get_dA_dot_q(dA, S @ q, atmlst, gridslice) + dAdx_dot_Sq = get_dA_dot_q(dA, S @ q, atmlst) dKdx_dot_q -= f_eps_over_2pi * cupy.einsum('ij,Adj->Adi', D, dAdx_dot_Sq) AS = (A * S.T).T # It's just diag(A) @ S - dDdx_dot_ASq = get_dD_dot_q(dD, AS @ q, atmlst, gridslice) + dDdx_dot_ASq = get_dD_dot_q(dD, AS @ q, atmlst, gridslice, ngrids) dKdx_dot_q -= f_eps_over_2pi * dDdx_dot_ASq - dqdx_fix_Vq = -cupy.einsum('ij,Adj->Adi', inverse_K, dKdx_dot_q) + dqdx_fix_Vq = -einsum_ij_Adj_Adi_inverseK(K, dKdx_dot_q) - dAdx_dot_V = get_dA_dot_q(dA, v_grids, atmlst, gridslice) + dAdx_dot_V = get_dA_dot_q(dA, v_grids, atmlst) - dDdx_dot_AV = get_dD_dot_q(dD, A * v_grids, atmlst, gridslice) + dDdx_dot_AV = get_dD_dot_q(dD, A * v_grids, atmlst, gridslice, ngrids) dRdx_dot_V = f_eps_over_2pi * (dDdx_dot_AV + cupy.einsum('ij,Adj->Adi', D, dAdx_dot_V)) - dqdx_fix_Vq += cupy.einsum('ij,Adj->Adi', inverse_K, dRdx_dot_V) + dqdx_fix_Vq += einsum_ij_Adj_Adi_inverseK(K, dRdx_dot_V) - invKT_V = inverse_K.T @ v_grids - dDdxT_dot_invKT_V = get_dDT_dot_q(dD, invKT_V, atmlst, gridslice) + invKT_V = cupy.linalg.solve(K.T, v_grids) + dDdxT_dot_invKT_V = get_dDT_dot_q(dD, invKT_V, atmlst, gridslice, ngrids) DT_invKT_V = D.T @ invKT_V - dAdxT_dot_DT_invKT_V = get_dA_dot_q(dA, DT_invKT_V, atmlst, gridslice) + dAdxT_dot_DT_invKT_V = get_dA_dot_q(dA, DT_invKT_V, 
atmlst) dqdx_fix_Vq += f_eps_over_2pi * (cupy.einsum('i,Adi->Adi', A, dDdxT_dot_invKT_V) + dAdxT_dot_DT_invKT_V) dSdxT_dot_invKT_V = get_dST_dot_q(dS, dSii, invKT_V, atmlst, gridslice) @@ -249,8 +771,9 @@ def get_dDT_dot_q(dD, q, atmlst, gridslice): dSdxT_dot_AT_DT_invKT_V = get_dST_dot_q(dS, dSii, DA.T @ invKT_V, atmlst, gridslice) dKdxT_dot_invKT_V -= f_eps_over_2pi * dSdxT_dot_AT_DT_invKT_V + invKT_dKdxT_dot_invKT_V = einsum_ij_Adj_Adi_inverseK(K.T, dKdxT_dot_invKT_V) - dqdx_fix_Vq += -cupy.einsum('ij,Adj->Adi', R.T @ inverse_K.T, dKdxT_dot_invKT_V) + dqdx_fix_Vq += -cupy.einsum('ij,Adj->Adi', R.T, invKT_dKdxT_dot_invKT_V) dqdx_fix_Vq *= -0.5 @@ -269,17 +792,17 @@ def dK_dot_q(q): DA = D*A dKdx_dot_q = dSdx_dot_q - f_eps_over_4pi * cupy.einsum('ij,Adj->Adi', DA, dSdx_dot_q) - dAdx_dot_Sq = get_dA_dot_q(dA, S @ q, atmlst, gridslice) + dAdx_dot_Sq = get_dA_dot_q(dA, S @ q, atmlst) dKdx_dot_q -= f_eps_over_4pi * cupy.einsum('ij,Adj->Adi', D, dAdx_dot_Sq) AS = (A * S.T).T # It's just diag(A) @ S - dDdx_dot_ASq = get_dD_dot_q(dD, AS @ q, atmlst, gridslice) + dDdx_dot_ASq = get_dD_dot_q(dD, AS @ q, atmlst, gridslice, ngrids) dKdx_dot_q -= f_eps_over_4pi * dDdx_dot_ASq - dDdxT_dot_q = get_dDT_dot_q(dD, q, atmlst, gridslice) + dDdxT_dot_q = get_dDT_dot_q(dD, q, atmlst, gridslice, ngrids) dKdx_dot_q -= f_eps_over_4pi * cupy.einsum('ij,Adj->Adi', AS.T, dDdxT_dot_q) - dAdxT_dot_DT_q = get_dA_dot_q(dA, D.T @ q, atmlst, gridslice) + dAdxT_dot_DT_q = get_dA_dot_q(dA, D.T @ q, atmlst) dKdx_dot_q -= f_eps_over_4pi * cupy.einsum('ij,Adj->Adi', S.T, dAdxT_dot_DT_q) dSdxT_dot_AT_DT_q = get_dST_dot_q(dS, dSii, DA.T @ q, atmlst, gridslice) @@ -289,26 +812,27 @@ def dK_dot_q(q): f_eps_over_2pi = f_epsilon/(2.0*PI) - q = inverse_K @ R @ v_grids dKdx_dot_q = dK_dot_q(q) - dqdx_fix_Vq = -cupy.einsum('ij,Adj->Adi', inverse_K, dKdx_dot_q) + dqdx_fix_Vq = -einsum_ij_Adj_Adi_inverseK(K, dKdx_dot_q) - dAdx_dot_V = get_dA_dot_q(dA, v_grids, atmlst, gridslice) + dAdx_dot_V = get_dA_dot_q(dA, v_grids, atmlst) - dDdx_dot_AV = get_dD_dot_q(dD, A * v_grids, atmlst, gridslice) + dDdx_dot_AV = get_dD_dot_q(dD, A * v_grids, atmlst, gridslice, ngrids) dRdx_dot_V = f_eps_over_2pi * (dDdx_dot_AV + cupy.einsum('ij,Adj->Adi', D, dAdx_dot_V)) - dqdx_fix_Vq += cupy.einsum('ij,Adj->Adi', inverse_K, dRdx_dot_V) + dqdx_fix_Vq += einsum_ij_Adj_Adi_inverseK(K, dRdx_dot_V) - invKT_V = inverse_K.T @ v_grids - dDdxT_dot_invKT_V = get_dDT_dot_q(dD, invKT_V, atmlst, gridslice) + invKT_V = cupy.linalg.solve(K.T, v_grids) + dDdxT_dot_invKT_V = get_dDT_dot_q(dD, invKT_V, atmlst, gridslice, ngrids) DT_invKT_V = D.T @ invKT_V - dAdxT_dot_DT_invKT_V = get_dA_dot_q(dA, DT_invKT_V, atmlst, gridslice) + dAdxT_dot_DT_invKT_V = get_dA_dot_q(dA, DT_invKT_V, atmlst) dqdx_fix_Vq += f_eps_over_2pi * (cupy.einsum('i,Adi->Adi', A, dDdxT_dot_invKT_V) + dAdxT_dot_DT_invKT_V) dKdx_dot_invKT_V = dK_dot_q(invKT_V) - dqdx_fix_Vq += -cupy.einsum('ij,Adj->Adi', R.T @ inverse_K.T, dKdx_dot_invKT_V) + invKT_dKdx_dot_invKT_V = einsum_ij_Adj_Adi_inverseK(K.T, dKdx_dot_invKT_V) + + dqdx_fix_Vq += -cupy.einsum('ij,Adj->Adi', R.T, invKT_dKdx_dot_invKT_V) dqdx_fix_Vq *= -0.5 @@ -317,14 +841,13 @@ def dK_dot_q(q): return dqdx_fix_Vq -def get_dqsym_dx_fix_K_R(pcmobj, dm, atmlst, inverse_K, intopt_derivative): +def get_dvgrids(pcmobj, dm, atmlst, intopt_derivative): assert pcmobj._intermediates is not None mol = pcmobj.mol gridslice = pcmobj.surface['gslice_by_atom'] charge_exp = pcmobj.surface['charge_exp'] grid_coords = pcmobj.surface['grid_coords'] - R = 
pcmobj._intermediates['R'] atom_coords = mol.atom_coords(unit='B') atom_charges = numpy.asarray(mol.atom_charges(), dtype=numpy.float64) @@ -351,17 +874,24 @@ def get_dqsym_dx_fix_K_R(pcmobj, dm, atmlst, inverse_K, intopt_derivative): g0,g1 = gridslice[i_atom] dV_on_charge_dx[i_atom,:,g0:g1] -= dIdC[:,g0:g1] - KR_symmetrized = 0.5 * (inverse_K @ R + R.T @ inverse_K.T) - dqdx_fix_K_R = cupy.einsum('ij,Adj->Adi', KR_symmetrized, dV_on_charge_dx) + return dV_on_charge_dx + +def get_dqsym_dx_fix_K_R(pcmobj, dm, atmlst, intopt_derivative): + dV_on_charge_dx = get_dvgrids(pcmobj, dm, atmlst, intopt_derivative) + K = pcmobj._intermediates['K'] + R = pcmobj._intermediates['R'] + R_dVdx = cupy.einsum('ij,Adj->Adi', R, dV_on_charge_dx) + K_1_R_dVdx = einsum_ij_Adj_Adi_inverseK(K, R_dVdx) + K_1T_dVdx = einsum_ij_Adj_Adi_inverseK(K.T, dV_on_charge_dx) + RT_K_1T_dVdx = cupy.einsum('ij,Adj->Adi', R.T, K_1T_dVdx) + dqdx_fix_K_R = 0.5 * (K_1_R_dVdx + RT_K_1T_dVdx) return dqdx_fix_K_R def get_dqsym_dx(pcmobj, dm, atmlst, intopt_derivative): - K = pcmobj._intermediates['K'] - inverse_K = cupy.linalg.inv(K) - return get_dqsym_dx_fix_vgrids(pcmobj, atmlst, inverse_K) + get_dqsym_dx_fix_K_R(pcmobj, dm, atmlst, inverse_K, intopt_derivative) + return get_dqsym_dx_fix_vgrids(pcmobj, atmlst) + get_dqsym_dx_fix_K_R(pcmobj, dm, atmlst, intopt_derivative) -def analytic_grad_vmat(pcmobj, dm, mo_coeff, mo_occ, atmlst=None, verbose=None): +def analytical_grad_vmat(pcmobj, dm, mo_coeff, mo_occ, atmlst=None, verbose=None): ''' dv_solv / da ''' @@ -470,8 +1000,9 @@ def kernel(self, *args, dm=None, atmlst=None, **kwargs): dm = dm[0] + dm[1] is_equilibrium = self.base.with_solvent.equilibrium_solvation self.base.with_solvent.equilibrium_solvation = True - self.de_solvent = hess_elec(self.base.with_solvent, dm, verbose=self.verbose) - #self.de_solvent+= hess_nuc(self.base.with_solvent) + self.de_solvent = analytical_hess_nuc(self.base.with_solvent, dm, verbose=self.verbose) + self.de_solvent += analytical_hess_qv(self.base.with_solvent, dm, verbose=self.verbose) + self.de_solvent += analytical_hess_solver(self.base.with_solvent, dm, verbose=self.verbose) self.de_solute = super().kernel(*args, **kwargs) self.de = self.de_solute + self.de_solvent self.base.with_solvent.equilibrium_solvation = is_equilibrium @@ -483,7 +1014,7 @@ def make_h1(self, mo_coeff, mo_occ, chkfile=None, atmlst=None, verbose=None): h1ao = super().make_h1(mo_coeff, mo_occ, atmlst=atmlst, verbose=verbose) if isinstance(self.base, scf.hf.RHF): dm = self.base.make_rdm1(ao_repr=True) - dv = analytic_grad_vmat(self.base.with_solvent, dm, mo_coeff, mo_occ, atmlst=atmlst, verbose=verbose) + dv = analytical_grad_vmat(self.base.with_solvent, dm, mo_coeff, mo_occ, atmlst=atmlst, verbose=verbose) for i0, ia in enumerate(atmlst): h1ao[i0] += dv[i0] return h1ao @@ -492,15 +1023,15 @@ def make_h1(self, mo_coeff, mo_occ, chkfile=None, atmlst=None, verbose=None): solvent = self.base.with_solvent dm = self.base.make_rdm1(ao_repr=True) dm = dm[0] + dm[1] - dva = analytic_grad_vmat(solvent, dm, mo_coeff[0], mo_occ[0], atmlst=atmlst, verbose=verbose) - dvb = analytic_grad_vmat(solvent, dm, mo_coeff[1], mo_occ[1], atmlst=atmlst, verbose=verbose) + dva = analytical_grad_vmat(solvent, dm, mo_coeff[0], mo_occ[0], atmlst=atmlst, verbose=verbose) + dvb = analytical_grad_vmat(solvent, dm, mo_coeff[1], mo_occ[1], atmlst=atmlst, verbose=verbose) for i0, ia in enumerate(atmlst): h1aoa[i0] += dva[i0] h1aob[i0] += dvb[i0] return h1aoa, h1aob else: raise NotImplementedError('Base object 
is not supported') - + def get_veff_resp_mo(self, mol, dms, mo_coeff, mo_occ, hermi=1): v1vo = super().get_veff_resp_mo(mol, dms, mo_coeff, mo_occ, hermi=hermi) if not self.base.with_solvent.equilibrium_solvation: @@ -523,7 +1054,7 @@ def get_veff_resp_mo(self, mol, dms, mo_coeff, mo_occ, hermi=1): else: raise NotImplementedError('Base object is not supported') return v1vo - + def _finalize(self): # disable _finalize. It is called in grad_method.kernel method # where self.de was not yet initialized. diff --git a/gpu4pyscf/solvent/hessian/smd.py b/gpu4pyscf/solvent/hessian/smd.py index 49897d74..dafaa573 100644 --- a/gpu4pyscf/solvent/hessian/smd.py +++ b/gpu4pyscf/solvent/hessian/smd.py @@ -22,8 +22,6 @@ from gpu4pyscf import scf from gpu4pyscf.lib import logger from gpu4pyscf.solvent import smd -from gpu4pyscf.solvent.grad import smd as smd_grad -from gpu4pyscf.solvent.grad import pcm as pcm_grad from gpu4pyscf.solvent.hessian import pcm as pcm_hess from gpu4pyscf.hessian.jk import _ao2mo @@ -60,45 +58,6 @@ def smd_grad_scanner(mol): t1 = log.timer_debug1('solvent energy', *t1) return hess_cds # hartree - -def hess_elec(smdobj, dm, verbose=None): - ''' - slow version with finite difference - TODO: use analytical hess_nuc - ''' - log = logger.new_logger(smdobj, verbose) - t1 = log.init_timer() - pmol = smdobj.mol.copy() - mol = pmol.copy() - coords = mol.atom_coords(unit='Bohr') - - def pcm_grad_scanner(mol): - # TODO: use more analytical forms - smdobj.reset(mol) - e, v = smdobj._get_vind(dm) - #return grad_elec(smdobj, dm) - grad = pcm_grad.grad_nuc(smdobj, dm) - grad+= smd_grad.grad_solver(smdobj, dm) - grad+= pcm_grad.grad_qv(smdobj, dm) - return grad - - mol.verbose = 0 - de = np.zeros([mol.natm, mol.natm, 3, 3]) - eps = 1e-3 - for ia in range(mol.natm): - for ix in range(3): - dv = np.zeros_like(coords) - dv[ia,ix] = eps - mol.set_geom_(coords + dv, unit='Bohr') - g0 = pcm_grad_scanner(mol) - - mol.set_geom_(coords - dv, unit='Bohr') - g1 = pcm_grad_scanner(mol) - de[ia,:,ix] = (g0 - g1)/2.0/eps - t1 = log.timer_debug1('solvent energy', *t1) - smdobj.reset(pmol) - return de - def make_hess_object(hess_method): '''For hess_method in vacuum, add nuclear Hessian of solvent smdobj''' if hess_method.base.with_solvent.frozen: @@ -140,8 +99,9 @@ def kernel(self, *args, dm=None, atmlst=None, **kwargs): dm = dm[0] + dm[1] is_equilibrium = self.base.with_solvent.equilibrium_solvation self.base.with_solvent.equilibrium_solvation = True - self.de_solvent = pcm_hess.hess_elec(self.base.with_solvent, dm, verbose=self.verbose) - #self.de_solvent+= hess_nuc(self.base.with_solvent) + self.de_solvent = pcm_hess.analytical_hess_nuc(self.base.with_solvent, dm, verbose=self.verbose) + self.de_solvent += pcm_hess.analytical_hess_qv(self.base.with_solvent, dm, verbose=self.verbose) + self.de_solvent += pcm_hess.analytical_hess_solver(self.base.with_solvent, dm, verbose=self.verbose) self.de_solute = super().kernel(*args, **kwargs) self.de_cds = get_cds(self.base.with_solvent) self.de = self.de_solute + self.de_solvent + self.de_cds @@ -154,7 +114,7 @@ def make_h1(self, mo_coeff, mo_occ, chkfile=None, atmlst=None, verbose=None): h1ao = super().make_h1(mo_coeff, mo_occ, atmlst=atmlst, verbose=verbose) if isinstance(self.base, scf.hf.RHF): dm = self.base.make_rdm1(ao_repr=True) - dv = pcm_hess.analytic_grad_vmat(self.base.with_solvent, dm, mo_coeff, mo_occ, atmlst=atmlst, verbose=verbose) + dv = pcm_hess.analytical_grad_vmat(self.base.with_solvent, dm, mo_coeff, mo_occ, atmlst=atmlst, verbose=verbose) for i0, 
ia in enumerate(atmlst):
             h1ao[i0] += dv[i0]
         return h1ao
@@ -163,8 +123,8 @@ def make_h1(self, mo_coeff, mo_occ, chkfile=None, atmlst=None, verbose=None):
         solvent = self.base.with_solvent
         dm = self.base.make_rdm1(ao_repr=True)
         dm = dm[0] + dm[1]
-        dva = pcm_hess.analytic_grad_vmat(solvent, dm, mo_coeff[0], mo_occ[0], atmlst=atmlst, verbose=verbose)
-        dvb = pcm_hess.analytic_grad_vmat(solvent, dm, mo_coeff[1], mo_occ[1], atmlst=atmlst, verbose=verbose)
+        dva = pcm_hess.analytical_grad_vmat(solvent, dm, mo_coeff[0], mo_occ[0], atmlst=atmlst, verbose=verbose)
+        dvb = pcm_hess.analytical_grad_vmat(solvent, dm, mo_coeff[1], mo_occ[1], atmlst=atmlst, verbose=verbose)
         for i0, ia in enumerate(atmlst):
             h1aoa[i0] += dva[i0]
             h1aob[i0] += dvb[i0]
diff --git a/gpu4pyscf/solvent/tests/test_pcm_grad.py b/gpu4pyscf/solvent/tests/test_pcm_grad.py
index f141ae56..c17e05f3 100644
--- a/gpu4pyscf/solvent/tests/test_pcm_grad.py
+++ b/gpu4pyscf/solvent/tests/test_pcm_grad.py
@@ -36,6 +36,7 @@ def setUpModule():
     mol.basis = 'sto3g'
     mol.output = '/dev/null'
     mol.build(verbose=0)
+    # Warning: This system has all orbitals filled, which is FAR from physical
     mol.nelectron = mol.nao * 2
     epsilon = 35.9
     lebedev_order = 3
@@ -169,11 +170,14 @@ def test_grad_IEFPCM(self):

     def test_grad_SSVPE(self):
         grad = _grad_with_solvent('SS(V)PE')
-        g0 = numpy.asarray(
-            [[ 3.42479745e-15, -1.00280742e-16, -1.61117735e+00],
-            [ 1.07135985e+00, -6.97375148e-16,  8.05588676e-01],
-            [-1.07135985e+00,  7.91425487e-16,  8.05588676e-01]]
-        )
+        # Note: This reference value was obtained via finite difference with dx = 1e-5.
+        # QChem 6.1 has a bug in its SS(V)PE gradient: it uses the IEF-PCM gradient
+        # algorithm to compute the SS(V)PE gradient, which is wrong.
+        g0 = numpy.asarray([
+            [ 0.00000000e+00, -7.10542736e-10, -1.63195623e+00],
+            [ 1.07705138e+00,  2.13162821e-09,  8.15978117e-01],
+            [-1.07705138e+00, -2.13162821e-09,  8.15978116e-01],
+        ])
         print(f"Gradient error in RHF with SS(V)PE: {numpy.linalg.norm(g0 - grad)}")
         assert numpy.linalg.norm(g0 - grad) < 1e-6
diff --git a/gpu4pyscf/solvent/tests/test_pcm_hessian.py b/gpu4pyscf/solvent/tests/test_pcm_hessian.py
index c7076f29..6e19ec96 100644
--- a/gpu4pyscf/solvent/tests/test_pcm_hessian.py
+++ b/gpu4pyscf/solvent/tests/test_pcm_hessian.py
@@ -21,7 +21,7 @@
 from gpu4pyscf.solvent import pcm
 from gpu4pyscf import scf, dft
 from packaging import version
-from gpu4pyscf.solvent.hessian.pcm import analytic_grad_vmat
+from gpu4pyscf.solvent.hessian.pcm import analytical_grad_vmat, analytical_hess_nuc, analytical_hess_solver, analytical_hess_qv
 from gpu4pyscf.lib.cupy_helper import contract

 pyscf_25 = version.parse(pyscf.__version__) <= version.parse('2.5.0')
@@ -130,6 +130,37 @@ def pcm_vmat_scanner(mol):
     pcmobj.reset(pmol)
     return vmat

+def _fd_hess_contribution(pcmobj, dm, gradient_function):
+    pmol = pcmobj.mol.copy()
+    mol = pmol.copy()
+    coords = mol.atom_coords(unit='Bohr')
+
+    def pcm_grad_scanner(mol):
+        pcmobj.reset(mol)
+        e, v = pcmobj._get_vind(dm)
+        pcm_grad = gradient_function(pcmobj, dm)
+        # pcm_grad = grad_nuc(pcmobj, dm)
+        # pcm_grad+= grad_solver(pcmobj, dm)
+        # pcm_grad+= grad_qv(pcmobj, dm)
+        return pcm_grad
+
+    mol.verbose = 0
+    de = np.zeros([mol.natm, mol.natm, 3, 3])
+    eps = 1e-5
+    for ia in range(mol.natm):
+        for ix in range(3):
+            dv = np.zeros_like(coords)
+            dv[ia,ix] = eps
+            mol.set_geom_(coords + dv, unit='Bohr')
+            g0 = pcm_grad_scanner(mol)
+
+            mol.set_geom_(coords - dv, unit='Bohr')
+            g1 = pcm_grad_scanner(mol)
+
+            de[ia,:,ix,:] = (g0 - g1)/2.0/eps
+    pcmobj.reset(pmol)
+    return de
+
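+# _fd_hess_contribution above builds a reference Hessian block from an analytical
+# gradient by central differences: d2E/(dX_B dX_A) ~= [g_B(X_A+eps) - g_B(X_A-eps)] / (2*eps),
+# with eps = 1e-5 Bohr. The same stencil on a toy 1D function (a hypothetical
+# illustration, not used by the tests):
+#     g = lambda x: 3.0*x**2      # analytical gradient of f(x) = x**3
+#     assert abs((g(1.0 + 1e-5) - g(1.0 - 1e-5))/(2*1e-5) - 6.0) < 1e-6   # f''(1) = 6
+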
@unittest.skipIf(pcm.libsolvent is None, "solvent extension not compiled") class KnownValues(unittest.TestCase): def test_df_hess_cpcm(self): @@ -192,7 +223,7 @@ def test_grad_vmat_cpcm(self): mo_coeff = mf.mo_coeff mo_occ = mf.mo_occ - test_grad_vmat = analytic_grad_vmat(hobj.base.with_solvent, dm, mo_coeff, mo_occ) + test_grad_vmat = analytical_grad_vmat(hobj.base.with_solvent, dm, mo_coeff, mo_occ) ref_grad_vmat = _fd_grad_vmat(hobj.base.with_solvent, dm, mo_coeff, mo_occ) cp.testing.assert_allclose(ref_grad_vmat, test_grad_vmat, atol = 1e-10) @@ -206,7 +237,7 @@ def test_grad_vmat_iefpcm(self): mo_coeff = mf.mo_coeff mo_occ = mf.mo_occ - test_grad_vmat = analytic_grad_vmat(hobj.base.with_solvent, dm, mo_coeff, mo_occ) + test_grad_vmat = analytical_grad_vmat(hobj.base.with_solvent, dm, mo_coeff, mo_occ) ref_grad_vmat = _fd_grad_vmat(hobj.base.with_solvent, dm, mo_coeff, mo_occ) cp.testing.assert_allclose(ref_grad_vmat, test_grad_vmat, atol = 1e-10) @@ -220,11 +251,71 @@ def test_grad_vmat_ssvpe(self): mo_coeff = mf.mo_coeff mo_occ = mf.mo_occ - test_grad_vmat = analytic_grad_vmat(hobj.base.with_solvent, dm, mo_coeff, mo_occ) + test_grad_vmat = analytical_grad_vmat(hobj.base.with_solvent, dm, mo_coeff, mo_occ) ref_grad_vmat = _fd_grad_vmat(hobj.base.with_solvent, dm, mo_coeff, mo_occ) cp.testing.assert_allclose(ref_grad_vmat, test_grad_vmat, atol = 1e-10) + def test_hess_nuc_iefpcm(self): + print("testing IEF-PCM d2E_nuc/dx2") + mf = _make_mf(method='IEF-PCM') + hobj = mf.Hessian() + dm = mf.make_rdm1() + + test_grad_vmat = analytical_hess_nuc(hobj.base.with_solvent, dm) + from gpu4pyscf.solvent.grad.pcm import grad_nuc + ref_grad_vmat = _fd_hess_contribution(hobj.base.with_solvent, dm, grad_nuc) + + cp.testing.assert_allclose(ref_grad_vmat, test_grad_vmat, atol = 1e-10) + + def test_hess_qv_iefpcm(self): + print("testing IEF-PCM d2E_elec/dx2") + mf = _make_mf(method='IEF-PCM') + hobj = mf.Hessian() + dm = mf.make_rdm1() + + test_grad_vmat = analytical_hess_qv(hobj.base.with_solvent, dm) + from gpu4pyscf.solvent.grad.pcm import grad_qv + ref_grad_vmat = _fd_hess_contribution(hobj.base.with_solvent, dm, grad_qv) + + cp.testing.assert_allclose(ref_grad_vmat, test_grad_vmat, atol = 1e-10) + + def test_hess_solver_cpcm(self): + print("testing C-PCM d2E_KR/dx2") + mf = _make_mf(method='C-PCM') + hobj = mf.Hessian() + dm = mf.make_rdm1() + + test_grad_vmat = analytical_hess_solver(hobj.base.with_solvent, dm) + from gpu4pyscf.solvent.grad.pcm import grad_solver + ref_grad_vmat = _fd_hess_contribution(hobj.base.with_solvent, dm, grad_solver) + + cp.testing.assert_allclose(ref_grad_vmat, test_grad_vmat, atol = 1e-10) + + def test_hess_solver_iefpcm(self): + print("testing IEF-PCM d2E_KR/dx2") + mf = _make_mf(method='IEF-PCM') + hobj = mf.Hessian() + dm = mf.make_rdm1() + + test_grad_vmat = analytical_hess_solver(hobj.base.with_solvent, dm) + from gpu4pyscf.solvent.grad.pcm import grad_solver + ref_grad_vmat = _fd_hess_contribution(hobj.base.with_solvent, dm, grad_solver) + + cp.testing.assert_allclose(ref_grad_vmat, test_grad_vmat, atol = 1e-10) + + def test_hess_solver_ssvpe(self): + print("testing SS(V)PE d2E_KR/dx2") + mf = _make_mf(method='SS(V)PE') + hobj = mf.Hessian() + dm = mf.make_rdm1() + + test_grad_vmat = analytical_hess_solver(hobj.base.with_solvent, dm) + from gpu4pyscf.solvent.grad.pcm import grad_solver + ref_grad_vmat = _fd_hess_contribution(hobj.base.with_solvent, dm, grad_solver) + + cp.testing.assert_allclose(ref_grad_vmat, test_grad_vmat, atol = 1e-10) + 
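+    # The three contributions verified above (nuc, qv, solver) are exactly what
+    # kernel() sums into de_solvent when a full PCM Hessian is requested; a rough
+    # usage sketch (reusing this file's _make_mf helper):
+    #     mf = _make_mf(method='IEF-PCM')
+    #     hess = mf.Hessian().kernel()
+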
@pytest.mark.skipif(pyscf_25, reason='requires pyscf 2.6 or higher') def test_to_gpu(self): import pyscf