From b5050f9fff77b1a32f347186ca6962e5734e475c Mon Sep 17 00:00:00 2001
From: Xiaojie Wu <wxj6000@gmail.com>
Date: Tue, 25 Feb 2025 20:23:58 -0800
Subject: [PATCH 1/6] Enable building KXC

---
 gpu4pyscf/lib/CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gpu4pyscf/lib/CMakeLists.txt b/gpu4pyscf/lib/CMakeLists.txt
index 91e1d87a..521c2361 100644
--- a/gpu4pyscf/lib/CMakeLists.txt
+++ b/gpu4pyscf/lib/CMakeLists.txt
@@ -157,7 +157,7 @@ if(BUILD_LIBXC)
     PREFIX ${PROJECT_BINARY_DIR}/deps
     INSTALL_DIR ${PROJECT_SOURCE_DIR}/deps
     CMAKE_ARGS -DBUILD_SHARED_LIBS=ON -DENABLE_CUDA=ON
-            -DENABLE_FORTRAN=OFF -DDISABLE_KXC=ON -DDISABLE_LXC=ON -DDISABLE_FHC=ON
+            -DENABLE_FORTRAN=OFF -DDISABLE_KXC=OFF -DDISABLE_LXC=ON -DDISABLE_FHC=ON
             -DCMAKE_INSTALL_PREFIX:PATH=<INSTALL_DIR>
             -DCMAKE_INSTALL_LIBDIR:PATH=lib
             -DCMAKE_C_CREATE_SHARED_LIBRARY=${C_LINK_TEMPLATE}

From c716d7ab30ea52c392549f31936f6f2b1c091fa5 Mon Sep 17 00:00:00 2001
From: Xiaojie Wu <wxj6000@gmail.com>
Date: Tue, 25 Feb 2025 20:27:42 -0800
Subject: [PATCH 2/6] Bump version

---
 builder/setup_libxc.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/builder/setup_libxc.py b/builder/setup_libxc.py
index e4e94af6..e19bf94e 100644
--- a/builder/setup_libxc.py
+++ b/builder/setup_libxc.py
@@ -27,15 +27,15 @@
 from distutils.util import get_platform
 
 NAME = 'gpu4pyscf-libxc'
-AUTHOR = 'Qiming Sun'
-AUTHOR_EMAIL = 'osirpt.sun@gmail.com'
-DESCRIPTION = 'GPU extensions for PySCF'
-LICENSE = 'GPLv3'
+AUTHOR = 'PySCF developers'
+AUTHOR_EMAIL = None
+DESCRIPTION = 'Customized LibXC for GPU4PySCF'
+LICENSE = 'Apache-2.0'
 URL = None
 DOWNLOAD_URL = None
 CLASSIFIERS = None
 PLATFORMS = None
-VERSION = '0.5'
+VERSION = '0.6'
 
 def get_cuda_version():
     nvcc_out = subprocess.check_output(["nvcc", "--version"]).decode('utf-8')

From 13f7083b3fd5cb7fd5cdcb2314047fc65381aa40 Mon Sep 17 00:00:00 2001
From: puzhichen <147788878+puzhichen@users.noreply.github.com>
Date: Wed, 26 Feb 2025 23:36:03 +0800
Subject: [PATCH 3/6] add tddft in nightly build (#338)

:
---
 .github/workflows/nightly_build.yml | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/.github/workflows/nightly_build.yml b/.github/workflows/nightly_build.yml
index 29ec300f..b1e26090 100644
--- a/.github/workflows/nightly_build.yml
+++ b/.github/workflows/nightly_build.yml
@@ -46,3 +46,8 @@ jobs:
         echo $GITHUB_WORKSPACE
         export PYTHONPATH="${PYTHONPATH}:$(pwd)"
         pytest gpu4pyscf/tests/test_benchmark_uks.py -s -v -m "not slow and not high_memory" --benchmark-compare-fail=min:10% --benchmark-compare=v1.3.0_uks_1v100 --benchmark-storage=gpu4pyscf/tests/benchmark_results/
+    - name: Test TDDFT
+      run: |
+        echo $GITHUB_WORKSPACE
+        export PYTHONPATH="${PYTHONPATH}:$(pwd)"
+        pytest gpu4pyscf/tests/test_benchmark_tddft.py -s -v -m "not slow and not high_memory" --benchmark-compare-fail=min:10% --benchmark-compare=v1.3.0_tddft_1v100 --benchmark-storage=gpu4pyscf/tests/benchmark_results/

From 3f3fad7786cbc76b8bd20f5717a8373edc7ca2f2 Mon Sep 17 00:00:00 2001
From: Qiming Sun <osirpt.sun@gmail.com>
Date: Wed, 26 Feb 2025 09:28:38 -0800
Subject: [PATCH 4/6] Block divergent optimization for the molecular int3c2e
 integral tensor (#337)

* Block-divergent int3c2e

* int3c2e correct

* int3c2e block-divergent version correct

* unroll int3c2e

* add unrolled_int3c2e_bdiv

* Add sort_orbitals and unsort_orbitals functions for int3c2e_bdiv

* Compatibility between new int3c2e and existing implementations

* fixes

* Fixes for int3c2e_bdiv version

* Remove unused code

* Removing debug code

* Add missing file

* Import circular dependency

* Fix merging

---------

Co-authored-by: Qiming Sun <qiming.sun@bytedance.com>
---
 gpu4pyscf/df/df.py                            |   53 +
 gpu4pyscf/df/int3c2e_bdiv.py                  |  485 ++
 gpu4pyscf/df/tests/test_df_int3c2e.py         |  130 +
 gpu4pyscf/gto/mole.py                         |   79 +-
 gpu4pyscf/lib/CMakeLists.txt                  |    1 +
 gpu4pyscf/lib/cupy_helper.py                  |   24 +-
 gpu4pyscf/lib/gint-rys/CMakeLists.txt         |   13 +
 gpu4pyscf/lib/gint-rys/fill_int3c2e.cu        |  302 ++
 gpu4pyscf/lib/gint-rys/fill_int3c2e_bdiv.cu   |  330 ++
 gpu4pyscf/lib/gint-rys/gint_driver.cu         |  145 +
 gpu4pyscf/lib/gint-rys/int3c2e.cuh            |   75 +
 gpu4pyscf/lib/gint-rys/rys_roots_dat.cu       |    1 +
 gpu4pyscf/lib/gint-rys/unrolled_int3c2e.cu    | 3947 ++++++++++++++++
 .../lib/gint-rys/unrolled_int3c2e_bdiv.cu     | 4093 +++++++++++++++++
 gpu4pyscf/pbc/df/int3c2e.py                   |    3 +-
 gpu4pyscf/scf/int4c2e.py                      |    2 +-
 16 files changed, 9651 insertions(+), 32 deletions(-)
 create mode 100644 gpu4pyscf/df/int3c2e_bdiv.py
 create mode 100644 gpu4pyscf/df/tests/test_df_int3c2e.py
 create mode 100644 gpu4pyscf/lib/gint-rys/CMakeLists.txt
 create mode 100644 gpu4pyscf/lib/gint-rys/fill_int3c2e.cu
 create mode 100644 gpu4pyscf/lib/gint-rys/fill_int3c2e_bdiv.cu
 create mode 100644 gpu4pyscf/lib/gint-rys/gint_driver.cu
 create mode 100644 gpu4pyscf/lib/gint-rys/int3c2e.cuh
 create mode 100644 gpu4pyscf/lib/gint-rys/rys_roots_dat.cu
 create mode 100644 gpu4pyscf/lib/gint-rys/unrolled_int3c2e.cu
 create mode 100644 gpu4pyscf/lib/gint-rys/unrolled_int3c2e_bdiv.cu

diff --git a/gpu4pyscf/df/df.py b/gpu4pyscf/df/df.py
index c58c1428..9bcda36d 100644
--- a/gpu4pyscf/df/df.py
+++ b/gpu4pyscf/df/df.py
@@ -23,6 +23,7 @@
 from gpu4pyscf.lib.cupy_helper import (cholesky, tag_array, get_avail_mem, 
                                        cart2sph, p2p_transfer, copy_array)
 from gpu4pyscf.df import int3c2e, df_jk
+from gpu4pyscf.df import int3c2e_bdiv
 from gpu4pyscf.lib import logger
 from gpu4pyscf import __config__
 from gpu4pyscf.__config__ import _streams, num_devices
@@ -30,6 +31,7 @@
 MIN_BLK_SIZE = getattr(__config__, 'min_ao_blksize', 128)
 ALIGNED = getattr(__config__, 'ao_aligned', 32)
 GB = 1024*1024*1024
+INT3C2E_V2 = False
 
 LINEAR_DEP_THR = incore.LINEAR_DEP_THR
 GROUP_SIZE = 256
@@ -82,6 +84,42 @@ def build(self, direct_scf_tol=1e-14, omega=None):
         if auxmol is None:
             self.auxmol = auxmol = addons.make_auxmol(mol, self.auxbasis)
 
+        if INT3C2E_V2:
+            self.intopt = intopt = int3c2e_bdiv.Int3c2eOpt(mol, auxmol)
+            self._cderi = {}
+            self._cderi[0] = _cholesky_eri_bdiv(intopt, omega=omega)
+            ao_pair_mapping = intopt.create_ao_pair_mapping(cart=mol.cart)
+            rows, cols = divmod(cupy.asarray(ao_pair_mapping), mol.nao)
+            intopt.cderi_row = rows
+            intopt.cderi_col = cols
+
+            # intopt.cderi_diag stores the indices into cderi_row that
+            # correspond to the diagonal shell blocks (ish == jsh). Note that
+            # every element of such a block is included, so this index array
+            # also contains the off-diagonal AO pairs inside those blocks.
+            uniq_l = intopt.uniq_l_ctr[:,0]
+            if mol.cart:
+                nf = (uniq_l + 1) * (uniq_l + 2) // 2
+            else:
+                nf = uniq_l * 2 + 1
+            n_groups = len(uniq_l)
+            ij_tasks = ((i, j) for i in range(n_groups) for j in range(i+1))
+            nbas = intopt.sorted_mol.nbas
+            offset = 0
+            cderi_diag = []
+            for (i, j), bas_ij_idx in zip(ij_tasks, intopt.shl_pair_idx):
+                nfi = nf[i]
+                nfj = nf[j]
+                if i == j: # the diagonal blocks
+                    ish, jsh = divmod(bas_ij_idx, nbas)
+                    idx = np.where(ish == jsh)[0]
+                    addr = offset + idx[:,None] * (nfi*nfi) + np.arange(nfi*nfi)
+                    cderi_diag.append(addr.ravel())
+                offset += bas_ij_idx.size * nfi * nfj
+            intopt.cderi_diag = cupy.asarray(np.hstack(cderi_diag))
+            log.timer_debug1('cholesky_eri', *t0)
+            return self
+
         if omega and omega > 1e-10:
             with auxmol.with_range_coulomb(omega):
                 j2c_cpu = auxmol.intor('int2c2e', hermi=1)
@@ -364,3 +402,18 @@ def _cderi_task(intopt, cd_low, task_list, _cderi, aux_blksize,
                 _cderi[0][:,ij0:ij1] = cderi_block
             t1 = log.timer_debug1(f'transfer data for {cp_ij_id} / {nq} on Device {device_id}', *t1)    
     return
+
+# Generate CDERI (rows: auxiliary index, columns: compressed AO pairs) using the new int3c2e_bdiv algorithm
+def _cholesky_eri_bdiv(intopt, omega=None):
+    assert isinstance(intopt, int3c2e_bdiv.Int3c2eOpt)
+    assert omega is None  # range-separated integrals are not handled by this path
+    eri3c = intopt.int3c2e_bdiv_kernel()  # compressed (ao_pair, aux) tensor, Cartesian AO pairs
+    if not intopt.mol.cart:  # callers index the result with create_ao_pair_mapping(cart=mol.cart)
+        eri3c = intopt.orbital_pair_cart2sph(eri3c)
+    auxmol = intopt.auxmol
+    j2c = cupy.asarray(auxmol.intor('int2c2e', hermi=1), order='C')
+    cd_low = cholesky(j2c)
+    aux_coeff = cupy.array(intopt.aux_coeff, copy=True)  # copy: the solve below may overwrite its RHS
+    cd_low = solve_triangular(cd_low, aux_coeff.T, lower=True, overwrite_b=True)
+    cderi = cd_low.dot(eri3c.T)
+    return cderi
diff --git a/gpu4pyscf/df/int3c2e_bdiv.py b/gpu4pyscf/df/int3c2e_bdiv.py
new file mode 100644
index 00000000..d915d78f
--- /dev/null
+++ b/gpu4pyscf/df/int3c2e_bdiv.py
@@ -0,0 +1,485 @@
+# Copyright 2025 The PySCF Developers. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+'''
+3-center 2-electron Coulomb integral helper functions
+'''
+
+import ctypes
+import math
+import numpy as np
+import cupy as cp
+from pyscf import lib
+from pyscf.lib.parameters import ANGULAR
+from pyscf.gto.mole import ANG_OF, ATOM_OF, PTR_COORD, PTR_EXP, conc_env
+from gpu4pyscf.lib import logger
+from gpu4pyscf.lib.cupy_helper import load_library, contract
+from gpu4pyscf.gto.mole import group_basis, PTR_BAS_COORD
+from gpu4pyscf.scf.jk import g_pair_idx, _nearest_power2, _scale_sp_ctr_coeff, SHM_SIZE
+from gpu4pyscf.gto.mole import basis_seg_contraction, extract_pgto_params, cart2sph_by_l
+
+__all__ = [
+    'aux_e2',
+]
+libgint_rys = load_library('libgint_rys')
+libgint_rys.fill_int3c2e.restype = ctypes.c_int
+libgint_rys.fill_int3c2e_bdiv.restype = ctypes.c_int
+libgint_rys.init_constant.restype = ctypes.c_int
+
+LMAX = 4
+L_AUX_MAX = 6
+GOUT_WIDTH = 45
+THREADS = 256
+
+def aux_e2(mol, auxmol):
+    r'''
+    3-center 2-electron integrals (ij|k); no range-separation parameter is
+    applied here. The auxiliary basis functions sit on the second electron.
+    '''
+    int3c2e_opt = Int3c2eOpt(mol, auxmol).build()
+    ao_pair_mapping = cp.asarray(int3c2e_opt.create_ao_pair_mapping())
+    nao, nao_orig = int3c2e_opt.coeff.shape
+    naux = int3c2e_opt.aux_coeff.shape[0]
+    out = cp.zeros((nao*nao, naux))  # dense (i*nao+j, aux) buffer; pairs never written stay zero
+    p0 = p1 = 0
+    for ij_shls, eri3c in int3c2e_opt.int3c2e_kernel():
+        p0, p1 = p1, p1 + eri3c.shape[0]  # rows of ao_pair_mapping covered by this batch
+        addr = ao_pair_mapping[p0:p1]
+        out[addr] = eri3c
+        i, j = divmod(addr, nao)
+        out[j*nao+i] = eri3c  # mirror each pair into its transposed (j,i) address
+    log = logger.new_logger(mol)
+    t1 = log.init_timer()
+    out = out.reshape(nao, nao, naux)
+    aux_coeff = cp.asarray(int3c2e_opt.aux_coeff)
+    coeff = cp.asarray(int3c2e_opt.coeff)
+    out = contract('pqr,rk->pqk', out, aux_coeff)  # transform the auxiliary index
+    out = contract('pqk,qj->pjk', out, coeff)  # transform the second AO index
+    out = contract('pjk,pi->ijk', out, coeff)  # transform the first AO index
+    t1 = log.timer_debug1('aux_e2: transform basis ordering', *t1)
+    return out
+
+class Int3c2eOpt:
+    def __init__(self, mol, auxmol):
+        self.mol = mol
+        self.auxmol = auxmol
+        self.sorted_mol = None
+
+    def build(self, cutoff=1e-14):
+        log = logger.new_logger(self.mol)
+        t0 = log.init_timer()
+        # allow_replica=True to transform the general contracted basis sets into
+        # segment contracted sets
+        mol, c2s = basis_seg_contraction(self.mol, allow_replica=True)
+        mol, coeff, uniq_l_ctr, l_ctr_counts, bas_mapping = group_basis(
+            mol, tile=1, return_bas_mapping=True)
+        self.sorted_mol = mol
+        self.uniq_l_ctr = uniq_l_ctr
+        l_ctr_offsets = self.l_ctr_offsets = np.append(0, np.cumsum(l_ctr_counts))
+        self.coeff = coeff.dot(c2s).get()
+        # Sorted AO indices, allow using the fancyindices to transform tensors
+        # between sorted_mol and mol (see function sort_orbitals)
+        ao_loc = mol.ao_loc_nr(cart=self.mol.cart)
+        ao_idx = np.array_split(np.arange(self.mol.nao), ao_loc[1:-1])
+        self.ao_idx = np.hstack([ao_idx[i] for i in bas_mapping]).argsort()
+
+        auxmol, coeff, uniq_l_ctr_aux, l_ctr_aux_counts = group_basis(self.auxmol, tile=1)
+        self.sorted_auxmol = auxmol
+        self.uniq_l_ctr_aux = uniq_l_ctr_aux
+        l_ctr_aux_offsets = self.l_ctr_aux_offsets = np.append(0, np.cumsum(l_ctr_aux_counts))
+        self.aux_coeff = coeff.get()
+
+        _atm_cpu, _bas_cpu, _env_cpu = conc_env(
+            mol._atm, mol._bas, _scale_sp_ctr_coeff(mol),
+            auxmol._atm, auxmol._bas, _scale_sp_ctr_coeff(auxmol))
+        #NOTE: PTR_BAS_COORD is not updated in conc_env()
+        off = _bas_cpu[mol.nbas,PTR_EXP] - auxmol._bas[0,PTR_EXP]
+        _bas_cpu[mol.nbas:,PTR_BAS_COORD] += off
+        self._atm = _atm_cpu
+        self._bas = _bas_cpu
+        self._env = _env_cpu
+
+        ao_loc_cpu = mol.ao_loc
+        aux_loc = auxmol.ao_loc
+
+        _atm = cp.array(_atm_cpu, dtype=np.int32)
+        _bas = cp.array(_bas_cpu, dtype=np.int32)
+        _env = cp.array(_env_cpu, dtype=np.float64)
+        ao_loc = cp.asarray(_conc_locs(ao_loc_cpu, aux_loc), dtype=np.int32)
+        self.int3c2e_envs = Int3c2eEnvVars(
+            mol.natm, mol.nbas, _atm.data.ptr, _bas.data.ptr, _env.data.ptr,
+            ao_loc.data.ptr, math.log(cutoff),
+        )
+        # Keep a reference to these arrays, prevent releasing them upon returning the closure
+        self.int3c2e_envs._env_ref_holder = (_atm, _bas, _env, ao_loc)
+
+        nksh_per_block = 16
+        # the auxiliary function offset (address) in the output tensor for each blockIdx.y
+        ksh_offsets = []
+        for ksh0, ksh1 in zip(l_ctr_aux_offsets[:-1], l_ctr_aux_offsets[1:]):
+            ksh_offsets.append(np.arange(ksh0, ksh1, nksh_per_block, dtype=np.int32))
+        ksh_offsets.append(l_ctr_aux_offsets[-1])
+        ksh_offsets = np.hstack(ksh_offsets)
+        ksh_offsets += mol.nbas
+        self.ksh_offsets = ksh_offsets
+
+        uniq_l = uniq_l_ctr[:,0]
+        assert uniq_l.max() <= LMAX
+        n_groups = len(uniq_l)
+        ij_tasks = ((i, j) for i in range(n_groups) for j in range(i+1))
+
+        ovlp = estimate_shl_ovlp(mol)
+        mask = np.tril(ovlp > cutoff)
+        # The effective shell pair = ish*nbas+jsh
+        shl_pair_idx = []
+        # the bas_ij_idx offset for each blockIdx.x
+        shl_pair_offsets = []
+        # the AO-pair offset (address) in the output tensor for each blockIdx.x
+        ao_pair_loc = []
+        nao_pair0 = nao_pair = 0
+        sp0 = sp1 = 0
+        nbas = mol.nbas
+        for i, j in ij_tasks:
+            li = uniq_l[i]
+            lj = uniq_l[j]
+            ish0, ish1 = l_ctr_offsets[i], l_ctr_offsets[i+1]
+            jsh0, jsh1 = l_ctr_offsets[j], l_ctr_offsets[j+1]
+            ish, jsh = np.where(mask[ish0:ish1,jsh0:jsh1])
+            ish += ish0
+            jsh += jsh0
+            idx = ish * nbas + jsh
+            nshl_pair = idx.size
+            shl_pair_idx.append(idx)
+            nfi = (li + 1) * (li + 2) // 2
+            nfj = (lj + 1) * (lj + 2) // 2
+            nfij = nfi * nfj
+            nao_pair0, nao_pair = nao_pair, nao_pair + nfij * nshl_pair
+
+            sp0, sp1 = sp1, sp1 + nshl_pair
+            nsp_per_block = _estimate_shl_pairs_per_block(li, lj, nshl_pair)
+            shl_pair_offsets.append(np.arange(sp0, sp1, nsp_per_block, dtype=np.int32))
+            ao_pair_loc.append(
+                np.arange(nao_pair0, nao_pair, nsp_per_block*nfij, dtype=np.int32))
+            if log.verbose >= logger.DEBUG2:
+                log.debug2('group=(%d,%d), li,lj=(%d,%d), sp range(%d,%d,%d), '
+                           'nao_pair offset=%d',
+                           i, j, li, lj, sp0, sp1, nsp_per_block, nao_pair0)
+
+        self.shl_pair_idx = shl_pair_idx
+        shl_pair_offsets.append([sp1])
+        self.shl_pair_offsets = np.hstack(shl_pair_offsets)
+        ao_pair_loc.append(nao_pair)
+        self.ao_pair_loc = np.hstack(ao_pair_loc)
+        if log.verbose >= logger.DEBUG1:
+            log.timer_debug1('initialize int3c2e_kernel', *t0)
+        return self
+
+    def int3c2e_kernel(self, cutoff=1e-14, verbose=None):
+        if self.sorted_mol is None:
+            self.build(cutoff)
+        log = logger.new_logger(self.mol, verbose)
+        t0 = t1 = log.init_timer()
+        l_ctr_offsets = self.l_ctr_offsets
+        l_ctr_aux_offsets = self.l_ctr_aux_offsets
+        int3c2e_envs = self.int3c2e_envs
+        _atm_cpu = self._atm
+        _bas_cpu = self._bas
+        _env_cpu = self._env
+        mol = self.sorted_mol
+        aux_loc = self.sorted_auxmol.ao_loc
+        naux = aux_loc[-1]
+
+        uniq_l = self.uniq_l_ctr[:,0]
+        nfcart = (uniq_l + 1) * (uniq_l + 2) // 2
+        n_groups = len(uniq_l)
+        ij_tasks = [(i, j) for i in range(n_groups) for j in range(i+1)]
+        npair_ij = 0
+        for (i, j), bas_ij_idx in zip(ij_tasks, self.shl_pair_idx):
+            nfij = nfcart[i] * nfcart[j]
+            npair_ij = max(npair_ij, len(bas_ij_idx) * nfij)
+        buf = cp.empty((npair_ij, naux))
+
+        init_constant(mol)
+        kern = libgint_rys.fill_int3c2e
+        timing_collection = {}
+        kern_counts = 0
+
+        for (i, j), bas_ij_idx in zip(ij_tasks, self.shl_pair_idx):
+            ish0, ish1 = l_ctr_offsets[i], l_ctr_offsets[i+1]
+            jsh0, jsh1 = l_ctr_offsets[j], l_ctr_offsets[j+1]
+            npair_ij = len(bas_ij_idx)
+            bas_ij_idx = cp.asarray(bas_ij_idx, dtype=np.int32)
+            li = uniq_l[i]
+            lj = uniq_l[j]
+            nfij = nfcart[i] * nfcart[j]
+            eri3c = cp.ndarray((npair_ij*nfij, naux), dtype=np.float64, memptr=buf.data)
+
+            for k, lk in enumerate(self.uniq_l_ctr_aux[:,0]):
+                ksh0, ksh1 = l_ctr_aux_offsets[k:k+2]
+                shls_slice = ish0, ish1, jsh0, jsh1, ksh0, ksh1
+                lll = f'({ANGULAR[li]}{ANGULAR[lj]}|{ANGULAR[lk]})'
+                scheme = int3c2e_scheme(li, lj, lk)
+                log.debug2('int3c2e_scheme for %s: %s', lll, scheme)
+                err = kern(
+                    ctypes.cast(eri3c.data.ptr, ctypes.c_void_p),
+                    ctypes.byref(int3c2e_envs), (ctypes.c_int*3)(*scheme),
+                    (ctypes.c_int*6)(*shls_slice), aux_loc.ctypes,
+                    ctypes.c_int(naux), ctypes.c_int(npair_ij),
+                    ctypes.cast(bas_ij_idx.data.ptr, ctypes.c_void_p),
+                    _atm_cpu.ctypes, ctypes.c_int(mol.natm),
+                    _bas_cpu.ctypes, ctypes.c_int(mol.nbas), _env_cpu.ctypes)
+                if err != 0:
+                    raise RuntimeError(f'fill_int3c2e kernel for {lll} failed')
+                if log.verbose >= logger.DEBUG1:
+                    t1, t1p = log.timer_debug1(f'processing {lll}', *t1), t1
+                    if lll not in timing_collection:
+                        timing_collection[lll] = 0
+                    timing_collection[lll] += t1[1] - t1p[1]
+                    kern_counts += 1
+
+            ij_shls = ish0, ish1, jsh0, jsh1
+            yield ij_shls, eri3c
+
+        if log.verbose >= logger.DEBUG1:
+            cp.cuda.Stream.null.synchronize()
+            log.timer('int3c2e', *t0)
+            log.debug1('kernel launches %d', kern_counts)
+            for lll, t in timing_collection.items():
+                log.debug1('%s wall time %.2f', lll, t)
+
+    def int3c2e_bdiv_kernel(self, cutoff=1e-14, verbose=None):
+        '''Construct the entire block using the block-divergent parallelism'''
+        if self.sorted_mol is None:
+            self.build(cutoff)
+        log = logger.new_logger(self.mol, verbose)
+        t0 = log.init_timer()
+        int3c2e_envs = self.int3c2e_envs
+        _atm_cpu = self._atm
+        _bas_cpu = self._bas
+        _env_cpu = self._env
+        mol = self.sorted_mol
+        aux_loc = self.sorted_auxmol.ao_loc
+        naux = aux_loc[-1]
+        nao_pair = self.ao_pair_loc[-1]
+
+        # nst_lookup stores the nst_per_block for each (li,lj,lk) pattern
+        nst_lookup = cp.asarray(create_nst_lookup_table(), dtype=np.int32)
+
+        shl_pair_idx = cp.asarray(np.hstack(self.shl_pair_idx), dtype=np.int32)
+        shl_pair_offsets = cp.asarray(self.shl_pair_offsets, dtype=np.int32)
+        ksh_offsets = cp.asarray(self.ksh_offsets, dtype=np.int32)
+        nbatches_shl_pair = len(shl_pair_offsets) - 1
+        nbatches_ksh = len(ksh_offsets) - 1
+        ao_pair_loc = cp.asarray(self.ao_pair_loc, dtype=np.int32)
+        log.debug1('sp_blocks = %d, ksh_blocks = %d', nbatches_shl_pair, nbatches_ksh)
+
+        init_constant(mol)
+        kern = libgint_rys.fill_int3c2e_bdiv
+        eri3c = cp.empty((nao_pair, naux))
+        err = kern(
+            ctypes.cast(eri3c.data.ptr, ctypes.c_void_p),
+            ctypes.byref(int3c2e_envs),
+            ctypes.c_int(SHM_SIZE), ctypes.c_int(naux),
+            ctypes.c_int(nbatches_shl_pair), ctypes.c_int(nbatches_ksh),
+            ctypes.cast(shl_pair_offsets.data.ptr, ctypes.c_void_p),
+            ctypes.cast(ao_pair_loc.data.ptr, ctypes.c_void_p),
+            ctypes.cast(ksh_offsets.data.ptr, ctypes.c_void_p),
+            ctypes.cast(shl_pair_idx.data.ptr, ctypes.c_void_p),
+            ctypes.cast(nst_lookup.data.ptr, ctypes.c_void_p),
+            _atm_cpu.ctypes, ctypes.c_int(mol.natm),
+            _bas_cpu.ctypes, ctypes.c_int(mol.nbas), _env_cpu.ctypes)
+        if err != 0:
+            raise RuntimeError('fill_int3c2e_bdiv kernel failed')
+        if log.verbose >= logger.DEBUG1:
+            cp.cuda.Stream.null.synchronize()
+            log.timer_debug1('processing int3c2e_bdiv_kernel', *t0)
+        return eri3c
+
+    def create_ao_pair_mapping(self, cart=True):
+        '''ao_pair_mapping stores AO-pair addresses in the nao x nao matrix,
+        which allows the decompression for the CUDA kernel generated compressed_eri3c:
+        sparse_eri3c[ao_pair_mapping] = compressed_eri3c
+
+        int3c2e CUDA kernel stores integrals as [ij_shl,j,i,k,ksh].
+        ao_pair_mapping indicates the ij addresses in eri3c[k,i,j];
+        '''
+        mol = self.sorted_mol
+        ao_loc = mol.ao_loc_nr(cart)
+        nao = ao_loc[-1]
+        uniq_l = self.uniq_l_ctr[:,0]
+        if cart:
+            nf = (uniq_l + 1) * (uniq_l + 2) // 2  # Cartesian functions per shell
+        else:
+            nf = uniq_l * 2 + 1  # spherical functions per shell
+        n_groups = len(uniq_l)
+        ij_tasks = ((i, j) for i in range(n_groups) for j in range(i+1))  # group pairs with j <= i
+        nbas = mol.nbas
+        ao_pair_mapping = []
+        for (i, j), bas_ij_idx in zip(ij_tasks, self.shl_pair_idx):
+            ish, jsh = divmod(bas_ij_idx, nbas)  # shell pairs are encoded as ish*nbas+jsh
+            nfi = nf[i]
+            nfj = nf[j]
+            iaddr = ao_loc[ish,None] + np.arange(nfi)
+            jaddr = ao_loc[jsh,None] + np.arange(nfj)
+            ao_pair_mapping.append((iaddr[:,None,:] * nao + jaddr[:,:,None]).ravel())  # [pair,j,i] -> i*nao+j
+        return np.hstack(ao_pair_mapping)
+
+    def orbital_pair_cart2sph(self, compressed_eri3c, inplace=True):
+        '''Transforms the AO of the compressed eri3c from Cartesian to spherical basis'''
+        if inplace:
+            out = compressed_eri3c  # results overwrite the leading rows of the input
+        else:
+            out = compressed_eri3c.copy()
+        uniq_l = self.uniq_l_ctr[:,0]
+        n_groups = len(uniq_l)
+        ij_tasks = ((i, j) for i in range(n_groups) for j in range(i+1))
+        c2s = [cart2sph_by_l(l) for l in uniq_l]
+        naux = compressed_eri3c.shape[1]
+        npair0 = npair = 0  # Cartesian row range read from the input
+        p0 = p1 = 0  # spherical row range written to the output
+        for (i, j), bas_ij_idx in zip(ij_tasks, self.shl_pair_idx):
+            nshl_pair = bas_ij_idx.size
+            ci = c2s[i]
+            cj = c2s[j]
+            nfi, di = ci.shape
+            nfj, dj = cj.shape
+            npair0, npair = npair, npair + nfi*nfj * nshl_pair
+            p0, p1 = p1, p1 + di*dj * nshl_pair
+            if npair > len(compressed_eri3c):  # the slice END must fit, else the reshape below fails
+                raise RuntimeError('Size mismatch. The eri3c may have been transformed')
+            t = compressed_eri3c[npair0:npair].reshape(nshl_pair,nfj,nfi,naux)
+            t = contract('mpqr,pj->mjqr', t, cj)
+            t = contract('mjqr,qi->mjir', t, ci)
+            out[p0:p1] = t.reshape(p1-p0,naux)
+        return out[:p1]  # only the first p1 rows hold transformed data
+
+    def sort_orbitals(self, mat, axis=()):
+        ''' Transform given axis of a matrix into sorted AO'''
+        ndim_to_transform = len(axis)
+        assert ndim_to_transform <= 2
+        if ndim_to_transform == 0:
+            return mat  # nothing to reorder; the input object itself is returned
+
+        idx = self.ao_idx
+        fancy_index = [slice(None)] * mat.ndim
+        if ndim_to_transform == 1:
+            fancy_index[axis[0]] = idx
+        elif ndim_to_transform == 2:
+            assert abs(axis[0] - axis[1]) == 1, 'Must be adjacent axes'
+            fancy_index[axis[0]] = idx[:,None]  # broadcast against idx to select the outer product
+            fancy_index[axis[1]] = idx
+        return mat[tuple(fancy_index)]
+
+    def unsort_orbitals(self, sorted_mat, axis=()):
+        '''sort_orbitals reversed, transform the matrix in sorted AOs back to
+        the original matrix.
+        '''
+        ndim_to_transform = len(axis)
+        assert ndim_to_transform <= 2
+        if ndim_to_transform == 0:
+            return sorted_mat  # nothing to reorder; the input object itself is returned
+
+        idx = self.ao_idx
+        fancy_index = [slice(None)] * sorted_mat.ndim
+        if ndim_to_transform == 1:
+            fancy_index[axis[0]] = idx
+        elif ndim_to_transform == 2:
+            assert abs(axis[0] - axis[1]) == 1, 'Must be adjacent axes'
+            fancy_index[axis[0]] = idx[:,None]
+            fancy_index[axis[1]] = idx
+        mat = cp.empty_like(sorted_mat)
+        mat[tuple(fancy_index)] = sorted_mat  # scatter with the same index that sort_orbitals gathers with
+        return mat
+
+def _conc_locs(ao_loc1, ao_loc2):
+    return np.append(ao_loc1[:-1], ao_loc1[-1] + ao_loc2)
+
+class Int3c2eEnvVars(ctypes.Structure):
+    _fields_ = [
+        ('natm', ctypes.c_uint16),
+        ('nbas', ctypes.c_uint16),
+        ('atm', ctypes.c_void_p),
+        ('bas', ctypes.c_void_p),
+        ('env', ctypes.c_void_p),
+        ('ao_loc', ctypes.c_void_p),
+        ('log_cutoff', ctypes.c_float),
+    ]
+
+def init_constant(mol):
+    g_idx, offsets = g_pair_idx()
+    err = libgint_rys.init_constant(
+        g_idx.ctypes, offsets.ctypes, mol._env.ctypes, ctypes.c_int(mol._env.size),
+        ctypes.c_int(SHM_SIZE))
+    if err != 0:
+        raise RuntimeError('CUDA kernel initialization')
+
+def int3c2e_scheme(li, lj, lk, shm_size=SHM_SIZE):
+    order = li + lj + lk
+    nroots = (order//2 + 1) * 2
+
+    g_size = (li+1)*(lj+1)*(lk+1)
+    unit = g_size*3 + nroots*2 + 7
+    nst_max = shm_size//(unit*8)
+    nst_max = _nearest_power2(nst_max)
+
+    nfi = (li + 1) * (li + 2) // 2
+    nfj = (lj + 1) * (lj + 2) // 2
+    nfk = (lk + 1) * (lk + 2) // 2
+    gout_size = nfi * nfj * nfk
+    gout_stride = (gout_size + GOUT_WIDTH-1) // GOUT_WIDTH
+    # Round up to the next 2^n
+    gout_stride = _nearest_power2(gout_stride, return_leq=False)
+    gout_stride = min(gout_stride, 64)
+
+    nst_per_block = min(nst_max, THREADS // gout_stride)
+    gout_stride = THREADS // nst_per_block
+    return nst_per_block, gout_stride
+
+def _estimate_shl_pairs_per_block(li, lj, nshl_pair):
+    return _nearest_power2(THREADS*2 // ((li+1)*(lj+1)), return_leq=False)
+
+def create_nst_lookup_table():
+    nst_lookup = np.empty([L_AUX_MAX+1]*3, dtype=np.int32)
+    for lk in range(L_AUX_MAX+1):
+        for li in range(lk+1):
+            for lj in range(li+1):
+                nst_lookup[lk,li,lj] = int3c2e_scheme(li, lj, lk)[0]
+    idx = np.arange(L_AUX_MAX+1)
+    z, y, x = np.sort(np.meshgrid(idx, idx, idx), axis=0)
+    nst_lookup = nst_lookup[x, y, z]
+    return nst_lookup[:,:LMAX+1,:LMAX+1]
+
+def estimate_shl_ovlp(mol):
+    # consider only the most diffused component of a basis
+    exps, cs = extract_pgto_params(mol, 'diffused')
+    ls = mol._bas[:,ANG_OF]
+    bas_coords = mol.atom_coords()[mol._bas[:,ATOM_OF]]
+
+    norm = cs * ((2*ls+1)/(4*np.pi))**.5
+    aij = exps[:,None] + exps
+    fi = exps[:,None] / aij
+    fj = exps[None,:] / aij
+    theta = exps[:,None] * fj
+
+    rirj = bas_coords[:,None,:] - bas_coords
+    dr = np.linalg.norm(rirj, axis=2)
+    dri = fj * dr
+    drj = fi * dr
+    li = ls[:,None]
+    lj = ls[None,:]
+    fac_dri = (li * .5/aij + dri**2) ** (li*.5)
+    fac_drj = (lj * .5/aij + drj**2) ** (lj*.5)
+    fac_norm = norm[:,None]*norm * (np.pi/aij)**1.5
+    ovlp = fac_norm * np.exp(-theta*dr**2) * fac_dri * fac_drj
+    return ovlp
diff --git a/gpu4pyscf/df/tests/test_df_int3c2e.py b/gpu4pyscf/df/tests/test_df_int3c2e.py
new file mode 100644
index 00000000..79fd7c91
--- /dev/null
+++ b/gpu4pyscf/df/tests/test_df_int3c2e.py
@@ -0,0 +1,130 @@
+import cupy as cp
+import pyscf
+from pyscf.df import incore
+from gpu4pyscf.df import int3c2e_bdiv
+from gpu4pyscf.lib.cupy_helper import contract
+
+def test_int3c2e():
+    mol = pyscf.M(
+        atom='''C1   1.3    .2       .3
+                C2   .19   .1      1.1
+        ''',
+        basis={'C1': [[3, [1.5, 1.], [.9, 1.]],
+                      [4, [2., 1.]]],
+               'C2': 'ccpvdz'})
+    auxmol = mol.copy()
+    auxmol.basis = {
+        'C1': '''
+C    S
+      2.9917624900           1.0000000000
+C    P
+     28.1325940100           1.0000000000
+C    P
+      9.8364318200           1.0000000000
+C    P
+      3.3490545000           1.0000000000
+C    P
+      1.4947618600           1.0000000000
+C    P
+      0.5769010900           1.0000000000
+C    D
+      0.1995412500           1.0000000000 ''',
+        'C2': [[0, [.5, 1.]], [1, [.8, 1.]], [3, [.9, 1]]],
+    }
+    auxmol.build()
+    dat = int3c2e_bdiv.aux_e2(mol, auxmol)
+    ref = incore.aux_e2(mol, auxmol)
+    assert abs(dat.get()-ref).max() < 1e-10
+
+def test_int3c2e_bdiv():
+    mol = pyscf.M(
+        atom='''C1   1.3    .2       .3
+                C2   .19   .1      1.1
+        ''',
+        basis={'C1': [[3, [1.5, 1.], [.9, 1.]],
+                      [4, [2., 1.]]],
+               'C2': 'ccpvdz'})
+
+    auxmol = mol.copy()
+    auxmol.basis = {
+        'C1':'''
+C    S
+      2.9917624900           1.0000000000
+C    P
+     28.1325940100           1.0000000000
+C    P
+      9.8364318200           1.0000000000
+C    P
+      3.3490545000           1.0000000000
+C    P
+      1.4947618600           1.0000000000
+C    P
+      0.5769010900           1.0000000000
+C    D
+      0.1995412500           1.0000000000 ''',
+        'C2':[[0, [.5, 1.]], [1, [.8, 1.]], [3, [.9, 1]]],
+    }
+    auxmol.build()
+    int3c2e_opt = int3c2e_bdiv.Int3c2eOpt(mol, auxmol).build()
+    nao, nao_orig = int3c2e_opt.coeff.shape
+    naux = int3c2e_opt.aux_coeff.shape[0]
+    out = cp.zeros((nao*nao, naux))
+    eri3c = int3c2e_opt.int3c2e_bdiv_kernel()
+    ao_pair_mapping = int3c2e_opt.create_ao_pair_mapping()
+    out[ao_pair_mapping] = eri3c
+    i, j = divmod(ao_pair_mapping, nao)
+    out[j*nao+i] = eri3c
+    out = out.reshape(nao, nao, naux)
+    aux_coeff = cp.asarray(int3c2e_opt.aux_coeff)
+    coeff = cp.asarray(int3c2e_opt.coeff)
+    out = contract('pqr,rk->pqk', out, aux_coeff)
+    out = contract('pqk,qj->pjk', out, coeff)
+    out = contract('pjk,pi->ijk', out, coeff)
+    ref = incore.aux_e2(mol, auxmol)
+    assert abs(out.get()-ref).max() < 1e-10
+
+    eri3c = int3c2e_opt.orbital_pair_cart2sph(eri3c)
+    ao_pair_mapping = int3c2e_opt.create_ao_pair_mapping(cart=mol.cart)
+    out = cp.zeros((nao_orig*nao_orig, naux))
+    out[ao_pair_mapping] = eri3c
+    i, j = divmod(ao_pair_mapping, nao_orig)
+    out[j*nao_orig+i] = eri3c
+    out = out.reshape(nao_orig, nao_orig, naux)
+    out = contract('pqr,rk->pqk', out, aux_coeff)
+    out = int3c2e_opt.unsort_orbitals(out, axis=(0,1))
+    assert abs(out.get()-ref).max() < 1e-10
+
+def test_int3c2e_sparse():
+    mol = pyscf.M(
+        atom='''
+O       0.873    5.017    1.816
+H       1.128    5.038    2.848
+H       0.173    4.317    1.960
+O       3.665    1.316    1.319
+H       3.904    2.233    1.002
+H       4.224    0.640    0.837
+''',
+        basis='def2-tzvp'
+    )
+    auxmol = mol.copy()
+    auxmol.basis = 'ccpvdz-jkfit'
+    auxmol.build()
+    int3c2e_opt = int3c2e_bdiv.Int3c2eOpt(mol, auxmol).build()
+    dat = int3c2e_bdiv.aux_e2(mol, auxmol)
+    ref = incore.aux_e2(mol, auxmol)
+    assert abs(dat.get()-ref).max() < 1e-10
+
+    eri3c = int3c2e_opt.int3c2e_bdiv_kernel()
+    eri3c = int3c2e_opt.orbital_pair_cart2sph(eri3c)
+    ao_pair_mapping = int3c2e_opt.create_ao_pair_mapping(cart=mol.cart)
+    nao, nao_orig = int3c2e_opt.coeff.shape
+    naux = int3c2e_opt.aux_coeff.shape[0]
+    out = cp.zeros((nao_orig*nao_orig, naux))
+    out[ao_pair_mapping] = eri3c
+    i, j = divmod(ao_pair_mapping, nao_orig)
+    out[j*nao_orig+i] = eri3c
+    out = out.reshape(nao_orig, nao_orig, naux)
+    aux_coeff = cp.asarray(int3c2e_opt.aux_coeff)
+    out = contract('pqr,rk->pqk', out, aux_coeff)
+    out = int3c2e_opt.unsort_orbitals(out, axis=(0,1))
+    assert abs(out.get()-ref).max() < 1e-10
diff --git a/gpu4pyscf/gto/mole.py b/gpu4pyscf/gto/mole.py
index f3237e96..2f384d93 100644
--- a/gpu4pyscf/gto/mole.py
+++ b/gpu4pyscf/gto/mole.py
@@ -15,21 +15,17 @@
 
 import functools
 import numpy as np
-import scipy.linalg
+import cupy as cp
 from pyscf import gto
 from pyscf.gto import (ANG_OF, ATOM_OF, NPRIM_OF, NCTR_OF, PTR_COORD, PTR_COEFF,
                        PTR_EXP)
-from gpu4pyscf.lib import logger
 
 PTR_BAS_COORD = 7
 
 @functools.lru_cache(20)
-def get_cart2sph(lmax=12):
-    cart2sph = []
-    for l in range(lmax):
-        c2s = gto.mole.cart2sph(l, normalized='sp')
-        cart2sph.append(np.asarray(c2s, order='C'))
-    return cart2sph
+def cart2sph_by_l(l, normalized='sp'):
+    '''Return gto.mole.cart2sph(l) as a C-ordered cupy array; `normalized` is forwarded (it was previously ignored).'''
+    return cp.asarray(gto.mole.cart2sph(l, normalized=normalized), order='C')
 
 def basis_seg_contraction(mol, allow_replica=1):
     '''transform generally contracted basis to segment contracted basis
@@ -40,6 +36,7 @@ def basis_seg_contraction(mol, allow_replica=1):
             By default, high angular momentum functions (d, f shells) are fully
             uncontracted.
     '''
+    from gpu4pyscf.lib.cupy_helper import block_diag
     # Ensure backward compatibility. When allow_replica is True, decontraction
     # to primitive functions is disabled. When allow_replica is False, all
     # general contraction are decontracted.
@@ -69,13 +66,13 @@ def basis_seg_contraction(mol, allow_replica=1):
                 nctr = shell[NCTR_OF]
                 if nctr == 1:
                     bas_of_ia.append(shell)
-                    coeff.append(np.eye(nf))
+                    coeff.append(cp.eye(nf))
                     continue
                 # Only basis with nctr > 1 needs to be decontracted
                 nprim = shell[NPRIM_OF]
                 pcoeff = shell[PTR_COEFF]
                 if l <= allow_replica:
-                    coeff.extend([np.eye(nf)] * nctr)
+                    coeff.extend([cp.eye(nf)] * nctr)
                     bs = np.repeat(shell[np.newaxis], nctr, axis=0)
                     bs[:,NCTR_OF] = 1
                     bs[:,PTR_COEFF] = np.arange(pcoeff, pcoeff+nprim*nctr, nprim)
@@ -87,7 +84,7 @@ def basis_seg_contraction(mol, allow_replica=1):
                     # remove normalization from contraction coefficients
                     c = _env[pcoeff:pcoeff+nprim*nctr].reshape(nctr,nprim)
                     c = np.einsum('ip,p,ef->iepf', c, 1/norm, np.eye(nf))
-                    coeff.append(c.reshape(nf*nctr, nf*nprim).T)
+                    coeff.append(cp.asarray(c.reshape(nf*nctr, nf*nprim).T))
 
                     _env[pcoeff:pcoeff+nprim] = norm
                     bs = np.repeat(shell[np.newaxis], nprim, axis=0)
@@ -110,10 +107,11 @@ def basis_seg_contraction(mol, allow_replica=1):
     pmol.cart = True
     pmol._bas = np.asarray(np.vstack(_bas), dtype=np.int32)
     pmol._env = _env
-    contr_coeff = scipy.linalg.block_diag(*contr_coeff)
+    contr_coeff = block_diag(contr_coeff)
 
     if not mol.cart:
-        contr_coeff = contr_coeff.dot(mol.cart2sph_coeff())
+        c2s = block_diag([cart2sph_by_l(l) for l in pmol._bas[:,ANG_OF]])
+        contr_coeff = contr_coeff.dot(c2s)
     return pmol, contr_coeff
 
 def sort_atoms(mol):
@@ -160,8 +158,13 @@ def sort_atoms(mol):
 
     return [x for heavy_list in full_path for x in heavy_list]
 
-def group_basis(mol, tile=1, group_size=None):
-    '''Group basis functions according to their [l, nprim] patterns'''
+def group_basis(mol, tile=1, group_size=None, return_bas_mapping=False):
+    '''Group basis functions according to their [l, nprim] patterns.
+
+    bas_mapping is the index that transforms _bas from sorted_mol to mol:
+    mol._bas = sorted_mol._bas[bas_mapping]
+    '''
+    from gpu4pyscf.lib import logger
     mol, coeff = basis_seg_contraction(mol)
     # Sort basis according to angular momentum and contraction patterns so
     # as to group the basis functions to blocks in GPU kernel.
@@ -175,10 +178,11 @@ def group_basis(mol, tile=1, group_size=None):
 
     nao_orig = coeff.shape[1]
     ao_loc = mol.ao_loc
-    coeff = np.split(coeff, ao_loc[1:-1], axis=0)
+    coeff = cp.split(coeff, ao_loc[1:-1], axis=0)
 
     pad_bas = []
     if tile > 1:
+        assert not return_bas_mapping, 'bas_mapping requires tile=1'
         l_ctr_counts_orig = l_ctr_counts.copy()
         pad_inv_idx = []
         env_ptr = mol._env.size
@@ -196,12 +200,12 @@ def group_basis(mol, tile=1, group_size=None):
 
             l = l_ctr[0]
             nf = (l + 1) * (l + 2) // 2
-            coeff.extend([np.zeros((nf, nao_orig))] * padding)
+            coeff.extend([cp.zeros((nf, nao_orig))] * padding)
 
         inv_idx = np.hstack([inv_idx.ravel(), pad_inv_idx])
 
     sorted_idx = np.argsort(inv_idx.ravel(), kind='stable').astype(np.int32)
-    coeff = np.vstack([coeff[i] for i in sorted_idx])
+    coeff = cp.vstack([coeff[i] for i in sorted_idx])
     assert coeff.shape[0] < 32768
 
     max_nprims = uniq_l_ctr[:,1].max()
@@ -228,7 +232,10 @@ def group_basis(mol, tile=1, group_size=None):
 
     # PTR_BAS_COORD is required by various CUDA kernels
     mol._bas[:,PTR_BAS_COORD] = mol._atm[mol._bas[:,ATOM_OF],PTR_COORD]
-    return mol, coeff, uniq_l_ctr, l_ctr_counts
+    if return_bas_mapping:
+        return mol, coeff, uniq_l_ctr, l_ctr_counts, sorted_idx.argsort()
+    else:
+        return mol, coeff, uniq_l_ctr, l_ctr_counts
 
 def _split_l_ctr_groups(uniq_l_ctr, l_ctr_counts, group_size, align=1):
     '''Splits l_ctr patterns into small groups with group_size the maximum
@@ -257,3 +264,37 @@ def _split_l_ctr_groups(uniq_l_ctr, l_ctr_counts, group_size, align=1):
     uniq_l_ctr = np.vstack(_l_ctrs)
     l_ctr_counts = np.hstack(_l_ctr_counts)
     return uniq_l_ctr, l_ctr_counts
+
+# This function is only available in pyscf-2.8 or later
+def extract_pgto_params(mol, op='diffused'):
+    '''Extract one (exponent, |contraction coefficient|) pair per shell for the
+    estimate_xxx functions. Returns (es, cs) as numpy arrays with one entry per
+    shell; raises RuntimeError when op is neither 'diffused' nor 'compact'.'''
+    es = []
+    cs = []
+    if op == 'diffused':
+        precision = 1e-8
+        for i in range(mol.nbas):
+            e = mol.bas_exp(i)
+            c = abs(mol._libcint_ctr_coeff(i)).max(axis=1)  # largest |coefficient| along axis 1, one value per exponent
+            l = mol.bas_angular(i)
+            # A quick estimation for the radius that each primitive GTO vanishes
+            r2 = np.log(c**2 / precision * 10**l) / e
+            idx = r2.argmax()  # primitive with the largest estimate
+            es.append(e[idx])
+            cs.append(c[idx].max())
+    elif op == 'compact':
+        precision = 1e-8
+        for i in range(mol.nbas):
+            e = mol.bas_exp(i)
+            c = abs(mol._libcint_ctr_coeff(i)).max(axis=1)  # largest |coefficient| along axis 1, one value per exponent
+            l = mol.bas_angular(i)
+            # A quick estimation for the resolution of planewaves that each
+            # primitive GTO requires
+            ke = np.log(c**2 / precision * 50**l) * e
+            idx = ke.argmax()  # primitive with the largest estimate
+            es.append(e[idx])
+            cs.append(c[idx].max())
+    else:
+        raise RuntimeError(f'Unsupported operation {op}')
+    return np.array(es), np.array(cs)
diff --git a/gpu4pyscf/lib/CMakeLists.txt b/gpu4pyscf/lib/CMakeLists.txt
index 91e1d87a..4dbd8f2c 100644
--- a/gpu4pyscf/lib/CMakeLists.txt
+++ b/gpu4pyscf/lib/CMakeLists.txt
@@ -144,6 +144,7 @@ if(BUILD_SOLVENT)
   add_subdirectory(solvent)
 endif()
 
+add_subdirectory(gint-rys)
 add_subdirectory(gvhf-rys)
 add_subdirectory(gvhf-md)
 add_subdirectory(pbc)
diff --git a/gpu4pyscf/lib/cupy_helper.py b/gpu4pyscf/lib/cupy_helper.py
index 20b91a8c..490455b5 100644
--- a/gpu4pyscf/lib/cupy_helper.py
+++ b/gpu4pyscf/lib/cupy_helper.py
@@ -20,7 +20,6 @@
 import cupy
 from pyscf import lib
 from gpu4pyscf.lib import logger
-from gpu4pyscf.gto import mole
 from gpu4pyscf.lib.cutensor import contract
 from gpu4pyscf.lib.cusolver import eigh, cholesky  #NOQA
 from gpu4pyscf.lib.memcpy import copy_array, p2p_transfer  #NOQA
@@ -29,10 +28,6 @@
 LMAX_ON_GPU = 7
 DSOLVE_LINDEP = 1e-13
 
-c2s_l = mole.get_cart2sph(lmax=LMAX_ON_GPU)
-c2s_offset = np.cumsum([0] + [x.shape[0]*x.shape[1] for x in c2s_l])
-_data = {'c2s': None}
-
 _kernel_registery = {}
 
 def load_library(libname):
@@ -306,6 +301,14 @@ def dist_matrix(x, y, out=None):
         raise RuntimeError('failed in calculating distance matrix')
     return out
 
+@functools.lru_cache(1)  # no arguments, so the tuple is built once and reused on later calls
+def _initialize_c2s_data():  # returns (c2s_l, c2s_data, c2s_offset) for l in range(LMAX_ON_GPU)
+    from gpu4pyscf.gto import mole  # local import; presumably avoids a circular import with gpu4pyscf.gto.mole -- confirm
+    c2s_l = [mole.cart2sph_by_l(l) for l in range(LMAX_ON_GPU)]
+    c2s_data = cupy.concatenate([x.ravel() for x in c2s_l])  # all matrices flattened into one 1-D cupy array
+    c2s_offset = np.cumsum([0] + [x.shape[0]*x.shape[1] for x in c2s_l])  # c2s_offset[l] = start of matrix l in c2s_data
+    return c2s_l, c2s_data, c2s_offset
+
 def block_c2s_diag(angular, counts):
     '''
     Diagonal blocked cartesian to spherical transformation
@@ -313,10 +316,7 @@ def block_c2s_diag(angular, counts):
         angular (list): angular momentum type, e.g. [0,1,2,3]
         counts (list): count of each angular momentum
     '''
-    if _data['c2s'] is None:
-        c2s_data = cupy.concatenate([cupy.asarray(x.ravel()) for x in c2s_l])
-        _data['c2s'] = c2s_data
-    c2s_data = _data['c2s']
+    c2s_l, c2s_data, c2s_offset = _initialize_c2s_data()
 
     nshells = np.sum(counts)
     rows = [np.array([0], dtype='int32')]
@@ -489,11 +489,12 @@ def cart2sph_cutensor(t, axis=0, ang=1, out=None):
     '''
     transform 'axis' of a tensor from cartesian basis into spherical basis with cutensor
     '''
+    from gpu4pyscf.gto import mole
     if(ang <= 1):
         if(out is not None): out[:] = t
         return t
     size = list(t.shape)
-    c2s = cupy.asarray(c2s_l[ang])
+    c2s = mole.cart2sph_by_l(ang)
     if(not t.flags['C_CONTIGUOUS']): t = cupy.asarray(t, order='C')
     li_size = c2s.shape
     nli = size[axis] // li_size[0]
@@ -511,11 +512,12 @@ def cart2sph(t, axis=0, ang=1, out=None, stream=None):
     '''
     transform 'axis' of a tensor from cartesian basis into spherical basis
     '''
+    from gpu4pyscf.gto import mole
     if(ang <= 1):
         if(out is not None): out[:] = t
         return t
     size = list(t.shape)
-    c2s = c2s_l[ang]
+    c2s = mole.cart2sph_by_l(ang)
     if(not t.flags['C_CONTIGUOUS']): t = cupy.asarray(t, order='C')
     li_size = c2s.shape
     nli = size[axis] // li_size[0]
diff --git a/gpu4pyscf/lib/gint-rys/CMakeLists.txt b/gpu4pyscf/lib/gint-rys/CMakeLists.txt
new file mode 100644
index 00000000..f0583873
--- /dev/null
+++ b/gpu4pyscf/lib/gint-rys/CMakeLists.txt
@@ -0,0 +1,13 @@
+set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --ptxas-options=-v")# -maxrregcount=128")
+
+add_library(gint_rys SHARED  # shared library built from the CUDA sources listed below
+  gint_driver.cu fill_int3c2e.cu unrolled_int3c2e.cu
+  fill_int3c2e_bdiv.cu unrolled_int3c2e_bdiv.cu
+  rys_roots_dat.cu
+)
+
+set_target_properties(gint_rys PROPERTIES
+  LIBRARY_OUTPUT_DIRECTORY ${PROJECT_SOURCE_DIR}  # the built library is written to the project source directory
+  CUDA_SEPARABLE_COMPILATION ON)
+
+#target_link_libraries(ft_ao OpenMP::OpenMP_C)  # NOTE(review): names ft_ao rather than gint_rys; looks like a leftover -- confirm
diff --git a/gpu4pyscf/lib/gint-rys/fill_int3c2e.cu b/gpu4pyscf/lib/gint-rys/fill_int3c2e.cu
new file mode 100644
index 00000000..295b8239
--- /dev/null
+++ b/gpu4pyscf/lib/gint-rys/fill_int3c2e.cu
@@ -0,0 +1,302 @@
+/*
+ * Copyright 2025 The PySCF Developers. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <stdio.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <cuda_runtime.h>
+
+#include "gvhf-rys/vhf.cuh"
+#include "gvhf-rys/rys_roots.cu"
+#include "int3c2e.cuh"
+
+// TODO: benchmark performance for 32, 38, 40, 45, 54
+#define GOUT_WIDTH      45
+
+__global__  // int3c2e kernel: threadIdx.x selects a (shell-pair, k-shell) task, threadIdx.y splits that task's output elements
+void int3c2e_kernel(double *out, Int3c2eEnvVars envs, Int3c2eBounds bounds)
+{
+    int nst_per_block = blockDim.x;  // number of tasks a block works on at the same time
+    int gout_stride = blockDim.y;  // number of threads cooperating on one task
+    int st_id = threadIdx.x;
+    int gout_id = threadIdx.y;
+    int batch_id = blockIdx.x;
+    int li = bounds.li;
+    int lj = bounds.lj;
+    int lk = bounds.lk;
+    int lij = li + lj;
+    int nroots = bounds.nroots;
+    int nfij = bounds.nfij;
+    int nfk = bounds.nfk;
+    int iprim = bounds.iprim;
+    int jprim = bounds.jprim;
+    int kprim = bounds.kprim;
+    int ijprim = iprim * jprim;
+    int ijkprim = ijprim * kprim;
+    int stride_j = bounds.stride_j;
+    int stride_k = bounds.stride_k;
+    int g_size = bounds.g_size;
+    int *idx_ij = c_g_pair_idx + c_g_pair_offsets[li*LMAX1+lj];
+    int *idy_ij = idx_ij + nfij;
+    int *idz_ij = idy_ij + nfij;
+    int lk_offset = lk * (lk + 1) * (lk + 2) / 2;
+    int *idx_k = c_g_cart_idx + lk_offset;
+    int *idy_k = idx_k + nfk;
+    int *idz_k = idy_k + nfk;
+    int *bas = envs.bas;
+    int nbas = envs.nbas;
+    double *env = envs.env;
+    double omega = env[PTR_RANGE_OMEGA];  // selects one of the three root branches below (== 0, > 0, < 0)
+
+    int gx_len = g_size * nst_per_block;
+    extern __shared__ double rw_buffer[];  // dynamic shared memory, carved below into rw | gx | gy | gz | Rpq | rjri
+    double *rw = rw_buffer + st_id;  // per-task column: consecutive entries of a task are nst_per_block apart
+    double *g = rw + nst_per_block * nroots*2;
+    double *gx = g;
+    double *gy = gx + gx_len;
+    double *gz = gy + gx_len;
+    double *Rpq = gz + gx_len;
+    double *rjri = Rpq + nst_per_block * 3;  // holds four values per task: xjxi, yjyi, zjzi, rr_ij
+    double gout[GOUT_WIDTH];  // per-thread accumulators for up to GOUT_WIDTH output elements
+    if (gout_id == 0) {
+        gx[0] = 1.;
+    }
+
+    int nksh = bounds.nksh;
+    int nshl_pair = bounds.nshl_pair;
+    int nst = nksh * nshl_pair;  // total number of tasks
+    int st0 = batch_id * nst_per_block * BATCHES_PER_BLOCK;  // first task handled by this block
+    int st1 = MIN(nst, st0 + nst_per_block * BATCHES_PER_BLOCK);
+    for (int ijk_idx = st0+st_id; ijk_idx < st1+st_id; ijk_idx += nst_per_block) {  // bound shifted by st_id: same trip count for every thread
+        int ksh_in_auxmol = ijk_idx % nksh;
+        int ksh = ksh_in_auxmol + bounds.ksh0;
+        int shl_pair_idx = ijk_idx / nksh;
+        if (ijk_idx >= nst) {  // padding task: use a valid shell pair but zero gx[0]; its store is skipped below
+            shl_pair_idx = st0 / nksh;
+            if (gout_id == 0) {
+                gx[0] = 0.;
+            }
+        }
+        int bas_ij = bounds.bas_ij_idx[shl_pair_idx];  // encodes the pair as ish*nbas + jsh
+        int ish = bas_ij / nbas;
+        int jsh = bas_ij % nbas;
+        double *expi = env + bas[ish*BAS_SLOTS+PTR_EXP];
+        double *expj = env + bas[jsh*BAS_SLOTS+PTR_EXP];
+        double *expk = env + bas[ksh*BAS_SLOTS+PTR_EXP];
+        double *ci = env + bas[ish*BAS_SLOTS+PTR_COEFF];
+        double *cj = env + bas[jsh*BAS_SLOTS+PTR_COEFF];
+        double *ck = env + bas[ksh*BAS_SLOTS+PTR_COEFF];
+        double *ri = env + bas[ish*BAS_SLOTS+PTR_BAS_COORD];
+        double *rj = env + bas[jsh*BAS_SLOTS+PTR_BAS_COORD];
+        double *rk = env + bas[ksh*BAS_SLOTS+PTR_BAS_COORD];
+        if (gout_id == 0) {  // one thread per task stores rj - ri and its squared length
+            double xjxi = rj[0] - ri[0];
+            double yjyi = rj[1] - ri[1];
+            double zjzi = rj[2] - ri[2];
+            double rr_ij = xjxi*xjxi + yjyi*yjyi + zjzi*zjzi;
+            rjri[0*nst_per_block] = xjxi;
+            rjri[1*nst_per_block] = yjyi;
+            rjri[2*nst_per_block] = zjzi;
+            rjri[3*nst_per_block] = rr_ij;
+        }
+
+        for (int gout_start = 0; gout_start < nfij*nfk;
+             gout_start+=gout_stride*GOUT_WIDTH) {  // each pass covers gout_stride*GOUT_WIDTH output elements
+#pragma unroll
+            for (int n = 0; n < GOUT_WIDTH; ++n) { gout[n] = 0; }
+
+            for (int ijkp = 0; ijkp < ijkprim; ++ijkp) {  // loop over all (ip, jp, kp) primitive combinations
+                int ijp = ijkp / kprim;
+                int kp = ijkp % kprim;
+                int ip = ijp / jprim;
+                int jp = ijp % jprim;
+                double ai = expi[ip];
+                double aj = expj[jp];
+                double ak = expk[kp];
+                double aij = ai + aj;
+                double aj_aij = aj / aij;
+                __syncthreads();
+                double xij = rjri[0*nst_per_block] * aj_aij + ri[0];
+                double yij = rjri[1*nst_per_block] * aj_aij + ri[1];
+                double zij = rjri[2*nst_per_block] * aj_aij + ri[2];
+                double xpq = xij - rk[0];
+                double ypq = yij - rk[1];
+                double zpq = zij - rk[2];
+                if (gout_id == 0) {  // one thread per task stores the prefactor in gy[0] and the vector in Rpq
+                    double cijk = ci[ip] * cj[jp] * ck[kp];
+                    double fac = PI_FAC * cijk / (aij*ak*sqrt(aij+ak));
+                    double theta_ij = ai * aj_aij;
+                    double Kab = theta_ij * rjri[3*nst_per_block];
+                    gy[0] = fac * exp(-Kab);
+                    Rpq[0*nst_per_block] = xpq;
+                    Rpq[1*nst_per_block] = ypq;
+                    Rpq[2*nst_per_block] = zpq;
+                }
+                double rr = xpq*xpq + ypq*ypq + zpq*zpq;
+                double theta = aij * ak / (aij + ak);
+                double theta_rr = theta * rr;
+                if (omega == 0) {
+                    rys_roots(nroots, theta_rr, rw, nst_per_block, gout_id, gout_stride);
+                } else if (omega > 0) {  // roots evaluated at theta_fac*theta_rr, then rescaled in place
+                    double omega2 = omega * omega;
+                    double theta_fac = omega2 / (omega2 + theta);
+                    rys_roots(nroots, theta_fac*theta_rr, rw, nst_per_block, gout_id, gout_stride);
+                    __syncthreads();
+                    double sqrt_theta_fac = sqrt(theta_fac);
+                    for (int irys = gout_id; irys < nroots; irys+=gout_stride) {
+                        rw[ irys*2   *nst_per_block] *= theta_fac;
+                        rw[(irys*2+1)*nst_per_block] *= sqrt_theta_fac;
+                    }
+                } else {  // omega < 0: two sets of nroots/2 roots; the rescaled set gets a negated odd-entry factor
+                    double omega2 = omega * omega;
+                    double theta_fac = omega2 / (omega2 + theta);
+                    int _nroots = nroots/2;
+                    rys_roots(_nroots, theta_rr, rw+nroots*nst_per_block,
+                              nst_per_block, gout_id, gout_stride);
+                    rys_roots(_nroots, theta_fac*theta_rr, rw,
+                              nst_per_block, gout_id, gout_stride);
+                    __syncthreads();
+                    double sqrt_theta_fac = -sqrt(theta_fac);
+                    for (int irys = gout_id; irys < _nroots; irys+=gout_stride) {
+                        rw[ irys*2   *nst_per_block] *= theta_fac;
+                        rw[(irys*2+1)*nst_per_block] *= sqrt_theta_fac;
+                    }
+                }
+                double s0x, s1x, s2x;
+                for (int irys = 0; irys < nroots; ++irys) {  // rebuild g for each root and accumulate into gout
+                    __syncthreads();
+                    if (gout_id == 0) {
+                        gz[0] = rw[(irys*2+1)*nst_per_block];  // odd rw entry seeds gz[0]
+                    }
+                    double rt = rw[ irys*2   *nst_per_block];  // even rw entry is used as rt
+                    double rt_aa = rt / (aij + ak);
+
+                    if (lij > 0) {
+                        __syncthreads();
+                        double rt_aij = rt_aa * ak;
+                        double b10 = .5/aij * (1 - rt_aij);
+                        // gx(0,n+1) = c0*gx(0,n) + n*b10*gx(0,n-1)
+                        for (int n = gout_id; n < 3; n += gout_stride) {  // n = 0, 1, 2 addresses gx, gy, gz
+                            double *_gx = gx + n * gx_len;
+                            double xjxi = rjri[n*nst_per_block];
+                            double xpa = xjxi * aj_aij;
+                            //double c0x = Rpa[ir] - rt_aij * Rpq[n*nst_per_block];
+                            double c0x = xpa - rt_aij * Rpq[n*nst_per_block];
+                            s0x = _gx[0];
+                            s1x = c0x * s0x;
+                            _gx[nst_per_block] = s1x;
+                            for (int i = 1; i < lij; ++i) {
+                                s2x = c0x * s1x + i * b10 * s0x;
+                                _gx[(i+1)*nst_per_block] = s2x;
+                                s0x = s1x;
+                                s1x = s2x;
+                            }
+                        }
+                    }
+
+                    if (lk > 0) {
+                        int lij3 = (lij+1)*3;
+                        double rt_ak  = rt_aa * aij;
+                        double b00 = .5 * rt_aa;
+                        double b01 = .5/ak  * (1 - rt_ak );
+                        for (int n = gout_id; n < lij3+gout_id; n += gout_stride) {  // bound shifted by gout_id so all threads reach the barriers
+                            __syncthreads();
+                            int i = n / 3; //for i in range(lij+1):
+                            int _ix = n % 3; // TODO: remove _ix for nroots > 2
+                            double *_gx = gx + (i + _ix * g_size) * nst_per_block;
+                            double cpx = rt_ak * Rpq[_ix*nst_per_block];
+                            //for i in range(lij+1):
+                            //    trr(i,1) = c0p * trr(i,0) + i*b00 * trr(i-1,0)
+                            if (n < lij3) {
+                                s0x = _gx[0];
+                                s1x = cpx * s0x;
+                                if (i > 0) {
+                                    s1x += i * b00 * _gx[-nst_per_block];
+                                }
+                                _gx[stride_k*nst_per_block] = s1x;
+                            }
+                            //for k in range(1, lk):
+                            //    for i in range(lij+1):
+                            //        trr(i,k+1) = cp * trr(i,k) + k*b01 * trr(i,k-1) + i*b00 * trr(i-1,k)
+                            for (int k = 1; k < lk; ++k) {
+                                __syncthreads();
+                                if (n < lij3) {
+                                    s2x = cpx*s1x + k*b01*s0x;
+                                    if (i > 0) {
+                                        s2x += i * b00 * _gx[(k*stride_k-1)*nst_per_block];
+                                    }
+                                    _gx[(k*stride_k+stride_k)*nst_per_block] = s2x;
+                                    s0x = s1x;
+                                    s1x = s2x;
+                                }
+                            }
+                        }
+                    }
+
+                    // hrr
+                    // g(i,j+1) = rirj * g(i,j) +  g(i+1,j)
+                    // g(...,k,l+1) = rkrl * g(...,k,l) + g(...,k+1,l)
+                    if (lj > 0) {
+                        __syncthreads();
+                        int lk3 = (lk+1)*3;
+                        for (int m = gout_id; m < lk3; m += gout_stride) {
+                            int k = m / 3;
+                            int _ix = m % 3;
+                            double xjxi = rjri[_ix*nst_per_block];
+                            double *_gx = g + (_ix*g_size + k*stride_k) *
+                                nst_per_block;
+                            for (int j = 0; j < lj; ++j) {
+                                int ij = (lij-j) + j*stride_j;
+                                s1x = _gx[ij*nst_per_block];
+                                for (--ij; ij >= j*stride_j; --ij) {
+                                    s0x = _gx[ij*nst_per_block];
+                                    _gx[(ij+stride_j)*nst_per_block] = s1x - xjxi * s0x;
+                                    s1x = s0x;
+                                }
+                            }
+                        }
+                    }
+
+                    __syncthreads();
+#pragma unroll
+                    for (int n = 0; n < GOUT_WIDTH; ++n) {
+                        int ijk = gout_start + n*gout_stride+gout_id;  // output element owned by this thread for slot n
+                        int k  = ijk % nfk;
+                        int ij = ijk / nfk;
+                        if (ij >= nfij) break;
+                        int addrx = (idx_ij[ij] + idx_k[k] * stride_k) * nst_per_block;
+                        int addry = (idy_ij[ij] + idy_k[k] * stride_k) * nst_per_block;
+                        int addrz = (idz_ij[ij] + idz_k[k] * stride_k) * nst_per_block;
+                        gout[n] += gx[addrx] * gy[addry] * gz[addrz];
+                    }
+                }
+            }
+
+            if (ijk_idx < nst) {  // padding tasks do not write any output
+                int naux = bounds.naux;
+                double *eri_tensor = out + shl_pair_idx * nfij * naux
+                        + ksh_in_auxmol * nfk;
+                for (int n = 0; n < GOUT_WIDTH; ++n) {
+                    int ijk = gout_start + n*gout_stride+gout_id;
+                    int k  = ijk % nfk;
+                    int ij = ijk / nfk;
+                    if (ij >= nfij) break;
+                    eri_tensor[ij * naux + k] = gout[n];  // rows indexed by ij with row length naux, columns by k
+                }
+            }
+        }
+    }
+}
diff --git a/gpu4pyscf/lib/gint-rys/fill_int3c2e_bdiv.cu b/gpu4pyscf/lib/gint-rys/fill_int3c2e_bdiv.cu
new file mode 100644
index 00000000..4111e657
--- /dev/null
+++ b/gpu4pyscf/lib/gint-rys/fill_int3c2e_bdiv.cu
@@ -0,0 +1,330 @@
+/*
+ * Copyright 2025 The PySCF Developers. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <stdio.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <cuda_runtime.h>
+
+#include "gvhf-rys/vhf.cuh"
+#include "gvhf-rys/rys_roots.cu"
+#include "int3c2e.cuh"
+
+// TODO: benchmark performance for 32, 38, 40, 45, 54
+#define GOUT_WIDTH      45
+
+__device__ int int3c2e_bdiv_unrolled(double *out, Int3c2eEnvVars envs, BDiv3c2eBounds bounds);
+
+__global__
+void int3c2e_bdiv_kernel(double *out, Int3c2eEnvVars envs, BDiv3c2eBounds bounds)
+{
+    if (int3c2e_bdiv_unrolled(out, envs, bounds)) {
+        return;
+    }
+    // For better load balance, consume blocks in the reversed order
+    int sp_block_id = gridDim.x - blockIdx.x - 1;
+    int ksh_block_id = gridDim.y - blockIdx.y - 1;
+    int shl_pair0 = bounds.shl_pair_offsets[sp_block_id];
+    int shl_pair1 = bounds.shl_pair_offsets[sp_block_id+1];
+    int ksh0 = bounds.ksh_offsets[ksh_block_id];
+    int ksh1 = bounds.ksh_offsets[ksh_block_id+1];
+    int nksh = ksh1 - ksh0;
+    int nshl_pair = shl_pair1 - shl_pair0;
+    int bas_ij0 = bounds.bas_ij_idx[shl_pair0];
+    int nbas = envs.nbas;
+    int ish0 = bas_ij0 / nbas;
+    int jsh0 = bas_ij0 % nbas;
+
+    int *bas = envs.bas;
+    int li = bas[ish0*BAS_SLOTS+ANG_OF];
+    int lj = bas[jsh0*BAS_SLOTS+ANG_OF];
+    int lk = bas[ksh0*BAS_SLOTS+ANG_OF];
+    int lij = li + lj;
+    int nroots = (lij + lk) / 2 + 1;
+    int nfi = (li + 1) * (li + 2) / 2;
+    int nfj = (lj + 1) * (lj + 2) / 2;
+    int nfk = (lk + 1) * (lk + 2) / 2;
+    int nfij = nfi * nfj;
+    int *idx_ij = c_g_pair_idx + c_g_pair_offsets[li*LMAX1+lj];
+    int *idy_ij = idx_ij + nfij;
+    int *idz_ij = idy_ij + nfij;
+    int lk_offset = lk * (lk + 1) * (lk + 2) / 2;
+    int *idx_k = c_g_cart_idx + lk_offset;
+    int *idy_k = idx_k + nfk;
+    int *idz_k = idy_k + nfk;
+    int stride_j = li + 1;
+    int stride_k = stride_j * (lj + 1);
+    int g_size = stride_k * (lk + 1);
+    int iprim = bas[ish0*BAS_SLOTS+NPRIM_OF];
+    int jprim = bas[jsh0*BAS_SLOTS+NPRIM_OF];
+    int kprim = bas[ksh0*BAS_SLOTS+NPRIM_OF];
+    int ijprim = iprim * jprim;
+    int ijkprim = ijprim * kprim;
+
+    int nst_per_block = blockDim.x;
+    if (lij + lk > 2) {
+        nst_per_block = bounds.nst_lookup[(lk*LMAX1+lj)*LMAX1+li];
+    }
+    int gout_stride = blockDim.x / nst_per_block;
+    int thread_id = threadIdx.x;
+    int st_id = thread_id % nst_per_block;
+    int gout_id = thread_id / nst_per_block;
+    double *env = envs.env;
+    double omega = env[PTR_RANGE_OMEGA];
+    if (omega < 0) {
+        nroots *= 2;
+    }
+    int gx_len = g_size * nst_per_block;
+    extern __shared__ double rw_buffer[];
+    double *rw = rw_buffer + st_id;
+    double *g = rw + nst_per_block * nroots*2;
+    double *gx = g;
+    double *gy = gx + gx_len;
+    double *gz = gy + gx_len;
+    double *Rpq = gz + gx_len;
+    double *rjri = Rpq + nst_per_block * 3;
+    double gout[GOUT_WIDTH];
+    int naux = bounds.naux;
+    double *out_local = out + bounds.ao_pair_loc[sp_block_id] * naux;
+
+    if (gout_id == 0) {
+        gx[0] = 1.;
+    }
+
+    int nst = nshl_pair * nksh;
+    for (int ijk_idx = st_id; ijk_idx < nst+st_id; ijk_idx += nst_per_block) {
+        // convert task_id to ish, jsh, ksh
+        int shl_pair_in_block = ijk_idx / nksh;
+        int ksh_in_block = ijk_idx % nksh;
+        __syncthreads();
+        if (ijk_idx >= nst) {
+            shl_pair_in_block = 0;
+            if (gout_id == 0) {
+                gx[0] = 0.;
+            }
+        }
+        int ksh = ksh_in_block + ksh0;
+        int bas_ij = bounds.bas_ij_idx[shl_pair_in_block + shl_pair0];
+        int ish = bas_ij / nbas;
+        int jsh = bas_ij % nbas;
+        double *expi = env + bas[ish*BAS_SLOTS+PTR_EXP];
+        double *expj = env + bas[jsh*BAS_SLOTS+PTR_EXP];
+        double *expk = env + bas[ksh*BAS_SLOTS+PTR_EXP];
+        double *ci = env + bas[ish*BAS_SLOTS+PTR_COEFF];
+        double *cj = env + bas[jsh*BAS_SLOTS+PTR_COEFF];
+        double *ck = env + bas[ksh*BAS_SLOTS+PTR_COEFF];
+        double *ri = env + bas[ish*BAS_SLOTS+PTR_BAS_COORD];
+        double *rj = env + bas[jsh*BAS_SLOTS+PTR_BAS_COORD];
+        double *rk = env + bas[ksh*BAS_SLOTS+PTR_BAS_COORD];
+        if (gout_id == 0) {
+            double xjxi = rj[0] - ri[0];
+            double yjyi = rj[1] - ri[1];
+            double zjzi = rj[2] - ri[2];
+            double rr_ij = xjxi*xjxi + yjyi*yjyi + zjzi*zjzi;
+            rjri[0*nst_per_block] = xjxi;
+            rjri[1*nst_per_block] = yjyi;
+            rjri[2*nst_per_block] = zjzi;
+            rjri[3*nst_per_block] = rr_ij;
+        }
+
+        for (int gout_start = 0; gout_start < nfij*nfk;
+             gout_start+=gout_stride*GOUT_WIDTH) {
+#pragma unroll
+            for (int n = 0; n < GOUT_WIDTH; ++n) { gout[n] = 0; }
+
+            for (int ijkp = 0; ijkp < ijkprim; ++ijkp) {
+                int ijp = ijkp / kprim;
+                int kp = ijkp % kprim;
+                int ip = ijp / jprim;
+                int jp = ijp % jprim;
+                double ai = expi[ip];
+                double aj = expj[jp];
+                double ak = expk[kp];
+                double aij = ai + aj;
+                double aj_aij = aj / aij;
+                __syncthreads();
+                double xij = rjri[0*nst_per_block] * aj_aij + ri[0];
+                double yij = rjri[1*nst_per_block] * aj_aij + ri[1];
+                double zij = rjri[2*nst_per_block] * aj_aij + ri[2];
+                double xpq = xij - rk[0];
+                double ypq = yij - rk[1];
+                double zpq = zij - rk[2];
+                if (gout_id == 0) {
+                    double cijk = ci[ip] * cj[jp] * ck[kp];
+                    double fac = PI_FAC * cijk / (aij*ak*sqrt(aij+ak));
+                    double theta_ij = ai * aj_aij;
+                    double Kab = theta_ij * rjri[3*nst_per_block];
+                    gy[0] = fac * exp(-Kab);
+                    Rpq[0*nst_per_block] = xpq;
+                    Rpq[1*nst_per_block] = ypq;
+                    Rpq[2*nst_per_block] = zpq;
+                }
+                double rr = xpq*xpq + ypq*ypq + zpq*zpq;
+                double theta = aij * ak / (aij + ak);
+                double theta_rr = theta * rr;
+                if (omega == 0) {
+                    rys_roots(nroots, theta_rr, rw, nst_per_block, gout_id, gout_stride);
+                } else if (omega > 0) {
+                    double omega2 = omega * omega;
+                    double theta_fac = omega2 / (omega2 + theta);
+                    rys_roots(nroots, theta_fac*theta_rr, rw, nst_per_block, gout_id, gout_stride);
+                    __syncthreads();
+                    double sqrt_theta_fac = sqrt(theta_fac);
+                    for (int irys = gout_id; irys < nroots; irys+=gout_stride) {
+                        rw[ irys*2   *nst_per_block] *= theta_fac;
+                        rw[(irys*2+1)*nst_per_block] *= sqrt_theta_fac;
+                    }
+                } else {
+                    double omega2 = omega * omega;
+                    double theta_fac = omega2 / (omega2 + theta);
+                    int _nroots = nroots/2;
+                    rys_roots(_nroots, theta_rr, rw+nroots*nst_per_block,
+                              nst_per_block, gout_id, gout_stride);
+                    rys_roots(_nroots, theta_fac*theta_rr, rw,
+                              nst_per_block, gout_id, gout_stride);
+                    __syncthreads();
+                    double sqrt_theta_fac = -sqrt(theta_fac);
+                    for (int irys = gout_id; irys < _nroots; irys+=gout_stride) {
+                        rw[ irys*2   *nst_per_block] *= theta_fac;
+                        rw[(irys*2+1)*nst_per_block] *= sqrt_theta_fac;
+                    }
+                }
+                double s0x, s1x, s2x;
+                for (int irys = 0; irys < nroots; ++irys) {
+                    __syncthreads();
+                    if (gout_id == 0) {
+                        gz[0] = rw[(irys*2+1)*nst_per_block];
+                    }
+                    double rt = rw[ irys*2   *nst_per_block];
+                    double rt_aa = rt / (aij + ak);
+
+                    if (lij > 0) {
+                        __syncthreads();
+                        double rt_aij = rt_aa * ak;
+                        double b10 = .5/aij * (1 - rt_aij);
+                        // gx(0,n+1) = c0*gx(0,n) + n*b10*gx(0,n-1)
+                        for (int n = gout_id; n < 3; n += gout_stride) {
+                            double *_gx = gx + n * gx_len;
+                            double xjxi = rjri[n*nst_per_block];
+                            double xpa = xjxi * aj_aij;
+                            //double c0x = Rpa[ir] - rt_aij * Rpq[n*nst_per_block];
+                            double c0x = xpa - rt_aij * Rpq[n*nst_per_block];
+                            s0x = _gx[0];
+                            s1x = c0x * s0x;
+                            _gx[nst_per_block] = s1x;
+                            for (int i = 1; i < lij; ++i) {
+                                s2x = c0x * s1x + i * b10 * s0x;
+                                _gx[(i+1)*nst_per_block] = s2x;
+                                s0x = s1x;
+                                s1x = s2x;
+                            }
+                        }
+                    }
+
+                    if (lk > 0) {
+                        int lij3 = (lij+1)*3;
+                        double rt_ak  = rt_aa * aij;
+                        double b00 = .5 * rt_aa;
+                        double b01 = .5/ak  * (1 - rt_ak );
+                        for (int n = gout_id; n < lij3+gout_id; n += gout_stride) {
+                            __syncthreads();
+                            int i = n / 3; //for i in range(lij+1):
+                            int _ix = n % 3; // TODO: remove _ix for nroots > 2
+                            double *_gx = gx + (i + _ix * g_size) * nst_per_block;
+                            double cpx = rt_ak * Rpq[_ix*nst_per_block];
+                            //for i in range(lij+1):
+                            //    trr(i,1) = c0p * trr(i,0) + i*b00 * trr(i-1,0)
+                            if (n < lij3) {
+                                s0x = _gx[0];
+                                s1x = cpx * s0x;
+                                if (i > 0) {
+                                    s1x += i * b00 * _gx[-nst_per_block];
+                                }
+                                _gx[stride_k*nst_per_block] = s1x;
+                            }
+                            //for k in range(1, lk):
+                            //    for i in range(lij+1):
+                            //        trr(i,k+1) = cp * trr(i,k) + k*b01 * trr(i,k-1) + i*b00 * trr(i-1,k)
+                            for (int k = 1; k < lk; ++k) {
+                                __syncthreads();
+                                if (n < lij3) {
+                                    s2x = cpx*s1x + k*b01*s0x;
+                                    if (i > 0) {
+                                        s2x += i * b00 * _gx[(k*stride_k-1)*nst_per_block];
+                                    }
+                                    _gx[(k*stride_k+stride_k)*nst_per_block] = s2x;
+                                    s0x = s1x;
+                                    s1x = s2x;
+                                }
+                            }
+                        }
+                    }
+
+                    // hrr
+                    // g(i,j+1) = rirj * g(i,j) +  g(i+1,j)
+                    // g(...,k,l+1) = rkrl * g(...,k,l) + g(...,k+1,l)
+                    if (lj > 0) {
+                        __syncthreads();
+                        int lk3 = (lk+1)*3;
+                        for (int m = gout_id; m < lk3; m += gout_stride) {
+                            int k = m / 3;
+                            int _ix = m % 3;
+                            double xjxi = rjri[_ix*nst_per_block];
+                            double *_gx = g + (_ix*g_size + k*stride_k) *
+                                nst_per_block;
+                            for (int j = 0; j < lj; ++j) {
+                                int ij = (lij-j) + j*stride_j;
+                                s1x = _gx[ij*nst_per_block];
+                                for (--ij; ij >= j*stride_j; --ij) {
+                                    s0x = _gx[ij*nst_per_block];
+                                    _gx[(ij+stride_j)*nst_per_block] = s1x - xjxi * s0x;
+                                    s1x = s0x;
+                                }
+                            }
+                        }
+                    }
+
+                    __syncthreads();
+#pragma unroll
+                    for (int n = 0; n < GOUT_WIDTH; ++n) {
+                        int ijk = gout_start + n*gout_stride+gout_id;
+                        int k  = ijk % nfk;
+                        int ij = ijk / nfk;
+                        if (ij >= nfij) break;
+                        int addrx = (idx_ij[ij] + idx_k[k] * stride_k) * nst_per_block;
+                        int addry = (idy_ij[ij] + idy_k[k] * stride_k) * nst_per_block;
+                        int addrz = (idz_ij[ij] + idz_k[k] * stride_k) * nst_per_block;
+                        gout[n] += gx[addrx] * gy[addry] * gz[addrz];
+                    }
+                }
+            }
+
+            if (ijk_idx < nst) {
+                int *ao_loc = envs.ao_loc;
+                int k0 = ao_loc[ksh0] - ao_loc[nbas];
+                double *eri_tensor = out_local + shl_pair_in_block * nfij * naux
+                        + k0 + ksh_in_block * nfk;
+                for (int n = 0; n < GOUT_WIDTH; ++n) {
+                    int ijk = gout_start + n*gout_stride+gout_id;
+                    int k  = ijk % nfk;
+                    int ij = ijk / nfk;
+                    if (ij >= nfij) break;
+                    eri_tensor[ij * naux + k] = gout[n];
+                }
+            }
+        }
+    }
+}
diff --git a/gpu4pyscf/lib/gint-rys/gint_driver.cu b/gpu4pyscf/lib/gint-rys/gint_driver.cu
new file mode 100644
index 00000000..b7844fd7
--- /dev/null
+++ b/gpu4pyscf/lib/gint-rys/gint_driver.cu
@@ -0,0 +1,145 @@
+/*
+ * Copyright 2024 The PySCF Developers. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <stdio.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+#include "gvhf-rys/vhf.cuh"
+#include "int3c2e.cuh"
+
+__constant__ int c_g_pair_idx[3675]; // corresponding to LMAX=4; uploaded by init_constant()
+__constant__ int c_g_pair_offsets[LMAX1*LMAX1]; // uploaded from the `offsets` argument of init_constant()
+__constant__ int c_g_cart_idx[252]; // corresponding to L_AUX_MAX=6: 3 * sum_{l=0..6} (l+1)(l+2)/2 ints
+
+extern __global__ // generic kernel; only declared here, not defined in this file
+void int3c2e_kernel(double *out, Int3c2eEnvVars envs, Int3c2eBounds bounds);
+int int3c2e_unrolled(double *out, Int3c2eEnvVars *envs, Int3c2eBounds *bounds); // a zero return makes fill_int3c2e() launch int3c2e_kernel
+
+extern __global__ // launched by fill_int3c2e_bdiv(); only declared here, not defined in this file
+void int3c2e_bdiv_kernel(double *out, Int3c2eEnvVars envs, BDiv3c2eBounds bounds);
+
+extern "C" {
+int fill_int3c2e(double *out, Int3c2eEnvVars *envs, int *scheme, int *shls_slice,
+                 int *aux_loc, int naux, int nshl_pair, int *bas_ij_idx,
+                 int *atm, int natm, int *bas, int nbas, double *env)
+{   // Launches the int3c2e kernel for one batch of shells; returns 0 on success, 1 if CUDA reports an error.
+    uint16_t ish0 = shls_slice[0]; // bas and env are dereferenced on the host below; atm and natm are not used
+    uint16_t jsh0 = shls_slice[2];
+    uint16_t ksh0 = shls_slice[4] + nbas; // k (auxiliary) shell ids are offset by nbas when indexing bas
+    uint16_t ksh1 = shls_slice[5] + nbas;
+    uint16_t nksh = ksh1 - ksh0;
+    uint8_t li = bas[ANG_OF + ish0*BAS_SLOTS]; // l and primitive counts are read from the first shell of each slice only
+    uint8_t lj = bas[ANG_OF + jsh0*BAS_SLOTS];
+    uint8_t lk = bas[ANG_OF + ksh0*BAS_SLOTS];
+    uint8_t iprim = bas[NPRIM_OF + ish0*BAS_SLOTS];
+    uint8_t jprim = bas[NPRIM_OF + jsh0*BAS_SLOTS];
+    uint8_t kprim = bas[NPRIM_OF + ksh0*BAS_SLOTS];
+    uint8_t nfi = (li+1)*(li+2)/2; // number of (ix,iy,iz) combinations with ix+iy+iz = l
+    uint8_t nfj = (lj+1)*(lj+2)/2;
+    uint8_t nfk = (lk+1)*(lk+2)/2;
+    uint8_t nfij = nfi * nfj; // NOTE(review): stored in 8 bits; 15*15 = 225 fits for l <= 4 — confirm li, lj never exceed that
+    uint8_t order = li + lj + lk;
+    uint8_t nroots = order / 2 + 1;
+    double omega = env[PTR_RANGE_OMEGA];
+    if (omega < 0) { // SR ERIs
+        nroots *= 2; // the kernels evaluate two sets of roots in this case and sum over both
+    }
+    uint8_t stride_i = 1;
+    uint8_t stride_j = li + 1;
+    uint8_t stride_k = stride_j * (lj + 1);
+    // up to (gg|i)
+    uint8_t g_size = stride_k * (lk + 1);
+    Int3c2eBounds bounds = {li, lj, lk, nroots, nfi, nfij, nfk, // positional init: must follow the Int3c2eBounds field order
+        iprim, jprim, kprim, stride_i, stride_j, stride_k, g_size,
+        (uint16_t)naux, nksh, ksh0, nshl_pair, bas_ij_idx}; // NOTE(review): naux is narrowed to 16 bits — confirm callers keep naux < 65536
+
+    int k0 = aux_loc[ksh0 - nbas];
+    out += k0; // offset when writing output
+    if (!int3c2e_unrolled(out, envs, &bounds)) { // zero return: fall back to the generic kernel
+        int nst_per_block = scheme[0]; // blockDim.x
+        int gout_stride = scheme[1];   // blockDim.y
+        dim3 threads(nst_per_block, gout_stride);
+        int tasks_per_block = BATCHES_PER_BLOCK * nst_per_block;
+        int st_blocks = (nksh*nshl_pair + tasks_per_block - 1) / tasks_per_block; // ceiling division over all (shell pair, k shell) tasks
+        int buflen = (nroots*2+g_size*3+7) * nst_per_block * sizeof(double); // dynamic shared memory size in bytes
+        int3c2e_kernel<<<st_blocks, threads, buflen>>>(out, *envs, bounds);
+    }
+    cudaError_t err = cudaGetLastError(); // checked after either launch path
+    if (err != cudaSuccess) {
+        fprintf(stderr, "CUDA Error in int3c2e_kernel: %s\n", cudaGetErrorString(err));
+        return 1;
+    }
+    return 0;
+}
+
+int fill_int3c2e_bdiv(double *out, Int3c2eEnvVars *envs, int shm_size, int naux,
+                      int nbatches_shl_pair, int nbatches_ksh,
+                      int *shl_pair_offsets, int *ao_pair_loc, int *ksh_offsets,
+                      int *bas_ij_idx, int *nst_lookup,
+                      int *atm, int natm, int *bas, int nbas, double *env)
+{   // Launches int3c2e_bdiv_kernel on a (nbatches_shl_pair x nbatches_ksh) grid; returns 0 on success, 1 on a CUDA error.
+    const dim3 grid(nbatches_shl_pair, nbatches_ksh);
+    const dim3 block(256); // 256 threads per block, one-dimensional
+    BDiv3c2eBounds kern_bounds = { // positional init: must follow the BDiv3c2eBounds field order
+        naux, bas_ij_idx, shl_pair_offsets, ao_pair_loc, ksh_offsets, nst_lookup};
+    int3c2e_bdiv_kernel<<<grid, block, shm_size>>>(out, *envs, kern_bounds);
+    const cudaError_t status = cudaGetLastError();
+    if (status == cudaSuccess) {
+        return 0;
+    }
+    fprintf(stderr, "CUDA Error in int3c2e_bdiv_kernel: %s\n", cudaGetErrorString(status));
+    return 1;
+}
+
+int init_constant(int *g_pair_idx, int *offsets,
+                  double *env, int env_size, int shm_size)
+{   // Uploads the index tables to constant memory and sets the kernels' dynamic shared-memory limit.
+    // Returns 0 on success and 1 if CUDA reports an error; env and env_size are not used here.
+    cudaMemcpyToSymbol(c_g_pair_idx, g_pair_idx, 3675*sizeof(int));
+    cudaMemcpyToSymbol(c_g_pair_offsets, offsets, sizeof(int) * LMAX1*LMAX1);
+
+    int g_cart_idx[252]; // fixed-size local buffer: no heap allocation that could fail unchecked
+    int *idx, *idy, *idz;
+    idx = g_cart_idx;
+    for (int l = 0; l <= L_AUX_MAX; ++l) { // per l: three consecutive runs of nf entries (x, y, z exponents)
+        int nf = (l + 1) * (l + 2) / 2;
+        idy = idx + nf;
+        idz = idy + nf;
+        for (int i = 0, ix = l; ix >= 0; --ix) {
+        for (int iy = l - ix; iy >= 0; --iy, ++i) {
+            int iz = l - ix - iy;
+            idx[i] = ix;
+            idy[i] = iy;
+            idz[i] = iz;
+        } }
+        idx += nf * 3;
+    }
+    cudaMemcpyToSymbol(c_g_cart_idx, g_cart_idx, sizeof(g_cart_idx));
+
+    cudaFuncSetAttribute(int3c2e_kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, shm_size);
+    cudaFuncSetAttribute(int3c2e_bdiv_kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, shm_size);
+    cudaError_t err = cudaGetLastError(); // NOTE(review): may also surface a failure of the copies above
+    if (err != cudaSuccess) {
+        fprintf(stderr, "Failed to set CUDA shm size %d: %s\n", shm_size,
+                cudaGetErrorString(err));
+        return 1;
+    }
+    return 0;
+}
+}
diff --git a/gpu4pyscf/lib/gint-rys/int3c2e.cuh b/gpu4pyscf/lib/gint-rys/int3c2e.cuh
new file mode 100644
index 00000000..b6452471
--- /dev/null
+++ b/gpu4pyscf/lib/gint-rys/int3c2e.cuh
@@ -0,0 +1,75 @@
+/*
+ * Copyright 2024 The PySCF Developers. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <stdint.h>
+#define BATCHES_PER_BLOCK       16 // each thread block covers up to BATCHES_PER_BLOCK * blockDim.x tasks
+#define L_AUX_MAX       6 // highest l for which init_constant() tabulates c_g_cart_idx
+
+#ifndef HAVE_DEFINED_INT3CENVVAS_H
+#define HAVE_DEFINED_INT3CENVVAS_H
+typedef struct { // basis/environment description handed by value to the int3c2e kernels
+    uint16_t natm;
+    uint16_t nbas; // kernels decode a shell-pair id as ish = id / nbas, jsh = id % nbas
+    int *atm;
+    int *bas; // BAS_SLOTS ints per shell, indexed as bas[sh*BAS_SLOTS + slot]
+    double *env; // exponents, coefficients and coordinates are read at offsets stored in bas
+    int *ao_loc;
+    float log_cutoff;
+} Int3c2eEnvVars;
+
+typedef struct { // per-launch parameters; fill_int3c2e() initializes this positionally, so field order matters
+    uint8_t li; // li, lj, lk: angular momentum read from the first i, j, k shell of the batch
+    uint8_t lj;
+    uint8_t lk;
+    uint8_t nroots; // (li+lj+lk)/2 + 1, doubled by fill_int3c2e() when omega < 0
+    uint8_t nfi; // (li+1)*(li+2)/2
+    uint8_t nfij; // nfi * nfj
+    uint8_t nfk; // (lk+1)*(lk+2)/2
+    uint8_t iprim; // iprim, jprim, kprim: number of primitives of the i, j, k shells
+    uint8_t jprim;
+    uint8_t kprim;
+    uint8_t stride_i; // set to 1 by fill_int3c2e()
+    uint8_t stride_j; // li + 1
+    uint8_t stride_k; // stride_j * (lj + 1)
+    uint8_t g_size; // stride_k * (lk + 1)
+    uint16_t naux; // row stride of the output: kernels write eri_tensor[row*naux + k]
+    uint16_t nksh; // number of k shells in the batch (ksh1 - ksh0)
+    uint16_t ksh0; // first k shell, already offset by nbas
+    int nshl_pair; // number of entries of bas_ij_idx handled by the launch
+    // The effective basis pair Id = ish*nbas+jsh
+    int *bas_ij_idx;
+} Int3c2eBounds;
+
+typedef struct { // parameters of int3c2e_bdiv_kernel; fill_int3c2e_bdiv() initializes this positionally, so field order matters
+    int naux; // passed through unchanged from the naux argument of fill_int3c2e_bdiv()
+    // The effective basis pair Id = ish*nbas+jsh
+    int *bas_ij_idx;
+    // the bas_ij_idx offset for each blockIdx.x
+    int *shl_pair_offsets;
+    // the AO-pair offset (address) in the output tensor for each blockIdx.x
+    int *ao_pair_loc;
+    // the auxiliary function offset (address) in the output tensor for each blockIdx.y
+    int *ksh_offsets;
+    // nst_per_block for each (li,lj,lk) pattern
+    int *nst_lookup;
+} BDiv3c2eBounds;
+
+#ifdef __CUDACC__ // constant-memory tables; their storage is defined in gint_driver.cu
+extern __constant__ int c_g_pair_idx[];
+extern __constant__ int c_g_pair_offsets[];
+extern __constant__ int c_g_cart_idx[];
+#endif
+#endif
diff --git a/gpu4pyscf/lib/gint-rys/rys_roots_dat.cu b/gpu4pyscf/lib/gint-rys/rys_roots_dat.cu
new file mode 100644
index 00000000..1644fc8c
--- /dev/null
+++ b/gpu4pyscf/lib/gint-rys/rys_roots_dat.cu
@@ -0,0 +1 @@
+#include "gvhf-rys/rys_roots_dat.cu"
diff --git a/gpu4pyscf/lib/gint-rys/unrolled_int3c2e.cu b/gpu4pyscf/lib/gint-rys/unrolled_int3c2e.cu
new file mode 100644
index 00000000..b1714ee2
--- /dev/null
+++ b/gpu4pyscf/lib/gint-rys/unrolled_int3c2e.cu
@@ -0,0 +1,3947 @@
+#include <stdio.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <cuda_runtime.h>
+#include <cuda.h>
+#include "gvhf-rys/vhf.cuh"
+#include "gvhf-rys/rys_roots.cu"
+#include "int3c2e.cuh"
+
+
+#if CUDA_VERSION >= 12040 // __maxnreg__ is only applied when building with CUDA 12.4 or newer
+__global__ __maxnreg__(128)
+#else
+__global__
+#endif
+void int3c2e_000(double *out, Int3c2eEnvVars envs, Int3c2eBounds bounds)
+{   // Unrolled kernel that writes a single value per (shell pair, k shell) task; presumably the li=lj=lk=0 case.
+    int nst_per_block = blockDim.x;
+    int st_id = threadIdx.x;
+    int batch_id = blockIdx.x;
+    int iprim = bounds.iprim;
+    int jprim = bounds.jprim;
+    int kprim = bounds.kprim;
+    int ijprim = iprim * jprim;
+    int ijkprim = ijprim * kprim;
+    int *bas = envs.bas;
+    int nbas = envs.nbas;
+    double *env = envs.env;
+    double omega = env[PTR_RANGE_OMEGA];
+    extern __shared__ double rw_cache[];
+    double *rw = rw_cache + st_id; // this thread's slots in shared memory are nst_per_block apart
+
+    int nksh = bounds.nksh;
+    int nshl_pair = bounds.nshl_pair;
+    int nst = nksh * nshl_pair; // total number of (shell pair, k shell) tasks
+    int st0 = batch_id * nst_per_block * BATCHES_PER_BLOCK;
+    int st1 = MIN(nst, st0 + nst_per_block * BATCHES_PER_BLOCK);
+    for (int ijk_idx = st0 + st_id; ijk_idx < st1; ijk_idx += nst_per_block) { // one task per thread per pass
+        int ksh_in_auxmol = ijk_idx % nksh; // the k shell is the fast index of the task id
+        int ksh = ksh_in_auxmol + bounds.ksh0;
+        int shl_pair_idx = ijk_idx / nksh;
+        int bas_ij = bounds.bas_ij_idx[shl_pair_idx];
+        int ish = bas_ij / nbas;
+        int jsh = bas_ij % nbas;
+        double *expi = env + bas[ish*BAS_SLOTS+PTR_EXP];
+        double *expj = env + bas[jsh*BAS_SLOTS+PTR_EXP];
+        double *expk = env + bas[ksh*BAS_SLOTS+PTR_EXP];
+        double *ci = env + bas[ish*BAS_SLOTS+PTR_COEFF];
+        double *cj = env + bas[jsh*BAS_SLOTS+PTR_COEFF];
+        double *ck = env + bas[ksh*BAS_SLOTS+PTR_COEFF];
+        double *ri = env + bas[ish*BAS_SLOTS+PTR_BAS_COORD];
+        double *rj = env + bas[jsh*BAS_SLOTS+PTR_BAS_COORD];
+        double *rk = env + bas[ksh*BAS_SLOTS+PTR_BAS_COORD];
+        double xi = ri[0];
+        double yi = ri[1];
+        double zi = ri[2];
+        double xk = rk[0];
+        double yk = rk[1];
+        double zk = rk[2];
+        double xjxi = rj[0] - xi;
+        double yjyi = rj[1] - yi;
+        double zjzi = rj[2] - zi;
+        double rr_ij = xjxi*xjxi + yjyi*yjyi + zjzi*zjzi; // squared distance between the i and j centers
+        double gout0 = 0;
+        for (int ijkp = 0; ijkp < ijkprim; ++ijkp) { // flattened loop over primitive triples (ip, jp, kp)
+            int ijp = ijkp / kprim;
+            int kp = ijkp % kprim;
+            int ip = ijp / jprim;
+            int jp = ijp % jprim;
+            double ai = expi[ip];
+            double aj = expj[jp];
+            double ak = expk[kp];
+            double aij = ai + aj;
+            double cijk = ci[ip] * cj[jp] * ck[kp];
+            double fac = PI_FAC * cijk / (aij*ak*sqrt(aij+ak));
+            double aj_aij = aj / aij;
+            double theta_ij = ai * aj_aij;
+            double Kab = theta_ij * rr_ij;
+            double fac1 = fac * exp(-Kab);
+            double xij = xjxi * aj_aij + xi; // exponent-weighted center of the i,j primitive pair
+            double yij = yjyi * aj_aij + yi;
+            double zij = zjzi * aj_aij + zi;
+            double xpq = xij - xk;
+            double ypq = yij - yk;
+            double zpq = zij - zk;
+            double rr = xpq * xpq + ypq * ypq + zpq * zpq;
+            double theta = aij * ak / (aij + ak);
+            double theta_rr = theta * rr;
+            if (omega == 0) {
+                rys_roots(1, theta_rr, rw, nst_per_block, 0, 1); // fills rw; the odd slots are read back below as weights
+            } else if (omega > 0) {
+                double omega2 = omega * omega;
+                double theta_fac = omega2 / (omega2 + theta);
+                rys_roots(1, theta_fac*theta_rr, rw, nst_per_block, 0, 1);
+                double sqrt_theta_fac = sqrt(theta_fac);
+                for (int irys = 0; irys < 1; ++irys) { // rescale the even slot by theta_fac and the odd slot by its square root
+                    rw[ irys*2   *nst_per_block] *= theta_fac;
+                    rw[(irys*2+1)*nst_per_block] *= sqrt_theta_fac;
+                }
+            } else { // omega < 0: two sets are produced, the rescaled one with negated weights
+                double omega2 = omega * omega;
+                double theta_fac = omega2 / (omega2 + theta);
+                double *rw1 = rw + 2*nst_per_block; // second set, stored right after the first and left unscaled
+                rys_roots(1, theta_rr, rw1, nst_per_block, 0, 1);
+                rys_roots(1, theta_fac*theta_rr, rw, nst_per_block, 0, 1);
+                double sqrt_theta_fac = -sqrt(theta_fac);
+                for (int irys = 0; irys < 1; ++irys) {
+                    rw[ irys*2   *nst_per_block] *= theta_fac;
+                    rw[(irys*2+1)*nst_per_block] *= sqrt_theta_fac;
+                }
+            }
+            for (int irys = 0; irys < bounds.nroots; ++irys) { // bounds.nroots is set by the host (doubled there when omega < 0)
+                double wt = rw[(2*irys+1)*nst_per_block];
+                gout0 += 1 * fac1 * wt;
+            }
+        }
+        int naux = bounds.naux;
+        double *eri_tensor = out + shl_pair_idx * 1 * naux + ksh_in_auxmol * 1; // one output row per shell pair, one column per k shell
+        eri_tensor[0*naux + 0] = gout0;
+    }
+}
+
+#if CUDA_VERSION >= 12040 // __maxnreg__ is only applied when building with CUDA 12.4 or newer
+__global__ __maxnreg__(128)
+#else
+__global__
+#endif
+void int3c2e_100(double *out, Int3c2eEnvVars envs, Int3c2eBounds bounds)
+{   // Unrolled kernel that writes three values per (shell pair, k shell) task; presumably the li=1, lj=lk=0 case.
+    int nst_per_block = blockDim.x;
+    int st_id = threadIdx.x;
+    int batch_id = blockIdx.x;
+    int iprim = bounds.iprim;
+    int jprim = bounds.jprim;
+    int kprim = bounds.kprim;
+    int ijprim = iprim * jprim;
+    int ijkprim = ijprim * kprim;
+    int *bas = envs.bas;
+    int nbas = envs.nbas;
+    double *env = envs.env;
+    double omega = env[PTR_RANGE_OMEGA];
+    extern __shared__ double rw_cache[];
+    double *rw = rw_cache + st_id; // this thread's slots in shared memory are nst_per_block apart
+
+    int nksh = bounds.nksh;
+    int nshl_pair = bounds.nshl_pair;
+    int nst = nksh * nshl_pair; // total number of (shell pair, k shell) tasks
+    int st0 = batch_id * nst_per_block * BATCHES_PER_BLOCK;
+    int st1 = MIN(nst, st0 + nst_per_block * BATCHES_PER_BLOCK);
+    for (int ijk_idx = st0 + st_id; ijk_idx < st1; ijk_idx += nst_per_block) { // one task per thread per pass
+        int ksh_in_auxmol = ijk_idx % nksh; // the k shell is the fast index of the task id
+        int ksh = ksh_in_auxmol + bounds.ksh0;
+        int shl_pair_idx = ijk_idx / nksh;
+        int bas_ij = bounds.bas_ij_idx[shl_pair_idx];
+        int ish = bas_ij / nbas;
+        int jsh = bas_ij % nbas;
+        double *expi = env + bas[ish*BAS_SLOTS+PTR_EXP];
+        double *expj = env + bas[jsh*BAS_SLOTS+PTR_EXP];
+        double *expk = env + bas[ksh*BAS_SLOTS+PTR_EXP];
+        double *ci = env + bas[ish*BAS_SLOTS+PTR_COEFF];
+        double *cj = env + bas[jsh*BAS_SLOTS+PTR_COEFF];
+        double *ck = env + bas[ksh*BAS_SLOTS+PTR_COEFF];
+        double *ri = env + bas[ish*BAS_SLOTS+PTR_BAS_COORD];
+        double *rj = env + bas[jsh*BAS_SLOTS+PTR_BAS_COORD];
+        double *rk = env + bas[ksh*BAS_SLOTS+PTR_BAS_COORD];
+        double xi = ri[0];
+        double yi = ri[1];
+        double zi = ri[2];
+        double xk = rk[0];
+        double yk = rk[1];
+        double zk = rk[2];
+        double xjxi = rj[0] - xi;
+        double yjyi = rj[1] - yi;
+        double zjzi = rj[2] - zi;
+        double rr_ij = xjxi*xjxi + yjyi*yjyi + zjzi*zjzi; // squared distance between the i and j centers
+        double gout0 = 0;
+        double gout1 = 0;
+        double gout2 = 0;
+        for (int ijkp = 0; ijkp < ijkprim; ++ijkp) { // flattened loop over primitive triples (ip, jp, kp)
+            int ijp = ijkp / kprim;
+            int kp = ijkp % kprim;
+            int ip = ijp / jprim;
+            int jp = ijp % jprim;
+            double ai = expi[ip];
+            double aj = expj[jp];
+            double ak = expk[kp];
+            double aij = ai + aj;
+            double cijk = ci[ip] * cj[jp] * ck[kp];
+            double fac = PI_FAC * cijk / (aij*ak*sqrt(aij+ak));
+            double aj_aij = aj / aij;
+            double theta_ij = ai * aj_aij;
+            double Kab = theta_ij * rr_ij;
+            double fac1 = fac * exp(-Kab);
+            double xij = xjxi * aj_aij + xi; // exponent-weighted center of the i,j primitive pair
+            double yij = yjyi * aj_aij + yi;
+            double zij = zjzi * aj_aij + zi;
+            double xpq = xij - xk;
+            double ypq = yij - yk;
+            double zpq = zij - zk;
+            double rr = xpq * xpq + ypq * ypq + zpq * zpq;
+            double theta = aij * ak / (aij + ak);
+            double theta_rr = theta * rr;
+            if (omega == 0) {
+                rys_roots(1, theta_rr, rw, nst_per_block, 0, 1); // fills rw; read back below as root (even slot) and weight (odd slot)
+            } else if (omega > 0) {
+                double omega2 = omega * omega;
+                double theta_fac = omega2 / (omega2 + theta);
+                rys_roots(1, theta_fac*theta_rr, rw, nst_per_block, 0, 1);
+                double sqrt_theta_fac = sqrt(theta_fac);
+                for (int irys = 0; irys < 1; ++irys) { // rescale roots by theta_fac and weights by its square root
+                    rw[ irys*2   *nst_per_block] *= theta_fac;
+                    rw[(irys*2+1)*nst_per_block] *= sqrt_theta_fac;
+                }
+            } else { // omega < 0: two sets are produced, the rescaled one with negated weights
+                double omega2 = omega * omega;
+                double theta_fac = omega2 / (omega2 + theta);
+                double *rw1 = rw + 2*nst_per_block; // second set, stored right after the first and left unscaled
+                rys_roots(1, theta_rr, rw1, nst_per_block, 0, 1);
+                rys_roots(1, theta_fac*theta_rr, rw, nst_per_block, 0, 1);
+                double sqrt_theta_fac = -sqrt(theta_fac);
+                for (int irys = 0; irys < 1; ++irys) {
+                    rw[ irys*2   *nst_per_block] *= theta_fac;
+                    rw[(irys*2+1)*nst_per_block] *= sqrt_theta_fac;
+                }
+            }
+            for (int irys = 0; irys < bounds.nroots; ++irys) { // bounds.nroots is set by the host (doubled there when omega < 0)
+                double wt = rw[(2*irys+1)*nst_per_block];
+                double rt = rw[ 2*irys   *nst_per_block];
+                double rt_aa = rt / (aij + ak);
+                double rt_aij = rt_aa * ak;
+                double c0x = xjxi * aj_aij - xpq*rt_aij;
+                double trr_10x = c0x * 1; // order-0 terms of the x, y, z factors are 1, fac1 and wt respectively
+                gout0 += trr_10x * fac1 * wt;
+                double c0y = yjyi * aj_aij - ypq*rt_aij;
+                double trr_10y = c0y * fac1;
+                gout1 += 1 * trr_10y * wt;
+                double c0z = zjzi * aj_aij - zpq*rt_aij;
+                double trr_10z = c0z * wt;
+                gout2 += 1 * fac1 * trr_10z;
+            }
+        }
+        int naux = bounds.naux;
+        double *eri_tensor = out + shl_pair_idx * 3 * naux + ksh_in_auxmol * 1; // three output rows per shell pair, one column per k shell
+        eri_tensor[0*naux + 0] = gout0;
+        eri_tensor[1*naux + 0] = gout1;
+        eri_tensor[2*naux + 0] = gout2;
+    }
+}
+
+#if CUDA_VERSION >= 12040 // __maxnreg__ is only applied when building with CUDA 12.4 or newer
+__global__ __maxnreg__(128)
+#else
+__global__
+#endif
+void int3c2e_110(double *out, Int3c2eEnvVars envs, Int3c2eBounds bounds)
+{   // Unrolled kernel that writes nine values per (shell pair, k shell) task; presumably the li=lj=1, lk=0 case.
+    int nst_per_block = blockDim.x;
+    int st_id = threadIdx.x;
+    int batch_id = blockIdx.x;
+    int iprim = bounds.iprim;
+    int jprim = bounds.jprim;
+    int kprim = bounds.kprim;
+    int ijprim = iprim * jprim;
+    int ijkprim = ijprim * kprim;
+    int *bas = envs.bas;
+    int nbas = envs.nbas;
+    double *env = envs.env;
+    double omega = env[PTR_RANGE_OMEGA];
+    extern __shared__ double rw_cache[];
+    double *rw = rw_cache + st_id; // this thread's slots in shared memory are nst_per_block apart
+
+    int nksh = bounds.nksh;
+    int nshl_pair = bounds.nshl_pair;
+    int nst = nksh * nshl_pair; // total number of (shell pair, k shell) tasks
+    int st0 = batch_id * nst_per_block * BATCHES_PER_BLOCK;
+    int st1 = MIN(nst, st0 + nst_per_block * BATCHES_PER_BLOCK);
+    for (int ijk_idx = st0 + st_id; ijk_idx < st1; ijk_idx += nst_per_block) { // one task per thread per pass
+        int ksh_in_auxmol = ijk_idx % nksh; // the k shell is the fast index of the task id
+        int ksh = ksh_in_auxmol + bounds.ksh0;
+        int shl_pair_idx = ijk_idx / nksh;
+        int bas_ij = bounds.bas_ij_idx[shl_pair_idx];
+        int ish = bas_ij / nbas;
+        int jsh = bas_ij % nbas;
+        double *expi = env + bas[ish*BAS_SLOTS+PTR_EXP];
+        double *expj = env + bas[jsh*BAS_SLOTS+PTR_EXP];
+        double *expk = env + bas[ksh*BAS_SLOTS+PTR_EXP];
+        double *ci = env + bas[ish*BAS_SLOTS+PTR_COEFF];
+        double *cj = env + bas[jsh*BAS_SLOTS+PTR_COEFF];
+        double *ck = env + bas[ksh*BAS_SLOTS+PTR_COEFF];
+        double *ri = env + bas[ish*BAS_SLOTS+PTR_BAS_COORD];
+        double *rj = env + bas[jsh*BAS_SLOTS+PTR_BAS_COORD];
+        double *rk = env + bas[ksh*BAS_SLOTS+PTR_BAS_COORD];
+        double xi = ri[0];
+        double yi = ri[1];
+        double zi = ri[2];
+        double xk = rk[0];
+        double yk = rk[1];
+        double zk = rk[2];
+        double xjxi = rj[0] - xi;
+        double yjyi = rj[1] - yi;
+        double zjzi = rj[2] - zi;
+        double rr_ij = xjxi*xjxi + yjyi*yjyi + zjzi*zjzi; // squared distance between the i and j centers
+        double gout0 = 0;
+        double gout1 = 0;
+        double gout2 = 0;
+        double gout3 = 0;
+        double gout4 = 0;
+        double gout5 = 0;
+        double gout6 = 0;
+        double gout7 = 0;
+        double gout8 = 0;
+        for (int ijkp = 0; ijkp < ijkprim; ++ijkp) { // flattened loop over primitive triples (ip, jp, kp)
+            int ijp = ijkp / kprim;
+            int kp = ijkp % kprim;
+            int ip = ijp / jprim;
+            int jp = ijp % jprim;
+            double ai = expi[ip];
+            double aj = expj[jp];
+            double ak = expk[kp];
+            double aij = ai + aj;
+            double cijk = ci[ip] * cj[jp] * ck[kp];
+            double fac = PI_FAC * cijk / (aij*ak*sqrt(aij+ak));
+            double aj_aij = aj / aij;
+            double theta_ij = ai * aj_aij;
+            double Kab = theta_ij * rr_ij;
+            double fac1 = fac * exp(-Kab);
+            double xij = xjxi * aj_aij + xi; // exponent-weighted center of the i,j primitive pair
+            double yij = yjyi * aj_aij + yi;
+            double zij = zjzi * aj_aij + zi;
+            double xpq = xij - xk;
+            double ypq = yij - yk;
+            double zpq = zij - zk;
+            double rr = xpq * xpq + ypq * ypq + zpq * zpq;
+            double theta = aij * ak / (aij + ak);
+            double theta_rr = theta * rr;
+            if (omega == 0) {
+                rys_roots(2, theta_rr, rw, nst_per_block, 0, 1); // fills rw; read back below as root (even slot) and weight (odd slot)
+            } else if (omega > 0) {
+                double omega2 = omega * omega;
+                double theta_fac = omega2 / (omega2 + theta);
+                rys_roots(2, theta_fac*theta_rr, rw, nst_per_block, 0, 1);
+                double sqrt_theta_fac = sqrt(theta_fac);
+                for (int irys = 0; irys < 2; ++irys) { // rescale roots by theta_fac and weights by its square root
+                    rw[ irys*2   *nst_per_block] *= theta_fac;
+                    rw[(irys*2+1)*nst_per_block] *= sqrt_theta_fac;
+                }
+            } else { // omega < 0: two sets are produced, the rescaled one with negated weights
+                double omega2 = omega * omega;
+                double theta_fac = omega2 / (omega2 + theta);
+                double *rw1 = rw + 4*nst_per_block; // second set, stored right after the first and left unscaled
+                rys_roots(2, theta_rr, rw1, nst_per_block, 0, 1);
+                rys_roots(2, theta_fac*theta_rr, rw, nst_per_block, 0, 1);
+                double sqrt_theta_fac = -sqrt(theta_fac);
+                for (int irys = 0; irys < 2; ++irys) {
+                    rw[ irys*2   *nst_per_block] *= theta_fac;
+                    rw[(irys*2+1)*nst_per_block] *= sqrt_theta_fac;
+                }
+            }
+            for (int irys = 0; irys < bounds.nroots; ++irys) { // bounds.nroots is set by the host (doubled there when omega < 0)
+                double wt = rw[(2*irys+1)*nst_per_block];
+                double rt = rw[ 2*irys   *nst_per_block];
+                double rt_aa = rt / (aij + ak);
+                double rt_aij = rt_aa * ak;
+                double b10 = .5/aij * (1 - rt_aij);
+                double c0x = xjxi * aj_aij - xpq*rt_aij;
+                double trr_10x = c0x * 1; // order-0 terms of the x, y, z factors are 1, fac1 and wt respectively
+                double trr_20x = c0x * trr_10x + 1*b10 * 1; // trr_20 = c0 * trr_10 + b10 * (order-0 term)
+                double hrr_110x = trr_20x - xjxi * trr_10x; // hrr_110 = trr_20 - (xj - xi) * trr_10
+                gout0 += hrr_110x * fac1 * wt;
+                double hrr_010x = trr_10x - xjxi * 1; // hrr_010 = trr_10 - (xj - xi) * (order-0 term)
+                double c0y = yjyi * aj_aij - ypq*rt_aij;
+                double trr_10y = c0y * fac1;
+                gout1 += hrr_010x * trr_10y * wt;
+                double c0z = zjzi * aj_aij - zpq*rt_aij;
+                double trr_10z = c0z * wt;
+                gout2 += hrr_010x * fac1 * trr_10z;
+                double hrr_010y = trr_10y - yjyi * fac1;
+                gout3 += trr_10x * hrr_010y * wt;
+                double trr_20y = c0y * trr_10y + 1*b10 * fac1;
+                double hrr_110y = trr_20y - yjyi * trr_10y;
+                gout4 += 1 * hrr_110y * wt;
+                gout5 += 1 * hrr_010y * trr_10z;
+                double hrr_010z = trr_10z - zjzi * wt;
+                gout6 += trr_10x * fac1 * hrr_010z;
+                gout7 += 1 * trr_10y * hrr_010z;
+                double trr_20z = c0z * trr_10z + 1*b10 * wt;
+                double hrr_110z = trr_20z - zjzi * trr_10z;
+                gout8 += 1 * fac1 * hrr_110z;
+            }
+        }
+        int naux = bounds.naux;
+        double *eri_tensor = out + shl_pair_idx * 9 * naux + ksh_in_auxmol * 1; // nine output rows per shell pair, one column per k shell
+        eri_tensor[0*naux + 0] = gout0;
+        eri_tensor[1*naux + 0] = gout1;
+        eri_tensor[2*naux + 0] = gout2;
+        eri_tensor[3*naux + 0] = gout3;
+        eri_tensor[4*naux + 0] = gout4;
+        eri_tensor[5*naux + 0] = gout5;
+        eri_tensor[6*naux + 0] = gout6;
+        eri_tensor[7*naux + 0] = gout7;
+        eri_tensor[8*naux + 0] = gout8;
+    }
+}
+
+#if CUDA_VERSION >= 12040  // the __maxnreg__ qualifier is only used when building with CUDA >= 12.4
+__global__ __maxnreg__(128)
+#else
+__global__
+#endif
+void int3c2e_200(double *out, Int3c2eEnvVars envs, Int3c2eBounds bounds)  // writes 6 components per (shell pair, k shell) into out; presumably the li=2, lj=0, lk=0 case - confirm
+{
+    int nst_per_block = blockDim.x;  // threads per block; also the stride between one thread's rw entries
+    int st_id = threadIdx.x;
+    int batch_id = blockIdx.x;
+    int iprim = bounds.iprim;
+    int jprim = bounds.jprim;
+    int kprim = bounds.kprim;
+    int ijprim = iprim * jprim;
+    int ijkprim = ijprim * kprim;  // number of (ip, jp, kp) primitive combinations
+    int *bas = envs.bas;
+    int nbas = envs.nbas;
+    double *env = envs.env;
+    double omega = env[PTR_RANGE_OMEGA];  // its sign selects one of the three root/weight branches below
+    extern __shared__ double rw_cache[];  // dynamically sized shared memory for roots and weights
+    double *rw = rw_cache + st_id;  // this thread's column: entry n is rw[n*nst_per_block]
+
+    int nksh = bounds.nksh;
+    int nshl_pair = bounds.nshl_pair;
+    int nst = nksh * nshl_pair;  // number of (shell pair, k shell) tasks
+    int st0 = batch_id * nst_per_block * BATCHES_PER_BLOCK;  // first task index owned by this block
+    int st1 = MIN(nst, st0 + nst_per_block * BATCHES_PER_BLOCK);  // end of this block's task range, limited to nst
+    for (int ijk_idx = st0 + st_id; ijk_idx < st1; ijk_idx += nst_per_block) {  // each thread steps through the block's tasks
+        int ksh_in_auxmol = ijk_idx % nksh;  // k-shell index varies fastest in the task index
+        int ksh = ksh_in_auxmol + bounds.ksh0;  // shifted by ksh0 to index bas
+        int shl_pair_idx = ijk_idx / nksh;
+        int bas_ij = bounds.bas_ij_idx[shl_pair_idx];  // packed pair, decoded below as ish*nbas + jsh
+        int ish = bas_ij / nbas;
+        int jsh = bas_ij % nbas;
+        double *expi = env + bas[ish*BAS_SLOTS+PTR_EXP];  // bas holds offsets into env
+        double *expj = env + bas[jsh*BAS_SLOTS+PTR_EXP];
+        double *expk = env + bas[ksh*BAS_SLOTS+PTR_EXP];
+        double *ci = env + bas[ish*BAS_SLOTS+PTR_COEFF];
+        double *cj = env + bas[jsh*BAS_SLOTS+PTR_COEFF];
+        double *ck = env + bas[ksh*BAS_SLOTS+PTR_COEFF];
+        double *ri = env + bas[ish*BAS_SLOTS+PTR_BAS_COORD];
+        double *rj = env + bas[jsh*BAS_SLOTS+PTR_BAS_COORD];
+        double *rk = env + bas[ksh*BAS_SLOTS+PTR_BAS_COORD];
+        double xi = ri[0];
+        double yi = ri[1];
+        double zi = ri[2];
+        double xk = rk[0];
+        double yk = rk[1];
+        double zk = rk[2];
+        double xjxi = rj[0] - xi;
+        double yjyi = rj[1] - yi;
+        double zjzi = rj[2] - zi;
+        double rr_ij = xjxi*xjxi + yjyi*yjyi + zjzi*zjzi;  // squared distance between centers i and j
+        double gout0 = 0;  // gout0..gout5 accumulate the six outputs over primitives and roots
+        double gout1 = 0;
+        double gout2 = 0;
+        double gout3 = 0;
+        double gout4 = 0;
+        double gout5 = 0;
+        for (int ijkp = 0; ijkp < ijkprim; ++ijkp) {  // flattened primitive loop: kp fastest, then jp, then ip
+            int ijp = ijkp / kprim;
+            int kp = ijkp % kprim;
+            int ip = ijp / jprim;
+            int jp = ijp % jprim;
+            double ai = expi[ip];
+            double aj = expj[jp];
+            double ak = expk[kp];
+            double aij = ai + aj;
+            double cijk = ci[ip] * cj[jp] * ck[kp];
+            double fac = PI_FAC * cijk / (aij*ak*sqrt(aij+ak));
+            double aj_aij = aj / aij;
+            double theta_ij = ai * aj_aij;  // ai*aj/(ai+aj)
+            double Kab = theta_ij * rr_ij;
+            double fac1 = fac * exp(-Kab);  // prefactor including the exp(-Kab) decay with i-j distance
+            double xij = xjxi * aj_aij + xi;  // exponent-weighted center of i and j: (ai*xi + aj*xj)/aij
+            double yij = yjyi * aj_aij + yi;
+            double zij = zjzi * aj_aij + zi;
+            double xpq = xij - xk;  // offset of the ij center from center k
+            double ypq = yij - yk;
+            double zpq = zij - zk;
+            double rr = xpq * xpq + ypq * ypq + zpq * zpq;
+            double theta = aij * ak / (aij + ak);
+            double theta_rr = theta * rr;  // argument handed to rys_roots
+            if (omega == 0) {  // roots and weights are used exactly as rys_roots leaves them
+                rys_roots(2, theta_rr, rw, nst_per_block, 0, 1);
+            } else if (omega > 0) {  // presumably the long-range attenuated operator - confirm
+                double omega2 = omega * omega;
+                double theta_fac = omega2 / (omega2 + theta);
+                rys_roots(2, theta_fac*theta_rr, rw, nst_per_block, 0, 1);
+                double sqrt_theta_fac = sqrt(theta_fac);
+                for (int irys = 0; irys < 2; ++irys) {
+                    rw[ irys*2   *nst_per_block] *= theta_fac;  // even slot, read back as rt below
+                    rw[(irys*2+1)*nst_per_block] *= sqrt_theta_fac;  // odd slot, read back as wt below
+                }
+            } else {  // omega < 0: one unscaled set plus one scaled set with negated weights
+                double omega2 = omega * omega;
+                double theta_fac = omega2 / (omega2 + theta);
+                double *rw1 = rw + 4*nst_per_block;  // slots 4..7 of this thread's column, after the scaled set in 0..3
+                rys_roots(2, theta_rr, rw1, nst_per_block, 0, 1);
+                rys_roots(2, theta_fac*theta_rr, rw, nst_per_block, 0, 1);
+                double sqrt_theta_fac = -sqrt(theta_fac);  // every gout term is linear in wt, so this subtracts the scaled set
+                for (int irys = 0; irys < 2; ++irys) {
+                    rw[ irys*2   *nst_per_block] *= theta_fac;
+                    rw[(irys*2+1)*nst_per_block] *= sqrt_theta_fac;
+                }
+            }
+            for (int irys = 0; irys < bounds.nroots; ++irys) {  // NOTE(review): the rw1 set is only read if bounds.nroots is 4 when omega < 0 - confirm on the host side
+                double wt = rw[(2*irys+1)*nst_per_block];
+                double rt = rw[ 2*irys   *nst_per_block];
+                double rt_aa = rt / (aij + ak);
+                double rt_aij = rt_aa * ak;
+                double b10 = .5/aij * (1 - rt_aij);
+                double c0x = xjxi * aj_aij - xpq*rt_aij;
+                double trr_10x = c0x * 1;  // order-0 factors are 1 for x, fac1 for y and wt for z
+                double trr_20x = c0x * trr_10x + 1*b10 * 1;  // order 2 = c0 * order 1 + b10 * order 0
+                gout0 += trr_20x * fac1 * wt;  // (x,y,z) orders (2,0,0)
+                double c0y = yjyi * aj_aij - ypq*rt_aij;
+                double trr_10y = c0y * fac1;  // fac1 is carried by the y factor
+                gout1 += trr_10x * trr_10y * wt;  // orders (1,1,0)
+                double c0z = zjzi * aj_aij - zpq*rt_aij;
+                double trr_10z = c0z * wt;  // wt is carried by the z factor
+                gout2 += trr_10x * fac1 * trr_10z;  // orders (1,0,1)
+                double trr_20y = c0y * trr_10y + 1*b10 * fac1;
+                gout3 += 1 * trr_20y * wt;  // orders (0,2,0)
+                gout4 += 1 * trr_10y * trr_10z;  // orders (0,1,1)
+                double trr_20z = c0z * trr_10z + 1*b10 * wt;
+                gout5 += 1 * fac1 * trr_20z;  // orders (0,0,2)
+            }
+        }
+        int naux = bounds.naux;
+        double *eri_tensor = out + shl_pair_idx * 6 * naux + ksh_in_auxmol * 1;  // 6 entries spaced naux apart per shell pair; NOTE(review): offset is computed in int - confirm it cannot overflow
+        eri_tensor[0*naux + 0] = gout0;  // results are stored with '=', overwriting out
+        eri_tensor[1*naux + 0] = gout1;
+        eri_tensor[2*naux + 0] = gout2;
+        eri_tensor[3*naux + 0] = gout3;
+        eri_tensor[4*naux + 0] = gout4;
+        eri_tensor[5*naux + 0] = gout5;
+    }
+}
+
+#if CUDA_VERSION >= 12040  // the __maxnreg__ qualifier is only used when building with CUDA >= 12.4
+__global__ __maxnreg__(128)
+#else
+__global__
+#endif
+void int3c2e_210(double *out, Int3c2eEnvVars envs, Int3c2eBounds bounds)  // writes 18 components per (shell pair, k shell) into out; presumably the li=2, lj=1, lk=0 case - confirm
+{
+    int nst_per_block = blockDim.x;  // threads per block; also the stride between one thread's rw entries
+    int st_id = threadIdx.x;
+    int batch_id = blockIdx.x;
+    int iprim = bounds.iprim;
+    int jprim = bounds.jprim;
+    int kprim = bounds.kprim;
+    int ijprim = iprim * jprim;
+    int ijkprim = ijprim * kprim;  // number of (ip, jp, kp) primitive combinations
+    int *bas = envs.bas;
+    int nbas = envs.nbas;
+    double *env = envs.env;
+    double omega = env[PTR_RANGE_OMEGA];  // its sign selects one of the three root/weight branches below
+    extern __shared__ double rw_cache[];  // dynamically sized shared memory for roots and weights
+    double *rw = rw_cache + st_id;  // this thread's column: entry n is rw[n*nst_per_block]
+
+    int nksh = bounds.nksh;
+    int nshl_pair = bounds.nshl_pair;
+    int nst = nksh * nshl_pair;  // number of (shell pair, k shell) tasks
+    int st0 = batch_id * nst_per_block * BATCHES_PER_BLOCK;  // first task index owned by this block
+    int st1 = MIN(nst, st0 + nst_per_block * BATCHES_PER_BLOCK);  // end of this block's task range, limited to nst
+    for (int ijk_idx = st0 + st_id; ijk_idx < st1; ijk_idx += nst_per_block) {  // each thread steps through the block's tasks
+        int ksh_in_auxmol = ijk_idx % nksh;  // k-shell index varies fastest in the task index
+        int ksh = ksh_in_auxmol + bounds.ksh0;  // shifted by ksh0 to index bas
+        int shl_pair_idx = ijk_idx / nksh;
+        int bas_ij = bounds.bas_ij_idx[shl_pair_idx];  // packed pair, decoded below as ish*nbas + jsh
+        int ish = bas_ij / nbas;
+        int jsh = bas_ij % nbas;
+        double *expi = env + bas[ish*BAS_SLOTS+PTR_EXP];  // bas holds offsets into env
+        double *expj = env + bas[jsh*BAS_SLOTS+PTR_EXP];
+        double *expk = env + bas[ksh*BAS_SLOTS+PTR_EXP];
+        double *ci = env + bas[ish*BAS_SLOTS+PTR_COEFF];
+        double *cj = env + bas[jsh*BAS_SLOTS+PTR_COEFF];
+        double *ck = env + bas[ksh*BAS_SLOTS+PTR_COEFF];
+        double *ri = env + bas[ish*BAS_SLOTS+PTR_BAS_COORD];
+        double *rj = env + bas[jsh*BAS_SLOTS+PTR_BAS_COORD];
+        double *rk = env + bas[ksh*BAS_SLOTS+PTR_BAS_COORD];
+        double xi = ri[0];
+        double yi = ri[1];
+        double zi = ri[2];
+        double xk = rk[0];
+        double yk = rk[1];
+        double zk = rk[2];
+        double xjxi = rj[0] - xi;
+        double yjyi = rj[1] - yi;
+        double zjzi = rj[2] - zi;
+        double rr_ij = xjxi*xjxi + yjyi*yjyi + zjzi*zjzi;  // squared distance between centers i and j
+        double gout0 = 0;  // gout0..gout17 accumulate the 18 outputs over primitives and roots
+        double gout1 = 0;
+        double gout2 = 0;
+        double gout3 = 0;
+        double gout4 = 0;
+        double gout5 = 0;
+        double gout6 = 0;
+        double gout7 = 0;
+        double gout8 = 0;
+        double gout9 = 0;
+        double gout10 = 0;
+        double gout11 = 0;
+        double gout12 = 0;
+        double gout13 = 0;
+        double gout14 = 0;
+        double gout15 = 0;
+        double gout16 = 0;
+        double gout17 = 0;
+        for (int ijkp = 0; ijkp < ijkprim; ++ijkp) {  // flattened primitive loop: kp fastest, then jp, then ip
+            int ijp = ijkp / kprim;
+            int kp = ijkp % kprim;
+            int ip = ijp / jprim;
+            int jp = ijp % jprim;
+            double ai = expi[ip];
+            double aj = expj[jp];
+            double ak = expk[kp];
+            double aij = ai + aj;
+            double cijk = ci[ip] * cj[jp] * ck[kp];
+            double fac = PI_FAC * cijk / (aij*ak*sqrt(aij+ak));
+            double aj_aij = aj / aij;
+            double theta_ij = ai * aj_aij;  // ai*aj/(ai+aj)
+            double Kab = theta_ij * rr_ij;
+            double fac1 = fac * exp(-Kab);  // prefactor including the exp(-Kab) decay with i-j distance
+            double xij = xjxi * aj_aij + xi;  // exponent-weighted center of i and j: (ai*xi + aj*xj)/aij
+            double yij = yjyi * aj_aij + yi;
+            double zij = zjzi * aj_aij + zi;
+            double xpq = xij - xk;  // offset of the ij center from center k
+            double ypq = yij - yk;
+            double zpq = zij - zk;
+            double rr = xpq * xpq + ypq * ypq + zpq * zpq;
+            double theta = aij * ak / (aij + ak);
+            double theta_rr = theta * rr;  // argument handed to rys_roots
+            if (omega == 0) {  // roots and weights are used exactly as rys_roots leaves them
+                rys_roots(2, theta_rr, rw, nst_per_block, 0, 1);
+            } else if (omega > 0) {  // presumably the long-range attenuated operator - confirm
+                double omega2 = omega * omega;
+                double theta_fac = omega2 / (omega2 + theta);
+                rys_roots(2, theta_fac*theta_rr, rw, nst_per_block, 0, 1);
+                double sqrt_theta_fac = sqrt(theta_fac);
+                for (int irys = 0; irys < 2; ++irys) {
+                    rw[ irys*2   *nst_per_block] *= theta_fac;  // even slot, read back as rt below
+                    rw[(irys*2+1)*nst_per_block] *= sqrt_theta_fac;  // odd slot, read back as wt below
+                }
+            } else {  // omega < 0: one unscaled set plus one scaled set with negated weights
+                double omega2 = omega * omega;
+                double theta_fac = omega2 / (omega2 + theta);
+                double *rw1 = rw + 4*nst_per_block;  // slots 4..7 of this thread's column, after the scaled set in 0..3
+                rys_roots(2, theta_rr, rw1, nst_per_block, 0, 1);
+                rys_roots(2, theta_fac*theta_rr, rw, nst_per_block, 0, 1);
+                double sqrt_theta_fac = -sqrt(theta_fac);  // every gout term is linear in wt, so this subtracts the scaled set
+                for (int irys = 0; irys < 2; ++irys) {
+                    rw[ irys*2   *nst_per_block] *= theta_fac;
+                    rw[(irys*2+1)*nst_per_block] *= sqrt_theta_fac;
+                }
+            }
+            for (int irys = 0; irys < bounds.nroots; ++irys) {  // NOTE(review): the rw1 set is only read if bounds.nroots is 4 when omega < 0 - confirm on the host side
+                double wt = rw[(2*irys+1)*nst_per_block];
+                double rt = rw[ 2*irys   *nst_per_block];
+                double rt_aa = rt / (aij + ak);
+                double rt_aij = rt_aa * ak;
+                double b10 = .5/aij * (1 - rt_aij);
+                double c0x = xjxi * aj_aij - xpq*rt_aij;
+                double trr_10x = c0x * 1;  // order-0 factors are 1 for x, fac1 for y and wt for z
+                double trr_20x = c0x * trr_10x + 1*b10 * 1;  // order n = c0 * order (n-1) + (n-1)*b10 * order (n-2)
+                double trr_30x = c0x * trr_20x + 2*b10 * trr_10x;
+                double hrr_210x = trr_30x - xjxi * trr_20x;  // hrr_n10 = trr_(n+1)0 - xjxi * trr_n0
+                gout0 += hrr_210x * fac1 * wt;
+                double hrr_110x = trr_20x - xjxi * trr_10x;
+                double c0y = yjyi * aj_aij - ypq*rt_aij;
+                double trr_10y = c0y * fac1;  // fac1 is carried by the y factor
+                gout1 += hrr_110x * trr_10y * wt;
+                double c0z = zjzi * aj_aij - zpq*rt_aij;
+                double trr_10z = c0z * wt;  // wt is carried by the z factor
+                gout2 += hrr_110x * fac1 * trr_10z;
+                double hrr_010x = trr_10x - xjxi * 1;
+                double trr_20y = c0y * trr_10y + 1*b10 * fac1;
+                gout3 += hrr_010x * trr_20y * wt;
+                gout4 += hrr_010x * trr_10y * trr_10z;
+                double trr_20z = c0z * trr_10z + 1*b10 * wt;
+                gout5 += hrr_010x * fac1 * trr_20z;
+                double hrr_010y = trr_10y - yjyi * fac1;  // gout6..gout11 take the hrr factor in y
+                gout6 += trr_20x * hrr_010y * wt;
+                double hrr_110y = trr_20y - yjyi * trr_10y;
+                gout7 += trr_10x * hrr_110y * wt;
+                gout8 += trr_10x * hrr_010y * trr_10z;
+                double trr_30y = c0y * trr_20y + 2*b10 * trr_10y;
+                double hrr_210y = trr_30y - yjyi * trr_20y;
+                gout9 += 1 * hrr_210y * wt;
+                gout10 += 1 * hrr_110y * trr_10z;
+                gout11 += 1 * hrr_010y * trr_20z;
+                double hrr_010z = trr_10z - zjzi * wt;  // gout12..gout17 take the hrr factor in z
+                gout12 += trr_20x * fac1 * hrr_010z;
+                gout13 += trr_10x * trr_10y * hrr_010z;
+                double hrr_110z = trr_20z - zjzi * trr_10z;
+                gout14 += trr_10x * fac1 * hrr_110z;
+                gout15 += 1 * trr_20y * hrr_010z;
+                gout16 += 1 * trr_10y * hrr_110z;
+                double trr_30z = c0z * trr_20z + 2*b10 * trr_10z;
+                double hrr_210z = trr_30z - zjzi * trr_20z;
+                gout17 += 1 * fac1 * hrr_210z;
+            }
+        }
+        int naux = bounds.naux;
+        double *eri_tensor = out + shl_pair_idx * 18 * naux + ksh_in_auxmol * 1;  // 18 entries spaced naux apart per shell pair; NOTE(review): offset is computed in int - confirm it cannot overflow
+        eri_tensor[0*naux + 0] = gout0;  // results are stored with '=', overwriting out
+        eri_tensor[1*naux + 0] = gout1;
+        eri_tensor[2*naux + 0] = gout2;
+        eri_tensor[3*naux + 0] = gout3;
+        eri_tensor[4*naux + 0] = gout4;
+        eri_tensor[5*naux + 0] = gout5;
+        eri_tensor[6*naux + 0] = gout6;
+        eri_tensor[7*naux + 0] = gout7;
+        eri_tensor[8*naux + 0] = gout8;
+        eri_tensor[9*naux + 0] = gout9;
+        eri_tensor[10*naux + 0] = gout10;
+        eri_tensor[11*naux + 0] = gout11;
+        eri_tensor[12*naux + 0] = gout12;
+        eri_tensor[13*naux + 0] = gout13;
+        eri_tensor[14*naux + 0] = gout14;
+        eri_tensor[15*naux + 0] = gout15;
+        eri_tensor[16*naux + 0] = gout16;
+        eri_tensor[17*naux + 0] = gout17;
+    }
+}
+
+__global__  // no __maxnreg__ register cap is requested for this kernel
+void int3c2e_220(double *out, Int3c2eEnvVars envs, Int3c2eBounds bounds)  // writes 36 components per (shell pair, k shell) into out; presumably the li=2, lj=2, lk=0 case - confirm
+{
+    int nst_per_block = blockDim.x;  // threads per block; also the stride between one thread's rw entries
+    int st_id = threadIdx.x;
+    int batch_id = blockIdx.x;
+    int iprim = bounds.iprim;
+    int jprim = bounds.jprim;
+    int kprim = bounds.kprim;
+    int ijprim = iprim * jprim;
+    int ijkprim = ijprim * kprim;  // number of (ip, jp, kp) primitive combinations
+    int *bas = envs.bas;
+    int nbas = envs.nbas;
+    double *env = envs.env;
+    double omega = env[PTR_RANGE_OMEGA];  // its sign selects one of the three root/weight branches below
+    extern __shared__ double rw_cache[];  // dynamically sized shared memory for roots and weights
+    double *rw = rw_cache + st_id;  // this thread's column: entry n is rw[n*nst_per_block]
+
+    int nksh = bounds.nksh;
+    int nshl_pair = bounds.nshl_pair;
+    int nst = nksh * nshl_pair;  // number of (shell pair, k shell) tasks
+    int st0 = batch_id * nst_per_block * BATCHES_PER_BLOCK;  // first task index owned by this block
+    int st1 = MIN(nst, st0 + nst_per_block * BATCHES_PER_BLOCK);  // end of this block's task range, limited to nst
+    for (int ijk_idx = st0 + st_id; ijk_idx < st1; ijk_idx += nst_per_block) {  // each thread steps through the block's tasks
+        int ksh_in_auxmol = ijk_idx % nksh;  // k-shell index varies fastest in the task index
+        int ksh = ksh_in_auxmol + bounds.ksh0;  // shifted by ksh0 to index bas
+        int shl_pair_idx = ijk_idx / nksh;
+        int bas_ij = bounds.bas_ij_idx[shl_pair_idx];  // packed pair, decoded below as ish*nbas + jsh
+        int ish = bas_ij / nbas;
+        int jsh = bas_ij % nbas;
+        double *expi = env + bas[ish*BAS_SLOTS+PTR_EXP];  // bas holds offsets into env
+        double *expj = env + bas[jsh*BAS_SLOTS+PTR_EXP];
+        double *expk = env + bas[ksh*BAS_SLOTS+PTR_EXP];
+        double *ci = env + bas[ish*BAS_SLOTS+PTR_COEFF];
+        double *cj = env + bas[jsh*BAS_SLOTS+PTR_COEFF];
+        double *ck = env + bas[ksh*BAS_SLOTS+PTR_COEFF];
+        double *ri = env + bas[ish*BAS_SLOTS+PTR_BAS_COORD];
+        double *rj = env + bas[jsh*BAS_SLOTS+PTR_BAS_COORD];
+        double *rk = env + bas[ksh*BAS_SLOTS+PTR_BAS_COORD];
+        double xi = ri[0];
+        double yi = ri[1];
+        double zi = ri[2];
+        double xk = rk[0];
+        double yk = rk[1];
+        double zk = rk[2];
+        double xjxi = rj[0] - xi;
+        double yjyi = rj[1] - yi;
+        double zjzi = rj[2] - zi;
+        double rr_ij = xjxi*xjxi + yjyi*yjyi + zjzi*zjzi;  // squared distance between centers i and j
+        double gout0 = 0;  // gout0..gout35 accumulate the 36 outputs over primitives and roots
+        double gout1 = 0;
+        double gout2 = 0;
+        double gout3 = 0;
+        double gout4 = 0;
+        double gout5 = 0;
+        double gout6 = 0;
+        double gout7 = 0;
+        double gout8 = 0;
+        double gout9 = 0;
+        double gout10 = 0;
+        double gout11 = 0;
+        double gout12 = 0;
+        double gout13 = 0;
+        double gout14 = 0;
+        double gout15 = 0;
+        double gout16 = 0;
+        double gout17 = 0;
+        double gout18 = 0;
+        double gout19 = 0;
+        double gout20 = 0;
+        double gout21 = 0;
+        double gout22 = 0;
+        double gout23 = 0;
+        double gout24 = 0;
+        double gout25 = 0;
+        double gout26 = 0;
+        double gout27 = 0;
+        double gout28 = 0;
+        double gout29 = 0;
+        double gout30 = 0;
+        double gout31 = 0;
+        double gout32 = 0;
+        double gout33 = 0;
+        double gout34 = 0;
+        double gout35 = 0;
+        for (int ijkp = 0; ijkp < ijkprim; ++ijkp) {  // flattened primitive loop: kp fastest, then jp, then ip
+            int ijp = ijkp / kprim;
+            int kp = ijkp % kprim;
+            int ip = ijp / jprim;
+            int jp = ijp % jprim;
+            double ai = expi[ip];
+            double aj = expj[jp];
+            double ak = expk[kp];
+            double aij = ai + aj;
+            double cijk = ci[ip] * cj[jp] * ck[kp];
+            double fac = PI_FAC * cijk / (aij*ak*sqrt(aij+ak));
+            double aj_aij = aj / aij;
+            double theta_ij = ai * aj_aij;  // ai*aj/(ai+aj)
+            double Kab = theta_ij * rr_ij;
+            double fac1 = fac * exp(-Kab);  // prefactor including the exp(-Kab) decay with i-j distance
+            double xij = xjxi * aj_aij + xi;  // exponent-weighted center of i and j: (ai*xi + aj*xj)/aij
+            double yij = yjyi * aj_aij + yi;
+            double zij = zjzi * aj_aij + zi;
+            double xpq = xij - xk;  // offset of the ij center from center k
+            double ypq = yij - yk;
+            double zpq = zij - zk;
+            double rr = xpq * xpq + ypq * ypq + zpq * zpq;
+            double theta = aij * ak / (aij + ak);
+            double theta_rr = theta * rr;  // argument handed to rys_roots
+            if (omega == 0) {  // roots and weights are used exactly as rys_roots leaves them
+                rys_roots(3, theta_rr, rw, nst_per_block, 0, 1);
+            } else if (omega > 0) {  // presumably the long-range attenuated operator - confirm
+                double omega2 = omega * omega;
+                double theta_fac = omega2 / (omega2 + theta);
+                rys_roots(3, theta_fac*theta_rr, rw, nst_per_block, 0, 1);
+                double sqrt_theta_fac = sqrt(theta_fac);
+                for (int irys = 0; irys < 3; ++irys) {
+                    rw[ irys*2   *nst_per_block] *= theta_fac;  // even slot, read back as rt below
+                    rw[(irys*2+1)*nst_per_block] *= sqrt_theta_fac;  // odd slot, read back as wt below
+                }
+            } else {  // omega < 0: one unscaled set plus one scaled set with negated weights
+                double omega2 = omega * omega;
+                double theta_fac = omega2 / (omega2 + theta);
+                double *rw1 = rw + 6*nst_per_block;  // slots 6..11 of this thread's column, after the scaled set in 0..5
+                rys_roots(3, theta_rr, rw1, nst_per_block, 0, 1);
+                rys_roots(3, theta_fac*theta_rr, rw, nst_per_block, 0, 1);
+                double sqrt_theta_fac = -sqrt(theta_fac);  // every gout term is linear in wt, so this subtracts the scaled set
+                for (int irys = 0; irys < 3; ++irys) {
+                    rw[ irys*2   *nst_per_block] *= theta_fac;
+                    rw[(irys*2+1)*nst_per_block] *= sqrt_theta_fac;
+                }
+            }
+            for (int irys = 0; irys < bounds.nroots; ++irys) {  // NOTE(review): the rw1 set is only read if bounds.nroots is 6 when omega < 0 - confirm on the host side
+                double wt = rw[(2*irys+1)*nst_per_block];
+                double rt = rw[ 2*irys   *nst_per_block];
+                double rt_aa = rt / (aij + ak);
+                double rt_aij = rt_aa * ak;
+                double b10 = .5/aij * (1 - rt_aij);
+                double c0x = xjxi * aj_aij - xpq*rt_aij;
+                double trr_10x = c0x * 1;  // order-0 factors are 1 for x, fac1 for y and wt for z
+                double trr_20x = c0x * trr_10x + 1*b10 * 1;  // order n = c0 * order (n-1) + (n-1)*b10 * order (n-2)
+                double trr_30x = c0x * trr_20x + 2*b10 * trr_10x;
+                double trr_40x = c0x * trr_30x + 3*b10 * trr_20x;
+                double hrr_310x = trr_40x - xjxi * trr_30x;  // hrr_n10 = trr_(n+1)0 - xjxi * trr_n0
+                double hrr_210x = trr_30x - xjxi * trr_20x;
+                double hrr_220x = hrr_310x - xjxi * hrr_210x;  // hrr_n20 = hrr_(n+1)10 - xjxi * hrr_n10
+                gout0 += hrr_220x * fac1 * wt;
+                double hrr_110x = trr_20x - xjxi * trr_10x;
+                double hrr_120x = hrr_210x - xjxi * hrr_110x;
+                double c0y = yjyi * aj_aij - ypq*rt_aij;
+                double trr_10y = c0y * fac1;  // fac1 is carried by the y factor
+                gout1 += hrr_120x * trr_10y * wt;
+                double c0z = zjzi * aj_aij - zpq*rt_aij;
+                double trr_10z = c0z * wt;  // wt is carried by the z factor
+                gout2 += hrr_120x * fac1 * trr_10z;
+                double hrr_010x = trr_10x - xjxi * 1;
+                double hrr_020x = hrr_110x - xjxi * hrr_010x;
+                double trr_20y = c0y * trr_10y + 1*b10 * fac1;
+                gout3 += hrr_020x * trr_20y * wt;
+                gout4 += hrr_020x * trr_10y * trr_10z;
+                double trr_20z = c0z * trr_10z + 1*b10 * wt;
+                gout5 += hrr_020x * fac1 * trr_20z;
+                double hrr_010y = trr_10y - yjyi * fac1;  // gout6..gout11: one hrr step in x and one in y
+                gout6 += hrr_210x * hrr_010y * wt;
+                double hrr_110y = trr_20y - yjyi * trr_10y;
+                gout7 += hrr_110x * hrr_110y * wt;
+                gout8 += hrr_110x * hrr_010y * trr_10z;
+                double trr_30y = c0y * trr_20y + 2*b10 * trr_10y;
+                double hrr_210y = trr_30y - yjyi * trr_20y;
+                gout9 += hrr_010x * hrr_210y * wt;
+                gout10 += hrr_010x * hrr_110y * trr_10z;
+                gout11 += hrr_010x * hrr_010y * trr_20z;
+                double hrr_010z = trr_10z - zjzi * wt;  // gout12..gout17: one hrr step in x and one in z
+                gout12 += hrr_210x * fac1 * hrr_010z;
+                gout13 += hrr_110x * trr_10y * hrr_010z;
+                double hrr_110z = trr_20z - zjzi * trr_10z;
+                gout14 += hrr_110x * fac1 * hrr_110z;
+                gout15 += hrr_010x * trr_20y * hrr_010z;
+                gout16 += hrr_010x * trr_10y * hrr_110z;
+                double trr_30z = c0z * trr_20z + 2*b10 * trr_10z;
+                double hrr_210z = trr_30z - zjzi * trr_20z;
+                gout17 += hrr_010x * fac1 * hrr_210z;
+                double hrr_020y = hrr_110y - yjyi * hrr_010y;  // gout18..gout23: two hrr steps in y
+                gout18 += trr_20x * hrr_020y * wt;
+                double hrr_120y = hrr_210y - yjyi * hrr_110y;
+                gout19 += trr_10x * hrr_120y * wt;
+                gout20 += trr_10x * hrr_020y * trr_10z;
+                double trr_40y = c0y * trr_30y + 3*b10 * trr_20y;
+                double hrr_310y = trr_40y - yjyi * trr_30y;
+                double hrr_220y = hrr_310y - yjyi * hrr_210y;
+                gout21 += 1 * hrr_220y * wt;
+                gout22 += 1 * hrr_120y * trr_10z;
+                gout23 += 1 * hrr_020y * trr_20z;
+                gout24 += trr_20x * hrr_010y * hrr_010z;  // gout24..gout29: one hrr step in y and one in z
+                gout25 += trr_10x * hrr_110y * hrr_010z;
+                gout26 += trr_10x * hrr_010y * hrr_110z;
+                gout27 += 1 * hrr_210y * hrr_010z;
+                gout28 += 1 * hrr_110y * hrr_110z;
+                gout29 += 1 * hrr_010y * hrr_210z;
+                double hrr_020z = hrr_110z - zjzi * hrr_010z;  // gout30..gout35: two hrr steps in z
+                gout30 += trr_20x * fac1 * hrr_020z;
+                gout31 += trr_10x * trr_10y * hrr_020z;
+                double hrr_120z = hrr_210z - zjzi * hrr_110z;
+                gout32 += trr_10x * fac1 * hrr_120z;
+                gout33 += 1 * trr_20y * hrr_020z;
+                gout34 += 1 * trr_10y * hrr_120z;
+                double trr_40z = c0z * trr_30z + 3*b10 * trr_20z;
+                double hrr_310z = trr_40z - zjzi * trr_30z;
+                double hrr_220z = hrr_310z - zjzi * hrr_210z;
+                gout35 += 1 * fac1 * hrr_220z;
+            }
+        }
+        int naux = bounds.naux;
+        double *eri_tensor = out + shl_pair_idx * 36 * naux + ksh_in_auxmol * 1;  // 36 entries spaced naux apart per shell pair; NOTE(review): offset is computed in int - confirm it cannot overflow
+        eri_tensor[0*naux + 0] = gout0;  // results are stored with '=', overwriting out
+        eri_tensor[1*naux + 0] = gout1;
+        eri_tensor[2*naux + 0] = gout2;
+        eri_tensor[3*naux + 0] = gout3;
+        eri_tensor[4*naux + 0] = gout4;
+        eri_tensor[5*naux + 0] = gout5;
+        eri_tensor[6*naux + 0] = gout6;
+        eri_tensor[7*naux + 0] = gout7;
+        eri_tensor[8*naux + 0] = gout8;
+        eri_tensor[9*naux + 0] = gout9;
+        eri_tensor[10*naux + 0] = gout10;
+        eri_tensor[11*naux + 0] = gout11;
+        eri_tensor[12*naux + 0] = gout12;
+        eri_tensor[13*naux + 0] = gout13;
+        eri_tensor[14*naux + 0] = gout14;
+        eri_tensor[15*naux + 0] = gout15;
+        eri_tensor[16*naux + 0] = gout16;
+        eri_tensor[17*naux + 0] = gout17;
+        eri_tensor[18*naux + 0] = gout18;
+        eri_tensor[19*naux + 0] = gout19;
+        eri_tensor[20*naux + 0] = gout20;
+        eri_tensor[21*naux + 0] = gout21;
+        eri_tensor[22*naux + 0] = gout22;
+        eri_tensor[23*naux + 0] = gout23;
+        eri_tensor[24*naux + 0] = gout24;
+        eri_tensor[25*naux + 0] = gout25;
+        eri_tensor[26*naux + 0] = gout26;
+        eri_tensor[27*naux + 0] = gout27;
+        eri_tensor[28*naux + 0] = gout28;
+        eri_tensor[29*naux + 0] = gout29;
+        eri_tensor[30*naux + 0] = gout30;
+        eri_tensor[31*naux + 0] = gout31;
+        eri_tensor[32*naux + 0] = gout32;
+        eri_tensor[33*naux + 0] = gout33;
+        eri_tensor[34*naux + 0] = gout34;
+        eri_tensor[35*naux + 0] = gout35;
+    }
+}
+
+#if CUDA_VERSION >= 12040  // the __maxnreg__ qualifier is only used when building with CUDA >= 12.4
+__global__ __maxnreg__(128)
+#else
+__global__
+#endif
+void int3c2e_001(double *out, Int3c2eEnvVars envs, Int3c2eBounds bounds)  // writes 3 components per (shell pair, k shell) into out; presumably the li=0, lj=0, lk=1 case - confirm
+{
+    int nst_per_block = blockDim.x;  // threads per block; also the stride between one thread's rw entries
+    int st_id = threadIdx.x;
+    int batch_id = blockIdx.x;
+    int iprim = bounds.iprim;
+    int jprim = bounds.jprim;
+    int kprim = bounds.kprim;
+    int ijprim = iprim * jprim;
+    int ijkprim = ijprim * kprim;  // number of (ip, jp, kp) primitive combinations
+    int *bas = envs.bas;
+    int nbas = envs.nbas;
+    double *env = envs.env;
+    double omega = env[PTR_RANGE_OMEGA];  // its sign selects one of the three root/weight branches below
+    extern __shared__ double rw_cache[];  // dynamically sized shared memory for roots and weights
+    double *rw = rw_cache + st_id;  // this thread's column: entry n is rw[n*nst_per_block]
+
+    int nksh = bounds.nksh;
+    int nshl_pair = bounds.nshl_pair;
+    int nst = nksh * nshl_pair;  // number of (shell pair, k shell) tasks
+    int st0 = batch_id * nst_per_block * BATCHES_PER_BLOCK;  // first task index owned by this block
+    int st1 = MIN(nst, st0 + nst_per_block * BATCHES_PER_BLOCK);  // end of this block's task range, limited to nst
+    for (int ijk_idx = st0 + st_id; ijk_idx < st1; ijk_idx += nst_per_block) {  // each thread steps through the block's tasks
+        int ksh_in_auxmol = ijk_idx % nksh;  // k-shell index varies fastest in the task index
+        int ksh = ksh_in_auxmol + bounds.ksh0;  // shifted by ksh0 to index bas
+        int shl_pair_idx = ijk_idx / nksh;
+        int bas_ij = bounds.bas_ij_idx[shl_pair_idx];  // packed pair, decoded below as ish*nbas + jsh
+        int ish = bas_ij / nbas;
+        int jsh = bas_ij % nbas;
+        double *expi = env + bas[ish*BAS_SLOTS+PTR_EXP];  // bas holds offsets into env
+        double *expj = env + bas[jsh*BAS_SLOTS+PTR_EXP];
+        double *expk = env + bas[ksh*BAS_SLOTS+PTR_EXP];
+        double *ci = env + bas[ish*BAS_SLOTS+PTR_COEFF];
+        double *cj = env + bas[jsh*BAS_SLOTS+PTR_COEFF];
+        double *ck = env + bas[ksh*BAS_SLOTS+PTR_COEFF];
+        double *ri = env + bas[ish*BAS_SLOTS+PTR_BAS_COORD];
+        double *rj = env + bas[jsh*BAS_SLOTS+PTR_BAS_COORD];
+        double *rk = env + bas[ksh*BAS_SLOTS+PTR_BAS_COORD];
+        double xi = ri[0];
+        double yi = ri[1];
+        double zi = ri[2];
+        double xk = rk[0];
+        double yk = rk[1];
+        double zk = rk[2];
+        double xjxi = rj[0] - xi;
+        double yjyi = rj[1] - yi;
+        double zjzi = rj[2] - zi;
+        double rr_ij = xjxi*xjxi + yjyi*yjyi + zjzi*zjzi;  // squared distance between centers i and j
+        double gout0 = 0;  // gout0..gout2 accumulate the three outputs over primitives and roots
+        double gout1 = 0;
+        double gout2 = 0;
+        for (int ijkp = 0; ijkp < ijkprim; ++ijkp) {  // flattened primitive loop: kp fastest, then jp, then ip
+            int ijp = ijkp / kprim;
+            int kp = ijkp % kprim;
+            int ip = ijp / jprim;
+            int jp = ijp % jprim;
+            double ai = expi[ip];
+            double aj = expj[jp];
+            double ak = expk[kp];
+            double aij = ai + aj;
+            double cijk = ci[ip] * cj[jp] * ck[kp];
+            double fac = PI_FAC * cijk / (aij*ak*sqrt(aij+ak));
+            double aj_aij = aj / aij;
+            double theta_ij = ai * aj_aij;  // ai*aj/(ai+aj)
+            double Kab = theta_ij * rr_ij;
+            double fac1 = fac * exp(-Kab);  // prefactor including the exp(-Kab) decay with i-j distance
+            double xij = xjxi * aj_aij + xi;  // exponent-weighted center of i and j: (ai*xi + aj*xj)/aij
+            double yij = yjyi * aj_aij + yi;
+            double zij = zjzi * aj_aij + zi;
+            double xpq = xij - xk;  // offset of the ij center from center k
+            double ypq = yij - yk;
+            double zpq = zij - zk;
+            double rr = xpq * xpq + ypq * ypq + zpq * zpq;
+            double theta = aij * ak / (aij + ak);
+            double theta_rr = theta * rr;  // argument handed to rys_roots
+            if (omega == 0) {  // roots and weights are used exactly as rys_roots leaves them
+                rys_roots(1, theta_rr, rw, nst_per_block, 0, 1);
+            } else if (omega > 0) {  // presumably the long-range attenuated operator - confirm
+                double omega2 = omega * omega;
+                double theta_fac = omega2 / (omega2 + theta);
+                rys_roots(1, theta_fac*theta_rr, rw, nst_per_block, 0, 1);
+                double sqrt_theta_fac = sqrt(theta_fac);
+                for (int irys = 0; irys < 1; ++irys) {
+                    rw[ irys*2   *nst_per_block] *= theta_fac;  // even slot, read back as rt below
+                    rw[(irys*2+1)*nst_per_block] *= sqrt_theta_fac;  // odd slot, read back as wt below
+                }
+            } else {  // omega < 0: one unscaled set plus one scaled set with negated weights
+                double omega2 = omega * omega;
+                double theta_fac = omega2 / (omega2 + theta);
+                double *rw1 = rw + 2*nst_per_block;  // slots 2..3 of this thread's column, after the scaled set in 0..1
+                rys_roots(1, theta_rr, rw1, nst_per_block, 0, 1);
+                rys_roots(1, theta_fac*theta_rr, rw, nst_per_block, 0, 1);
+                double sqrt_theta_fac = -sqrt(theta_fac);  // every gout term is linear in wt, so this subtracts the scaled set
+                for (int irys = 0; irys < 1; ++irys) {
+                    rw[ irys*2   *nst_per_block] *= theta_fac;
+                    rw[(irys*2+1)*nst_per_block] *= sqrt_theta_fac;
+                }
+            }
+            for (int irys = 0; irys < bounds.nroots; ++irys) {  // NOTE(review): the rw1 set is only read if bounds.nroots is 2 when omega < 0 - confirm on the host side
+                double wt = rw[(2*irys+1)*nst_per_block];
+                double rt = rw[ 2*irys   *nst_per_block];
+                double rt_aa = rt / (aij + ak);
+                double rt_ak = rt_aa * aij;  // rt * aij / (aij + ak)
+                double cpx = xpq*rt_ak;
+                double trr_01x = cpx * 1;  // order-0 factors are 1 for x, fac1 for y and wt for z
+                gout0 += trr_01x * fac1 * wt;  // first order in x
+                double cpy = ypq*rt_ak;
+                double trr_01y = cpy * fac1;  // fac1 is carried by the y factor
+                gout1 += 1 * trr_01y * wt;  // first order in y
+                double cpz = zpq*rt_ak;
+                double trr_01z = cpz * wt;  // wt is carried by the z factor
+                gout2 += 1 * fac1 * trr_01z;  // first order in z
+            }
+        }
+        int naux = bounds.naux;
+        double *eri_tensor = out + shl_pair_idx * 1 * naux + ksh_in_auxmol * 3;  // naux entries per shell pair, 3 adjacent entries per k shell; NOTE(review): offset is computed in int - confirm it cannot overflow
+        eri_tensor[0*naux + 0] = gout0;  // results are stored with '=', overwriting out
+        eri_tensor[0*naux + 1] = gout1;
+        eri_tensor[0*naux + 2] = gout2;
+    }
+}
+
+#if CUDA_VERSION >= 12040  // the __maxnreg__(128) attribute is only applied when CUDA_VERSION >= 12040
+__global__ __maxnreg__(128)
+#else
+__global__
+#endif
+void int3c2e_101(double *out, Int3c2eEnvVars envs, Int3c2eBounds bounds)  // unrolled kernel: for each (shell pair, k shell) task, accumulates 9 values (3 i-side x 3 k-side first-order components) and stores them in out
+{  // out: result buffer (row stride bounds.naux); envs: bas/env tables; bounds: primitive counts, task counts, shell-pair list
+    int nst_per_block = blockDim.x;  // threads per block; also the stride between consecutive rw slots of one thread
+    int st_id = threadIdx.x;
+    int batch_id = blockIdx.x;
+    int iprim = bounds.iprim;
+    int jprim = bounds.jprim;
+    int kprim = bounds.kprim;
+    int ijprim = iprim * jprim;
+    int ijkprim = ijprim * kprim;  // total number of primitive triples looped over per task
+    int *bas = envs.bas;
+    int nbas = envs.nbas;
+    double *env = envs.env;
+    double omega = env[PTR_RANGE_OMEGA];  // sign of omega selects one of the three root/weight branches below
+    extern __shared__ double rw_cache[];  // dynamically sized shared memory holding roots and weights
+    double *rw = rw_cache + st_id;  // this thread's slots are rw[n*nst_per_block], interleaved with the other threads
+
+    int nksh = bounds.nksh;
+    int nshl_pair = bounds.nshl_pair;
+    int nst = nksh * nshl_pair;  // total number of (shell pair, k shell) tasks
+    int st0 = batch_id * nst_per_block * BATCHES_PER_BLOCK;  // first task handled by this block
+    int st1 = MIN(nst, st0 + nst_per_block * BATCHES_PER_BLOCK);  // one past the last task of this block, clamped to nst
+    for (int ijk_idx = st0 + st_id; ijk_idx < st1; ijk_idx += nst_per_block) {  // each thread takes every nst_per_block-th task of the block's range
+        int ksh_in_auxmol = ijk_idx % nksh;  // k-shell index is the fastest-varying part of the task id
+        int ksh = ksh_in_auxmol + bounds.ksh0;  // shifted by ksh0 to index into bas
+        int shl_pair_idx = ijk_idx / nksh;
+        int bas_ij = bounds.bas_ij_idx[shl_pair_idx];  // packed pair id, decoded below as ish*nbas + jsh
+        int ish = bas_ij / nbas;
+        int jsh = bas_ij % nbas;
+        double *expi = env + bas[ish*BAS_SLOTS+PTR_EXP];  // bas slots hold offsets into env for exponents, coefficients and coordinates
+        double *expj = env + bas[jsh*BAS_SLOTS+PTR_EXP];
+        double *expk = env + bas[ksh*BAS_SLOTS+PTR_EXP];
+        double *ci = env + bas[ish*BAS_SLOTS+PTR_COEFF];
+        double *cj = env + bas[jsh*BAS_SLOTS+PTR_COEFF];
+        double *ck = env + bas[ksh*BAS_SLOTS+PTR_COEFF];
+        double *ri = env + bas[ish*BAS_SLOTS+PTR_BAS_COORD];
+        double *rj = env + bas[jsh*BAS_SLOTS+PTR_BAS_COORD];
+        double *rk = env + bas[ksh*BAS_SLOTS+PTR_BAS_COORD];
+        double xi = ri[0];
+        double yi = ri[1];
+        double zi = ri[2];
+        double xk = rk[0];
+        double yk = rk[1];
+        double zk = rk[2];
+        double xjxi = rj[0] - xi;  // displacement rj - ri
+        double yjyi = rj[1] - yi;
+        double zjzi = rj[2] - zi;
+        double rr_ij = xjxi*xjxi + yjyi*yjyi + zjzi*zjzi;  // squared distance |rj - ri|^2
+        double gout0 = 0;  // gout0..gout8: accumulators summed over all primitives and roots
+        double gout1 = 0;
+        double gout2 = 0;
+        double gout3 = 0;
+        double gout4 = 0;
+        double gout5 = 0;
+        double gout6 = 0;
+        double gout7 = 0;
+        double gout8 = 0;
+        for (int ijkp = 0; ijkp < ijkprim; ++ijkp) {  // flattened loop over primitive triples (ip, jp, kp), kp fastest
+            int ijp = ijkp / kprim;
+            int kp = ijkp % kprim;
+            int ip = ijp / jprim;
+            int jp = ijp % jprim;
+            double ai = expi[ip];
+            double aj = expj[jp];
+            double ak = expk[kp];
+            double aij = ai + aj;
+            double cijk = ci[ip] * cj[jp] * ck[kp];  // product of the three contraction coefficients
+            double fac = PI_FAC * cijk / (aij*ak*sqrt(aij+ak));
+            double aj_aij = aj / aij;
+            double theta_ij = ai * aj_aij;  // ai*aj/(ai+aj)
+            double Kab = theta_ij * rr_ij;
+            double fac1 = fac * exp(-Kab);  // prefactor including the exp(-ai*aj/aij*|rj-ri|^2) factor
+            double xij = xjxi * aj_aij + xi;  // exponent-weighted centre (ai*ri + aj*rj)/aij of the i/j primitives
+            double yij = yjyi * aj_aij + yi;
+            double zij = zjzi * aj_aij + zi;
+            double xpq = xij - xk;  // vector from the k centre to the weighted i/j centre
+            double ypq = yij - yk;
+            double zpq = zij - zk;
+            double rr = xpq * xpq + ypq * ypq + zpq * zpq;
+            double theta = aij * ak / (aij + ak);
+            double theta_rr = theta * rr;  // argument passed to rys_roots
+            if (omega == 0) {  // no range separation: roots/weights used as returned
+                rys_roots(2, theta_rr, rw, nst_per_block, 0, 1);
+            } else if (omega > 0) {  // NOTE(review): presumably the long-range attenuated operator - confirm
+                double omega2 = omega * omega;
+                double theta_fac = omega2 / (omega2 + theta);
+                rys_roots(2, theta_fac*theta_rr, rw, nst_per_block, 0, 1);  // roots evaluated at the scaled argument
+                double sqrt_theta_fac = sqrt(theta_fac);
+                for (int irys = 0; irys < 2; ++irys) {
+                    rw[ irys*2   *nst_per_block] *= theta_fac;  // even slots (read back below as roots) scaled by theta_fac
+                    rw[(irys*2+1)*nst_per_block] *= sqrt_theta_fac;  // odd slots (read back below as weights) scaled by sqrt(theta_fac)
+                }
+            } else {  // omega < 0; NOTE(review): presumably the short-range operator (full minus long-range) - confirm
+                double omega2 = omega * omega;
+                double theta_fac = omega2 / (omega2 + theta);
+                double *rw1 = rw + 4*nst_per_block;  // second set placed after the first 4 slots; NOTE(review): needs 8 slots per thread of shared memory - confirm launch size
+                rys_roots(2, theta_rr, rw1, nst_per_block, 0, 1);  // unscaled set
+                rys_roots(2, theta_fac*theta_rr, rw, nst_per_block, 0, 1);  // scaled set
+                double sqrt_theta_fac = -sqrt(theta_fac);  // negated so the scaled set enters the sum with negative weights
+                for (int irys = 0; irys < 2; ++irys) {
+                    rw[ irys*2   *nst_per_block] *= theta_fac;
+                    rw[(irys*2+1)*nst_per_block] *= sqrt_theta_fac;
+                }
+            }
+            for (int irys = 0; irys < bounds.nroots; ++irys) {  // NOTE(review): bounds.nroots is presumably 2, or 4 when omega < 0 so both sets are summed - confirm against the host code
+                double wt = rw[(2*irys+1)*nst_per_block];  // weight stored in the odd slot
+                double rt = rw[ 2*irys   *nst_per_block];  // root stored in the even slot
+                double rt_aa = rt / (aij + ak);
+                double b00 = .5 * rt_aa;
+                double rt_ak = rt_aa * aij;
+                double cpx = xpq*rt_ak;  // first-order k-side coefficient along x
+                double rt_aij = rt_aa * ak;
+                double c0x = xjxi * aj_aij - xpq*rt_aij;  // first-order i-side coefficient along x
+                double trr_10x = c0x * 1;  // the x factor starts from 1
+                double trr_11x = cpx * trr_10x + 1*b00 * 1;
+                gout0 += trr_11x * fac1 * wt;  // each term is (x factor)*(y factor)*(z factor); fac1 rides on y and wt on z
+                double trr_01x = cpx * 1;
+                double c0y = yjyi * aj_aij - ypq*rt_aij;
+                double trr_10y = c0y * fac1;  // the y factor carries the prefactor fac1
+                gout1 += trr_01x * trr_10y * wt;
+                double c0z = zjzi * aj_aij - zpq*rt_aij;
+                double trr_10z = c0z * wt;  // the z factor carries the quadrature weight
+                gout2 += trr_01x * fac1 * trr_10z;
+                double cpy = ypq*rt_ak;
+                double trr_01y = cpy * fac1;
+                gout3 += trr_10x * trr_01y * wt;
+                double trr_11y = cpy * trr_10y + 1*b00 * fac1;
+                gout4 += 1 * trr_11y * wt;
+                gout5 += 1 * trr_01y * trr_10z;
+                double cpz = zpq*rt_ak;
+                double trr_01z = cpz * wt;
+                gout6 += trr_10x * fac1 * trr_01z;
+                gout7 += 1 * trr_10y * trr_01z;
+                double trr_11z = cpz * trr_10z + 1*b00 * wt;
+                gout8 += 1 * fac1 * trr_11z;
+            }
+        }
+        int naux = bounds.naux;  // row stride of the output
+        double *eri_tensor = out + shl_pair_idx * 3 * naux + ksh_in_auxmol * 3;  // 3 rows per shell pair, 3 columns per k shell; NOTE(review): offset is computed in 32-bit int - confirm it cannot overflow for large buffers
+        eri_tensor[0*naux + 0] = gout0;  // stored transposed: element [row r, column c] receives gout[3*c + r]
+        eri_tensor[0*naux + 1] = gout3;
+        eri_tensor[0*naux + 2] = gout6;
+        eri_tensor[1*naux + 0] = gout1;
+        eri_tensor[1*naux + 1] = gout4;
+        eri_tensor[1*naux + 2] = gout7;
+        eri_tensor[2*naux + 0] = gout2;
+        eri_tensor[2*naux + 1] = gout5;
+        eri_tensor[2*naux + 2] = gout8;
+    }
+}
+
+__global__
+void int3c2e_111(double *out, Int3c2eEnvVars envs, Int3c2eBounds bounds)  // unrolled kernel: for each (shell pair, k shell) task, accumulates 27 values (9 i/j component pairs x 3 k-side components) and stores them in out
+{  // out: result buffer (row stride bounds.naux); envs: bas/env tables; bounds: primitive counts, task counts, shell-pair list
+    int nst_per_block = blockDim.x;  // threads per block; also the stride between consecutive rw slots of one thread
+    int st_id = threadIdx.x;
+    int batch_id = blockIdx.x;
+    int iprim = bounds.iprim;
+    int jprim = bounds.jprim;
+    int kprim = bounds.kprim;
+    int ijprim = iprim * jprim;
+    int ijkprim = ijprim * kprim;  // total number of primitive triples looped over per task
+    int *bas = envs.bas;
+    int nbas = envs.nbas;
+    double *env = envs.env;
+    double omega = env[PTR_RANGE_OMEGA];  // sign of omega selects one of the three root/weight branches below
+    extern __shared__ double rw_cache[];  // dynamically sized shared memory holding roots and weights
+    double *rw = rw_cache + st_id;  // this thread's slots are rw[n*nst_per_block], interleaved with the other threads
+
+    int nksh = bounds.nksh;
+    int nshl_pair = bounds.nshl_pair;
+    int nst = nksh * nshl_pair;  // total number of (shell pair, k shell) tasks
+    int st0 = batch_id * nst_per_block * BATCHES_PER_BLOCK;  // first task handled by this block
+    int st1 = MIN(nst, st0 + nst_per_block * BATCHES_PER_BLOCK);  // one past the last task of this block, clamped to nst
+    for (int ijk_idx = st0 + st_id; ijk_idx < st1; ijk_idx += nst_per_block) {  // each thread takes every nst_per_block-th task of the block's range
+        int ksh_in_auxmol = ijk_idx % nksh;  // k-shell index is the fastest-varying part of the task id
+        int ksh = ksh_in_auxmol + bounds.ksh0;  // shifted by ksh0 to index into bas
+        int shl_pair_idx = ijk_idx / nksh;
+        int bas_ij = bounds.bas_ij_idx[shl_pair_idx];  // packed pair id, decoded below as ish*nbas + jsh
+        int ish = bas_ij / nbas;
+        int jsh = bas_ij % nbas;
+        double *expi = env + bas[ish*BAS_SLOTS+PTR_EXP];  // bas slots hold offsets into env for exponents, coefficients and coordinates
+        double *expj = env + bas[jsh*BAS_SLOTS+PTR_EXP];
+        double *expk = env + bas[ksh*BAS_SLOTS+PTR_EXP];
+        double *ci = env + bas[ish*BAS_SLOTS+PTR_COEFF];
+        double *cj = env + bas[jsh*BAS_SLOTS+PTR_COEFF];
+        double *ck = env + bas[ksh*BAS_SLOTS+PTR_COEFF];
+        double *ri = env + bas[ish*BAS_SLOTS+PTR_BAS_COORD];
+        double *rj = env + bas[jsh*BAS_SLOTS+PTR_BAS_COORD];
+        double *rk = env + bas[ksh*BAS_SLOTS+PTR_BAS_COORD];
+        double xi = ri[0];
+        double yi = ri[1];
+        double zi = ri[2];
+        double xk = rk[0];
+        double yk = rk[1];
+        double zk = rk[2];
+        double xjxi = rj[0] - xi;  // displacement rj - ri; also used in the hrr_* steps below
+        double yjyi = rj[1] - yi;
+        double zjzi = rj[2] - zi;
+        double rr_ij = xjxi*xjxi + yjyi*yjyi + zjzi*zjzi;  // squared distance |rj - ri|^2
+        double gout0 = 0;  // gout0..gout26: accumulators summed over all primitives and roots
+        double gout1 = 0;
+        double gout2 = 0;
+        double gout3 = 0;
+        double gout4 = 0;
+        double gout5 = 0;
+        double gout6 = 0;
+        double gout7 = 0;
+        double gout8 = 0;
+        double gout9 = 0;
+        double gout10 = 0;
+        double gout11 = 0;
+        double gout12 = 0;
+        double gout13 = 0;
+        double gout14 = 0;
+        double gout15 = 0;
+        double gout16 = 0;
+        double gout17 = 0;
+        double gout18 = 0;
+        double gout19 = 0;
+        double gout20 = 0;
+        double gout21 = 0;
+        double gout22 = 0;
+        double gout23 = 0;
+        double gout24 = 0;
+        double gout25 = 0;
+        double gout26 = 0;
+        for (int ijkp = 0; ijkp < ijkprim; ++ijkp) {  // flattened loop over primitive triples (ip, jp, kp), kp fastest
+            int ijp = ijkp / kprim;
+            int kp = ijkp % kprim;
+            int ip = ijp / jprim;
+            int jp = ijp % jprim;
+            double ai = expi[ip];
+            double aj = expj[jp];
+            double ak = expk[kp];
+            double aij = ai + aj;
+            double cijk = ci[ip] * cj[jp] * ck[kp];  // product of the three contraction coefficients
+            double fac = PI_FAC * cijk / (aij*ak*sqrt(aij+ak));
+            double aj_aij = aj / aij;
+            double theta_ij = ai * aj_aij;  // ai*aj/(ai+aj)
+            double Kab = theta_ij * rr_ij;
+            double fac1 = fac * exp(-Kab);  // prefactor including the exp(-ai*aj/aij*|rj-ri|^2) factor
+            double xij = xjxi * aj_aij + xi;  // exponent-weighted centre (ai*ri + aj*rj)/aij of the i/j primitives
+            double yij = yjyi * aj_aij + yi;
+            double zij = zjzi * aj_aij + zi;
+            double xpq = xij - xk;  // vector from the k centre to the weighted i/j centre
+            double ypq = yij - yk;
+            double zpq = zij - zk;
+            double rr = xpq * xpq + ypq * ypq + zpq * zpq;
+            double theta = aij * ak / (aij + ak);
+            double theta_rr = theta * rr;  // argument passed to rys_roots
+            if (omega == 0) {  // no range separation: roots/weights used as returned
+                rys_roots(2, theta_rr, rw, nst_per_block, 0, 1);
+            } else if (omega > 0) {  // NOTE(review): presumably the long-range attenuated operator - confirm
+                double omega2 = omega * omega;
+                double theta_fac = omega2 / (omega2 + theta);
+                rys_roots(2, theta_fac*theta_rr, rw, nst_per_block, 0, 1);  // roots evaluated at the scaled argument
+                double sqrt_theta_fac = sqrt(theta_fac);
+                for (int irys = 0; irys < 2; ++irys) {
+                    rw[ irys*2   *nst_per_block] *= theta_fac;  // even slots (read back below as roots) scaled by theta_fac
+                    rw[(irys*2+1)*nst_per_block] *= sqrt_theta_fac;  // odd slots (read back below as weights) scaled by sqrt(theta_fac)
+                }
+            } else {  // omega < 0; NOTE(review): presumably the short-range operator (full minus long-range) - confirm
+                double omega2 = omega * omega;
+                double theta_fac = omega2 / (omega2 + theta);
+                double *rw1 = rw + 4*nst_per_block;  // second set placed after the first 4 slots; NOTE(review): needs 8 slots per thread of shared memory - confirm launch size
+                rys_roots(2, theta_rr, rw1, nst_per_block, 0, 1);  // unscaled set
+                rys_roots(2, theta_fac*theta_rr, rw, nst_per_block, 0, 1);  // scaled set
+                double sqrt_theta_fac = -sqrt(theta_fac);  // negated so the scaled set enters the sum with negative weights
+                for (int irys = 0; irys < 2; ++irys) {
+                    rw[ irys*2   *nst_per_block] *= theta_fac;
+                    rw[(irys*2+1)*nst_per_block] *= sqrt_theta_fac;
+                }
+            }
+            for (int irys = 0; irys < bounds.nroots; ++irys) {  // NOTE(review): bounds.nroots is presumably 2, or 4 when omega < 0 so both sets are summed - confirm against the host code
+                double wt = rw[(2*irys+1)*nst_per_block];  // weight stored in the odd slot
+                double rt = rw[ 2*irys   *nst_per_block];  // root stored in the even slot
+                double rt_aa = rt / (aij + ak);
+                double b00 = .5 * rt_aa;
+                double rt_ak = rt_aa * aij;
+                double cpx = xpq*rt_ak;  // first-order k-side coefficient along x
+                double rt_aij = rt_aa * ak;
+                double b10 = .5/aij * (1 - rt_aij);
+                double c0x = xjxi * aj_aij - xpq*rt_aij;  // first-order i-side coefficient along x
+                double trr_10x = c0x * 1;  // trr_ab*: a = order built with c0/b10, b = order built with cp/b00; the x factor starts from 1
+                double trr_20x = c0x * trr_10x + 1*b10 * 1;
+                double trr_21x = cpx * trr_20x + 2*b00 * trr_10x;
+                double trr_11x = cpx * trr_10x + 1*b00 * 1;
+                double hrr_111x = trr_21x - xjxi * trr_11x;  // hrr_*: combines two trr terms using the rj - ri displacement (one order moved from the i side to the j side, as the names suggest)
+                gout0 += hrr_111x * fac1 * wt;  // each term is (x factor)*(y factor)*(z factor); fac1 rides on y and wt on z
+                double trr_01x = cpx * 1;
+                double hrr_011x = trr_11x - xjxi * trr_01x;
+                double c0y = yjyi * aj_aij - ypq*rt_aij;
+                double trr_10y = c0y * fac1;  // the y factor carries the prefactor fac1
+                gout1 += hrr_011x * trr_10y * wt;
+                double c0z = zjzi * aj_aij - zpq*rt_aij;
+                double trr_10z = c0z * wt;  // the z factor carries the quadrature weight
+                gout2 += hrr_011x * fac1 * trr_10z;
+                double hrr_010y = trr_10y - yjyi * fac1;
+                gout3 += trr_11x * hrr_010y * wt;
+                double trr_20y = c0y * trr_10y + 1*b10 * fac1;
+                double hrr_110y = trr_20y - yjyi * trr_10y;
+                gout4 += trr_01x * hrr_110y * wt;
+                gout5 += trr_01x * hrr_010y * trr_10z;
+                double hrr_010z = trr_10z - zjzi * wt;
+                gout6 += trr_11x * fac1 * hrr_010z;
+                gout7 += trr_01x * trr_10y * hrr_010z;
+                double trr_20z = c0z * trr_10z + 1*b10 * wt;
+                double hrr_110z = trr_20z - zjzi * trr_10z;
+                gout8 += trr_01x * fac1 * hrr_110z;
+                double hrr_110x = trr_20x - xjxi * trr_10x;
+                double cpy = ypq*rt_ak;
+                double trr_01y = cpy * fac1;
+                gout9 += hrr_110x * trr_01y * wt;
+                double hrr_010x = trr_10x - xjxi * 1;
+                double trr_11y = cpy * trr_10y + 1*b00 * fac1;
+                gout10 += hrr_010x * trr_11y * wt;
+                gout11 += hrr_010x * trr_01y * trr_10z;
+                double hrr_011y = trr_11y - yjyi * trr_01y;
+                gout12 += trr_10x * hrr_011y * wt;
+                double trr_21y = cpy * trr_20y + 2*b00 * trr_10y;
+                double hrr_111y = trr_21y - yjyi * trr_11y;
+                gout13 += 1 * hrr_111y * wt;
+                gout14 += 1 * hrr_011y * trr_10z;
+                gout15 += trr_10x * trr_01y * hrr_010z;
+                gout16 += 1 * trr_11y * hrr_010z;
+                gout17 += 1 * trr_01y * hrr_110z;
+                double cpz = zpq*rt_ak;
+                double trr_01z = cpz * wt;
+                gout18 += hrr_110x * fac1 * trr_01z;
+                gout19 += hrr_010x * trr_10y * trr_01z;
+                double trr_11z = cpz * trr_10z + 1*b00 * wt;
+                gout20 += hrr_010x * fac1 * trr_11z;
+                gout21 += trr_10x * hrr_010y * trr_01z;
+                gout22 += 1 * hrr_110y * trr_01z;
+                gout23 += 1 * hrr_010y * trr_11z;
+                double hrr_011z = trr_11z - zjzi * trr_01z;
+                gout24 += trr_10x * fac1 * hrr_011z;
+                gout25 += 1 * trr_10y * hrr_011z;
+                double trr_21z = cpz * trr_20z + 2*b00 * trr_10z;
+                double hrr_111z = trr_21z - zjzi * trr_11z;
+                gout26 += 1 * fac1 * hrr_111z;
+            }
+        }
+        int naux = bounds.naux;  // row stride of the output
+        double *eri_tensor = out + shl_pair_idx * 9 * naux + ksh_in_auxmol * 3;  // 9 rows per shell pair, 3 columns per k shell; NOTE(review): offset is computed in 32-bit int - confirm it cannot overflow for large buffers
+        eri_tensor[0*naux + 0] = gout0;  // stored transposed: element [row r, column c] receives gout[9*c + r]
+        eri_tensor[0*naux + 1] = gout9;
+        eri_tensor[0*naux + 2] = gout18;
+        eri_tensor[1*naux + 0] = gout1;
+        eri_tensor[1*naux + 1] = gout10;
+        eri_tensor[1*naux + 2] = gout19;
+        eri_tensor[2*naux + 0] = gout2;
+        eri_tensor[2*naux + 1] = gout11;
+        eri_tensor[2*naux + 2] = gout20;
+        eri_tensor[3*naux + 0] = gout3;
+        eri_tensor[3*naux + 1] = gout12;
+        eri_tensor[3*naux + 2] = gout21;
+        eri_tensor[4*naux + 0] = gout4;
+        eri_tensor[4*naux + 1] = gout13;
+        eri_tensor[4*naux + 2] = gout22;
+        eri_tensor[5*naux + 0] = gout5;
+        eri_tensor[5*naux + 1] = gout14;
+        eri_tensor[5*naux + 2] = gout23;
+        eri_tensor[6*naux + 0] = gout6;
+        eri_tensor[6*naux + 1] = gout15;
+        eri_tensor[6*naux + 2] = gout24;
+        eri_tensor[7*naux + 0] = gout7;
+        eri_tensor[7*naux + 1] = gout16;
+        eri_tensor[7*naux + 2] = gout25;
+        eri_tensor[8*naux + 0] = gout8;
+        eri_tensor[8*naux + 1] = gout17;
+        eri_tensor[8*naux + 2] = gout26;
+    }
+}
+
+#if CUDA_VERSION >= 12040  // the __maxnreg__(128) attribute is only applied when CUDA_VERSION >= 12040
+__global__ __maxnreg__(128)
+#else
+__global__
+#endif
+void int3c2e_201(double *out, Int3c2eEnvVars envs, Int3c2eBounds bounds)  // unrolled kernel: for each (shell pair, k shell) task, accumulates 18 values (6 second-order i-side x 3 first-order k-side components) and stores them in out
+{  // out: result buffer (row stride bounds.naux); envs: bas/env tables; bounds: primitive counts, task counts, shell-pair list
+    int nst_per_block = blockDim.x;  // threads per block; also the stride between consecutive rw slots of one thread
+    int st_id = threadIdx.x;
+    int batch_id = blockIdx.x;
+    int iprim = bounds.iprim;
+    int jprim = bounds.jprim;
+    int kprim = bounds.kprim;
+    int ijprim = iprim * jprim;
+    int ijkprim = ijprim * kprim;  // total number of primitive triples looped over per task
+    int *bas = envs.bas;
+    int nbas = envs.nbas;
+    double *env = envs.env;
+    double omega = env[PTR_RANGE_OMEGA];  // sign of omega selects one of the three root/weight branches below
+    extern __shared__ double rw_cache[];  // dynamically sized shared memory holding roots and weights
+    double *rw = rw_cache + st_id;  // this thread's slots are rw[n*nst_per_block], interleaved with the other threads
+
+    int nksh = bounds.nksh;
+    int nshl_pair = bounds.nshl_pair;
+    int nst = nksh * nshl_pair;  // total number of (shell pair, k shell) tasks
+    int st0 = batch_id * nst_per_block * BATCHES_PER_BLOCK;  // first task handled by this block
+    int st1 = MIN(nst, st0 + nst_per_block * BATCHES_PER_BLOCK);  // one past the last task of this block, clamped to nst
+    for (int ijk_idx = st0 + st_id; ijk_idx < st1; ijk_idx += nst_per_block) {  // each thread takes every nst_per_block-th task of the block's range
+        int ksh_in_auxmol = ijk_idx % nksh;  // k-shell index is the fastest-varying part of the task id
+        int ksh = ksh_in_auxmol + bounds.ksh0;  // shifted by ksh0 to index into bas
+        int shl_pair_idx = ijk_idx / nksh;
+        int bas_ij = bounds.bas_ij_idx[shl_pair_idx];  // packed pair id, decoded below as ish*nbas + jsh
+        int ish = bas_ij / nbas;
+        int jsh = bas_ij % nbas;
+        double *expi = env + bas[ish*BAS_SLOTS+PTR_EXP];  // bas slots hold offsets into env for exponents, coefficients and coordinates
+        double *expj = env + bas[jsh*BAS_SLOTS+PTR_EXP];
+        double *expk = env + bas[ksh*BAS_SLOTS+PTR_EXP];
+        double *ci = env + bas[ish*BAS_SLOTS+PTR_COEFF];
+        double *cj = env + bas[jsh*BAS_SLOTS+PTR_COEFF];
+        double *ck = env + bas[ksh*BAS_SLOTS+PTR_COEFF];
+        double *ri = env + bas[ish*BAS_SLOTS+PTR_BAS_COORD];
+        double *rj = env + bas[jsh*BAS_SLOTS+PTR_BAS_COORD];
+        double *rk = env + bas[ksh*BAS_SLOTS+PTR_BAS_COORD];
+        double xi = ri[0];
+        double yi = ri[1];
+        double zi = ri[2];
+        double xk = rk[0];
+        double yk = rk[1];
+        double zk = rk[2];
+        double xjxi = rj[0] - xi;  // displacement rj - ri
+        double yjyi = rj[1] - yi;
+        double zjzi = rj[2] - zi;
+        double rr_ij = xjxi*xjxi + yjyi*yjyi + zjzi*zjzi;  // squared distance |rj - ri|^2
+        double gout0 = 0;  // gout0..gout17: accumulators summed over all primitives and roots
+        double gout1 = 0;
+        double gout2 = 0;
+        double gout3 = 0;
+        double gout4 = 0;
+        double gout5 = 0;
+        double gout6 = 0;
+        double gout7 = 0;
+        double gout8 = 0;
+        double gout9 = 0;
+        double gout10 = 0;
+        double gout11 = 0;
+        double gout12 = 0;
+        double gout13 = 0;
+        double gout14 = 0;
+        double gout15 = 0;
+        double gout16 = 0;
+        double gout17 = 0;
+        for (int ijkp = 0; ijkp < ijkprim; ++ijkp) {  // flattened loop over primitive triples (ip, jp, kp), kp fastest
+            int ijp = ijkp / kprim;
+            int kp = ijkp % kprim;
+            int ip = ijp / jprim;
+            int jp = ijp % jprim;
+            double ai = expi[ip];
+            double aj = expj[jp];
+            double ak = expk[kp];
+            double aij = ai + aj;
+            double cijk = ci[ip] * cj[jp] * ck[kp];  // product of the three contraction coefficients
+            double fac = PI_FAC * cijk / (aij*ak*sqrt(aij+ak));
+            double aj_aij = aj / aij;
+            double theta_ij = ai * aj_aij;  // ai*aj/(ai+aj)
+            double Kab = theta_ij * rr_ij;
+            double fac1 = fac * exp(-Kab);  // prefactor including the exp(-ai*aj/aij*|rj-ri|^2) factor
+            double xij = xjxi * aj_aij + xi;  // exponent-weighted centre (ai*ri + aj*rj)/aij of the i/j primitives
+            double yij = yjyi * aj_aij + yi;
+            double zij = zjzi * aj_aij + zi;
+            double xpq = xij - xk;  // vector from the k centre to the weighted i/j centre
+            double ypq = yij - yk;
+            double zpq = zij - zk;
+            double rr = xpq * xpq + ypq * ypq + zpq * zpq;
+            double theta = aij * ak / (aij + ak);
+            double theta_rr = theta * rr;  // argument passed to rys_roots
+            if (omega == 0) {  // no range separation: roots/weights used as returned
+                rys_roots(2, theta_rr, rw, nst_per_block, 0, 1);
+            } else if (omega > 0) {  // NOTE(review): presumably the long-range attenuated operator - confirm
+                double omega2 = omega * omega;
+                double theta_fac = omega2 / (omega2 + theta);
+                rys_roots(2, theta_fac*theta_rr, rw, nst_per_block, 0, 1);  // roots evaluated at the scaled argument
+                double sqrt_theta_fac = sqrt(theta_fac);
+                for (int irys = 0; irys < 2; ++irys) {
+                    rw[ irys*2   *nst_per_block] *= theta_fac;  // even slots (read back below as roots) scaled by theta_fac
+                    rw[(irys*2+1)*nst_per_block] *= sqrt_theta_fac;  // odd slots (read back below as weights) scaled by sqrt(theta_fac)
+                }
+            } else {  // omega < 0; NOTE(review): presumably the short-range operator (full minus long-range) - confirm
+                double omega2 = omega * omega;
+                double theta_fac = omega2 / (omega2 + theta);
+                double *rw1 = rw + 4*nst_per_block;  // second set placed after the first 4 slots; NOTE(review): needs 8 slots per thread of shared memory - confirm launch size
+                rys_roots(2, theta_rr, rw1, nst_per_block, 0, 1);  // unscaled set
+                rys_roots(2, theta_fac*theta_rr, rw, nst_per_block, 0, 1);  // scaled set
+                double sqrt_theta_fac = -sqrt(theta_fac);  // negated so the scaled set enters the sum with negative weights
+                for (int irys = 0; irys < 2; ++irys) {
+                    rw[ irys*2   *nst_per_block] *= theta_fac;
+                    rw[(irys*2+1)*nst_per_block] *= sqrt_theta_fac;
+                }
+            }
+            for (int irys = 0; irys < bounds.nroots; ++irys) {  // NOTE(review): bounds.nroots is presumably 2, or 4 when omega < 0 so both sets are summed - confirm against the host code
+                double wt = rw[(2*irys+1)*nst_per_block];  // weight stored in the odd slot
+                double rt = rw[ 2*irys   *nst_per_block];  // root stored in the even slot
+                double rt_aa = rt / (aij + ak);
+                double b00 = .5 * rt_aa;
+                double rt_ak = rt_aa * aij;
+                double cpx = xpq*rt_ak;  // first-order k-side coefficient along x
+                double rt_aij = rt_aa * ak;
+                double b10 = .5/aij * (1 - rt_aij);
+                double c0x = xjxi * aj_aij - xpq*rt_aij;  // first-order i-side coefficient along x
+                double trr_10x = c0x * 1;  // trr_ab*: a = order built with c0/b10, b = order built with cp/b00; the x factor starts from 1
+                double trr_20x = c0x * trr_10x + 1*b10 * 1;
+                double trr_21x = cpx * trr_20x + 2*b00 * trr_10x;
+                gout0 += trr_21x * fac1 * wt;  // each term is (x factor)*(y factor)*(z factor); fac1 rides on y and wt on z
+                double trr_11x = cpx * trr_10x + 1*b00 * 1;
+                double c0y = yjyi * aj_aij - ypq*rt_aij;
+                double trr_10y = c0y * fac1;  // the y factor carries the prefactor fac1
+                gout1 += trr_11x * trr_10y * wt;
+                double c0z = zjzi * aj_aij - zpq*rt_aij;
+                double trr_10z = c0z * wt;  // the z factor carries the quadrature weight
+                gout2 += trr_11x * fac1 * trr_10z;
+                double trr_01x = cpx * 1;
+                double trr_20y = c0y * trr_10y + 1*b10 * fac1;
+                gout3 += trr_01x * trr_20y * wt;
+                gout4 += trr_01x * trr_10y * trr_10z;
+                double trr_20z = c0z * trr_10z + 1*b10 * wt;
+                gout5 += trr_01x * fac1 * trr_20z;
+                double cpy = ypq*rt_ak;
+                double trr_01y = cpy * fac1;
+                gout6 += trr_20x * trr_01y * wt;
+                double trr_11y = cpy * trr_10y + 1*b00 * fac1;
+                gout7 += trr_10x * trr_11y * wt;
+                gout8 += trr_10x * trr_01y * trr_10z;
+                double trr_21y = cpy * trr_20y + 2*b00 * trr_10y;
+                gout9 += 1 * trr_21y * wt;
+                gout10 += 1 * trr_11y * trr_10z;
+                gout11 += 1 * trr_01y * trr_20z;
+                double cpz = zpq*rt_ak;
+                double trr_01z = cpz * wt;
+                gout12 += trr_20x * fac1 * trr_01z;
+                gout13 += trr_10x * trr_10y * trr_01z;
+                double trr_11z = cpz * trr_10z + 1*b00 * wt;
+                gout14 += trr_10x * fac1 * trr_11z;
+                gout15 += 1 * trr_20y * trr_01z;
+                gout16 += 1 * trr_10y * trr_11z;
+                double trr_21z = cpz * trr_20z + 2*b00 * trr_10z;
+                gout17 += 1 * fac1 * trr_21z;
+            }
+        }
+        int naux = bounds.naux;  // row stride of the output
+        double *eri_tensor = out + shl_pair_idx * 6 * naux + ksh_in_auxmol * 3;  // 6 rows per shell pair, 3 columns per k shell; NOTE(review): offset is computed in 32-bit int - confirm it cannot overflow for large buffers
+        eri_tensor[0*naux + 0] = gout0;  // stored transposed: element [row r, column c] receives gout[6*c + r]; rows follow the i-side (x,y,z) orders (2,0,0),(1,1,0),(1,0,1),(0,2,0),(0,1,1),(0,0,2)
+        eri_tensor[0*naux + 1] = gout6;
+        eri_tensor[0*naux + 2] = gout12;
+        eri_tensor[1*naux + 0] = gout1;
+        eri_tensor[1*naux + 1] = gout7;
+        eri_tensor[1*naux + 2] = gout13;
+        eri_tensor[2*naux + 0] = gout2;
+        eri_tensor[2*naux + 1] = gout8;
+        eri_tensor[2*naux + 2] = gout14;
+        eri_tensor[3*naux + 0] = gout3;
+        eri_tensor[3*naux + 1] = gout9;
+        eri_tensor[3*naux + 2] = gout15;
+        eri_tensor[4*naux + 0] = gout4;
+        eri_tensor[4*naux + 1] = gout10;
+        eri_tensor[4*naux + 2] = gout16;
+        eri_tensor[5*naux + 0] = gout5;
+        eri_tensor[5*naux + 1] = gout11;
+        eri_tensor[5*naux + 2] = gout17;
+    }
+}
+
+__global__
+void int3c2e_211(double *out, Int3c2eEnvVars envs, Int3c2eBounds bounds)
+{
+    int nst_per_block = blockDim.x;
+    int st_id = threadIdx.x;
+    int batch_id = blockIdx.x;
+    int iprim = bounds.iprim;
+    int jprim = bounds.jprim;
+    int kprim = bounds.kprim;
+    int ijprim = iprim * jprim;
+    int ijkprim = ijprim * kprim;
+    int *bas = envs.bas;
+    int nbas = envs.nbas;
+    double *env = envs.env;
+    double omega = env[PTR_RANGE_OMEGA];
+    extern __shared__ double rw_cache[];
+    double *rw = rw_cache + st_id;
+
+    int nksh = bounds.nksh;
+    int nshl_pair = bounds.nshl_pair;
+    int nst = nksh * nshl_pair;
+    int st0 = batch_id * nst_per_block * BATCHES_PER_BLOCK;
+    int st1 = MIN(nst, st0 + nst_per_block * BATCHES_PER_BLOCK);
+    for (int ijk_idx = st0 + st_id; ijk_idx < st1; ijk_idx += nst_per_block) {
+        int ksh_in_auxmol = ijk_idx % nksh;
+        int ksh = ksh_in_auxmol + bounds.ksh0;
+        int shl_pair_idx = ijk_idx / nksh;
+        int bas_ij = bounds.bas_ij_idx[shl_pair_idx];
+        int ish = bas_ij / nbas;
+        int jsh = bas_ij % nbas;
+        double *expi = env + bas[ish*BAS_SLOTS+PTR_EXP];
+        double *expj = env + bas[jsh*BAS_SLOTS+PTR_EXP];
+        double *expk = env + bas[ksh*BAS_SLOTS+PTR_EXP];
+        double *ci = env + bas[ish*BAS_SLOTS+PTR_COEFF];
+        double *cj = env + bas[jsh*BAS_SLOTS+PTR_COEFF];
+        double *ck = env + bas[ksh*BAS_SLOTS+PTR_COEFF];
+        double *ri = env + bas[ish*BAS_SLOTS+PTR_BAS_COORD];
+        double *rj = env + bas[jsh*BAS_SLOTS+PTR_BAS_COORD];
+        double *rk = env + bas[ksh*BAS_SLOTS+PTR_BAS_COORD];
+        double xi = ri[0];
+        double yi = ri[1];
+        double zi = ri[2];
+        double xk = rk[0];
+        double yk = rk[1];
+        double zk = rk[2];
+        double xjxi = rj[0] - xi;
+        double yjyi = rj[1] - yi;
+        double zjzi = rj[2] - zi;
+        double rr_ij = xjxi*xjxi + yjyi*yjyi + zjzi*zjzi;
+        double gout0 = 0;
+        double gout1 = 0;
+        double gout2 = 0;
+        double gout3 = 0;
+        double gout4 = 0;
+        double gout5 = 0;
+        double gout6 = 0;
+        double gout7 = 0;
+        double gout8 = 0;
+        double gout9 = 0;
+        double gout10 = 0;
+        double gout11 = 0;
+        double gout12 = 0;
+        double gout13 = 0;
+        double gout14 = 0;
+        double gout15 = 0;
+        double gout16 = 0;
+        double gout17 = 0;
+        double gout18 = 0;
+        double gout19 = 0;
+        double gout20 = 0;
+        double gout21 = 0;
+        double gout22 = 0;
+        double gout23 = 0;
+        double gout24 = 0;
+        double gout25 = 0;
+        double gout26 = 0;
+        double gout27 = 0;
+        double gout28 = 0;
+        double gout29 = 0;
+        double gout30 = 0;
+        double gout31 = 0;
+        double gout32 = 0;
+        double gout33 = 0;
+        double gout34 = 0;
+        double gout35 = 0;
+        double gout36 = 0;
+        double gout37 = 0;
+        double gout38 = 0;
+        double gout39 = 0;
+        double gout40 = 0;
+        double gout41 = 0;
+        double gout42 = 0;
+        double gout43 = 0;
+        double gout44 = 0;
+        double gout45 = 0;
+        double gout46 = 0;
+        double gout47 = 0;
+        double gout48 = 0;
+        double gout49 = 0;
+        double gout50 = 0;
+        double gout51 = 0;
+        double gout52 = 0;
+        double gout53 = 0;
+        for (int ijkp = 0; ijkp < ijkprim; ++ijkp) {
+            int ijp = ijkp / kprim;
+            int kp = ijkp % kprim;
+            int ip = ijp / jprim;
+            int jp = ijp % jprim;
+            double ai = expi[ip];
+            double aj = expj[jp];
+            double ak = expk[kp];
+            double aij = ai + aj;
+            double cijk = ci[ip] * cj[jp] * ck[kp];
+            double fac = PI_FAC * cijk / (aij*ak*sqrt(aij+ak));
+            double aj_aij = aj / aij;
+            double theta_ij = ai * aj_aij;
+            double Kab = theta_ij * rr_ij;
+            double fac1 = fac * exp(-Kab);
+            double xij = xjxi * aj_aij + xi;
+            double yij = yjyi * aj_aij + yi;
+            double zij = zjzi * aj_aij + zi;
+            double xpq = xij - xk;
+            double ypq = yij - yk;
+            double zpq = zij - zk;
+            double rr = xpq * xpq + ypq * ypq + zpq * zpq;
+            double theta = aij * ak / (aij + ak);
+            double theta_rr = theta * rr;
+            if (omega == 0) {
+                rys_roots(3, theta_rr, rw, nst_per_block, 0, 1);
+            } else if (omega > 0) {
+                double omega2 = omega * omega;
+                double theta_fac = omega2 / (omega2 + theta);
+                rys_roots(3, theta_fac*theta_rr, rw, nst_per_block, 0, 1);
+                double sqrt_theta_fac = sqrt(theta_fac);
+                for (int irys = 0; irys < 3; ++irys) {
+                    rw[ irys*2   *nst_per_block] *= theta_fac;
+                    rw[(irys*2+1)*nst_per_block] *= sqrt_theta_fac;
+                }
+            } else {
+                double omega2 = omega * omega;
+                double theta_fac = omega2 / (omega2 + theta);
+                double *rw1 = rw + 6*nst_per_block;
+                rys_roots(3, theta_rr, rw1, nst_per_block, 0, 1);
+                rys_roots(3, theta_fac*theta_rr, rw, nst_per_block, 0, 1);
+                double sqrt_theta_fac = -sqrt(theta_fac);
+                for (int irys = 0; irys < 3; ++irys) {
+                    rw[ irys*2   *nst_per_block] *= theta_fac;
+                    rw[(irys*2+1)*nst_per_block] *= sqrt_theta_fac;
+                }
+            }
+            for (int irys = 0; irys < bounds.nroots; ++irys) {
+                double wt = rw[(2*irys+1)*nst_per_block];
+                double rt = rw[ 2*irys   *nst_per_block];
+                double rt_aa = rt / (aij + ak);
+                double b00 = .5 * rt_aa;
+                double rt_ak = rt_aa * aij;
+                double cpx = xpq*rt_ak;
+                double rt_aij = rt_aa * ak;
+                double b10 = .5/aij * (1 - rt_aij);
+                double c0x = xjxi * aj_aij - xpq*rt_aij;
+                double trr_10x = c0x * 1;
+                double trr_20x = c0x * trr_10x + 1*b10 * 1;
+                double trr_30x = c0x * trr_20x + 2*b10 * trr_10x;
+                double trr_31x = cpx * trr_30x + 3*b00 * trr_20x;
+                double trr_21x = cpx * trr_20x + 2*b00 * trr_10x;
+                double hrr_211x = trr_31x - xjxi * trr_21x;
+                gout0 += hrr_211x * fac1 * wt;
+                double trr_11x = cpx * trr_10x + 1*b00 * 1;
+                double hrr_111x = trr_21x - xjxi * trr_11x;
+                double c0y = yjyi * aj_aij - ypq*rt_aij;
+                double trr_10y = c0y * fac1;
+                gout1 += hrr_111x * trr_10y * wt;
+                double c0z = zjzi * aj_aij - zpq*rt_aij;
+                double trr_10z = c0z * wt;
+                gout2 += hrr_111x * fac1 * trr_10z;
+                double trr_01x = cpx * 1;
+                double hrr_011x = trr_11x - xjxi * trr_01x;
+                double trr_20y = c0y * trr_10y + 1*b10 * fac1;
+                gout3 += hrr_011x * trr_20y * wt;
+                gout4 += hrr_011x * trr_10y * trr_10z;
+                double trr_20z = c0z * trr_10z + 1*b10 * wt;
+                gout5 += hrr_011x * fac1 * trr_20z;
+                double hrr_010y = trr_10y - yjyi * fac1;
+                gout6 += trr_21x * hrr_010y * wt;
+                double hrr_110y = trr_20y - yjyi * trr_10y;
+                gout7 += trr_11x * hrr_110y * wt;
+                gout8 += trr_11x * hrr_010y * trr_10z;
+                double trr_30y = c0y * trr_20y + 2*b10 * trr_10y;
+                double hrr_210y = trr_30y - yjyi * trr_20y;
+                gout9 += trr_01x * hrr_210y * wt;
+                gout10 += trr_01x * hrr_110y * trr_10z;
+                gout11 += trr_01x * hrr_010y * trr_20z;
+                double hrr_010z = trr_10z - zjzi * wt;
+                gout12 += trr_21x * fac1 * hrr_010z;
+                gout13 += trr_11x * trr_10y * hrr_010z;
+                double hrr_110z = trr_20z - zjzi * trr_10z;
+                gout14 += trr_11x * fac1 * hrr_110z;
+                gout15 += trr_01x * trr_20y * hrr_010z;
+                gout16 += trr_01x * trr_10y * hrr_110z;
+                double trr_30z = c0z * trr_20z + 2*b10 * trr_10z;
+                double hrr_210z = trr_30z - zjzi * trr_20z;
+                gout17 += trr_01x * fac1 * hrr_210z;
+                double hrr_210x = trr_30x - xjxi * trr_20x;
+                double cpy = ypq*rt_ak;
+                double trr_01y = cpy * fac1;
+                gout18 += hrr_210x * trr_01y * wt;
+                double hrr_110x = trr_20x - xjxi * trr_10x;
+                double trr_11y = cpy * trr_10y + 1*b00 * fac1;
+                gout19 += hrr_110x * trr_11y * wt;
+                gout20 += hrr_110x * trr_01y * trr_10z;
+                double hrr_010x = trr_10x - xjxi * 1;
+                double trr_21y = cpy * trr_20y + 2*b00 * trr_10y;
+                gout21 += hrr_010x * trr_21y * wt;
+                gout22 += hrr_010x * trr_11y * trr_10z;
+                gout23 += hrr_010x * trr_01y * trr_20z;
+                double hrr_011y = trr_11y - yjyi * trr_01y;
+                gout24 += trr_20x * hrr_011y * wt;
+                double hrr_111y = trr_21y - yjyi * trr_11y;
+                gout25 += trr_10x * hrr_111y * wt;
+                gout26 += trr_10x * hrr_011y * trr_10z;
+                double trr_31y = cpy * trr_30y + 3*b00 * trr_20y;
+                double hrr_211y = trr_31y - yjyi * trr_21y;
+                gout27 += 1 * hrr_211y * wt;
+                gout28 += 1 * hrr_111y * trr_10z;
+                gout29 += 1 * hrr_011y * trr_20z;
+                gout30 += trr_20x * trr_01y * hrr_010z;
+                gout31 += trr_10x * trr_11y * hrr_010z;
+                gout32 += trr_10x * trr_01y * hrr_110z;
+                gout33 += 1 * trr_21y * hrr_010z;
+                gout34 += 1 * trr_11y * hrr_110z;
+                gout35 += 1 * trr_01y * hrr_210z;
+                double cpz = zpq*rt_ak;
+                double trr_01z = cpz * wt;
+                gout36 += hrr_210x * fac1 * trr_01z;
+                gout37 += hrr_110x * trr_10y * trr_01z;
+                double trr_11z = cpz * trr_10z + 1*b00 * wt;
+                gout38 += hrr_110x * fac1 * trr_11z;
+                gout39 += hrr_010x * trr_20y * trr_01z;
+                gout40 += hrr_010x * trr_10y * trr_11z;
+                double trr_21z = cpz * trr_20z + 2*b00 * trr_10z;
+                gout41 += hrr_010x * fac1 * trr_21z;
+                gout42 += trr_20x * hrr_010y * trr_01z;
+                gout43 += trr_10x * hrr_110y * trr_01z;
+                gout44 += trr_10x * hrr_010y * trr_11z;
+                gout45 += 1 * hrr_210y * trr_01z;
+                gout46 += 1 * hrr_110y * trr_11z;
+                gout47 += 1 * hrr_010y * trr_21z;
+                double hrr_011z = trr_11z - zjzi * trr_01z;
+                gout48 += trr_20x * fac1 * hrr_011z;
+                gout49 += trr_10x * trr_10y * hrr_011z;
+                double hrr_111z = trr_21z - zjzi * trr_11z;
+                gout50 += trr_10x * fac1 * hrr_111z;
+                gout51 += 1 * trr_20y * hrr_011z;
+                gout52 += 1 * trr_10y * hrr_111z;
+                double trr_31z = cpz * trr_30z + 3*b00 * trr_20z;
+                double hrr_211z = trr_31z - zjzi * trr_21z;
+                gout53 += 1 * fac1 * hrr_211z;
+            }
+        }
+        int naux = bounds.naux;
+        double *eri_tensor = out + shl_pair_idx * 18 * naux + ksh_in_auxmol * 3;
+        eri_tensor[0*naux + 0] = gout0;
+        eri_tensor[0*naux + 1] = gout18;
+        eri_tensor[0*naux + 2] = gout36;
+        eri_tensor[1*naux + 0] = gout1;
+        eri_tensor[1*naux + 1] = gout19;
+        eri_tensor[1*naux + 2] = gout37;
+        eri_tensor[2*naux + 0] = gout2;
+        eri_tensor[2*naux + 1] = gout20;
+        eri_tensor[2*naux + 2] = gout38;
+        eri_tensor[3*naux + 0] = gout3;
+        eri_tensor[3*naux + 1] = gout21;
+        eri_tensor[3*naux + 2] = gout39;
+        eri_tensor[4*naux + 0] = gout4;
+        eri_tensor[4*naux + 1] = gout22;
+        eri_tensor[4*naux + 2] = gout40;
+        eri_tensor[5*naux + 0] = gout5;
+        eri_tensor[5*naux + 1] = gout23;
+        eri_tensor[5*naux + 2] = gout41;
+        eri_tensor[6*naux + 0] = gout6;
+        eri_tensor[6*naux + 1] = gout24;
+        eri_tensor[6*naux + 2] = gout42;
+        eri_tensor[7*naux + 0] = gout7;
+        eri_tensor[7*naux + 1] = gout25;
+        eri_tensor[7*naux + 2] = gout43;
+        eri_tensor[8*naux + 0] = gout8;
+        eri_tensor[8*naux + 1] = gout26;
+        eri_tensor[8*naux + 2] = gout44;
+        eri_tensor[9*naux + 0] = gout9;
+        eri_tensor[9*naux + 1] = gout27;
+        eri_tensor[9*naux + 2] = gout45;
+        eri_tensor[10*naux + 0] = gout10;
+        eri_tensor[10*naux + 1] = gout28;
+        eri_tensor[10*naux + 2] = gout46;
+        eri_tensor[11*naux + 0] = gout11;
+        eri_tensor[11*naux + 1] = gout29;
+        eri_tensor[11*naux + 2] = gout47;
+        eri_tensor[12*naux + 0] = gout12;
+        eri_tensor[12*naux + 1] = gout30;
+        eri_tensor[12*naux + 2] = gout48;
+        eri_tensor[13*naux + 0] = gout13;
+        eri_tensor[13*naux + 1] = gout31;
+        eri_tensor[13*naux + 2] = gout49;
+        eri_tensor[14*naux + 0] = gout14;
+        eri_tensor[14*naux + 1] = gout32;
+        eri_tensor[14*naux + 2] = gout50;
+        eri_tensor[15*naux + 0] = gout15;
+        eri_tensor[15*naux + 1] = gout33;
+        eri_tensor[15*naux + 2] = gout51;
+        eri_tensor[16*naux + 0] = gout16;
+        eri_tensor[16*naux + 1] = gout34;
+        eri_tensor[16*naux + 2] = gout52;
+        eri_tensor[17*naux + 0] = gout17;
+        eri_tensor[17*naux + 1] = gout35;
+        eri_tensor[17*naux + 2] = gout53;
+    }
+}
+
+__global__  // Kernel: each block walks (shell-pair, k-shell) tasks in groups of 64; four threadIdx.y lanes cooperate on one task.
+void int3c2e_221(double *out, Int3c2eEnvVars envs, Int3c2eBounds bounds)  // NOTE(review): name suggests angular momenta (2,2,1) and the code assumes blockDim = (64, 4) - confirm against the launcher.
+{
+    int st_id = threadIdx.x;  // task slot inside the block; also the column offset into shared memory
+    int gout_id = threadIdx.y;  // lane within the task; the switch statements below handle lanes 0..3
+    int batch_id = blockIdx.x;
+    int iprim = bounds.iprim;
+    int jprim = bounds.jprim;
+    int kprim = bounds.kprim;
+    int ijprim = iprim * jprim;
+    int ijkprim = ijprim * kprim;
+    int nroots = bounds.nroots;
+    int *bas = envs.bas;
+    int nbas = envs.nbas;
+    double *env = envs.env;
+    double omega = env[PTR_RANGE_OMEGA];  // omega == 0, > 0 and < 0 select three different root/weight paths below
+    extern __shared__ double rw_cache[];  // dynamic shared memory; every per-task array is strided by 64 doubles
+    double *rw = rw_cache + st_id;  // root i is read at rw[i*128], its weight at rw[i*128+64]
+    double *gx = rw + nroots * 128;  // x table starts after nroots (root, weight) pairs
+    double *gy = gx + 1152;  // each table spans 1152 doubles = 18 entries x 64 slots
+    double *gz = gy + 1152;
+    double *Rpq = gz + 1152;  // three components at offsets 0, 64, 128
+    double *rjri = Rpq + 192;  // rj - ri components at 0, 64, 128 and its squared length at 192
+    if (gout_id == 0) {
+        gx[0] = 1.;  // seed of the x table; gy[0] and gz[0] are seeded per primitive / per root below
+    }
+
+    int nksh = bounds.nksh;
+    int nshl_pair = bounds.nshl_pair;
+    int nst = nksh * nshl_pair;  // total number of tasks
+    int st0 = batch_id * 64 * BATCHES_PER_BLOCK;
+    int st1 = MIN(nst, st0 + 64 * BATCHES_PER_BLOCK);
+    for (int ijk_idx = st0+st_id; ijk_idx < st1+st_id; ijk_idx += 64) {  // bound shifted by st_id: every thread runs the same number of iterations, as the barriers below require
+        int ksh_in_auxmol = ijk_idx % nksh;
+        int ksh = ksh_in_auxmol + bounds.ksh0;
+        int shl_pair_idx = ijk_idx / nksh;
+        __syncthreads();
+        if (ijk_idx >= nst) {  // padding slot past the last task: still executes the loop body but is never stored
+            shl_pair_idx = st0 / nksh;  // fall back to an in-range shell pair so the loads below stay valid
+            if (gout_id == 0) {
+                gx[0] = 0.;  // zero seed; every gx entry built below is a multiple of it
+            }
+        }
+        int bas_ij = bounds.bas_ij_idx[shl_pair_idx];  // packed pair index: ish * nbas + jsh
+        int ish = bas_ij / nbas;
+        int jsh = bas_ij % nbas;
+        double *expi = env + bas[ish*BAS_SLOTS+PTR_EXP];
+        double *expj = env + bas[jsh*BAS_SLOTS+PTR_EXP];
+        double *expk = env + bas[ksh*BAS_SLOTS+PTR_EXP];
+        double *ci = env + bas[ish*BAS_SLOTS+PTR_COEFF];
+        double *cj = env + bas[jsh*BAS_SLOTS+PTR_COEFF];
+        double *ck = env + bas[ksh*BAS_SLOTS+PTR_COEFF];
+        double *ri = env + bas[ish*BAS_SLOTS+PTR_BAS_COORD];
+        double *rj = env + bas[jsh*BAS_SLOTS+PTR_BAS_COORD];
+        double *rk = env + bas[ksh*BAS_SLOTS+PTR_BAS_COORD];
+        if (gout_id == 0) {  // lane 0 publishes rj - ri and its squared length for the other lanes
+            double xjxi = rj[0] - ri[0];
+            double yjyi = rj[1] - ri[1];
+            double zjzi = rj[2] - ri[2];
+            double rr_ij = xjxi*xjxi + yjyi*yjyi + zjzi*zjzi;
+            rjri[0] = xjxi;
+            rjri[64] = yjyi;
+            rjri[128] = zjzi;
+            rjri[192] = rr_ij;
+        }
+        double gout0 = 0;  // 27 accumulators per lane; 4 lanes x 27 = 108 = 36 x 3 output elements per task
+        double gout1 = 0;
+        double gout2 = 0;
+        double gout3 = 0;
+        double gout4 = 0;
+        double gout5 = 0;
+        double gout6 = 0;
+        double gout7 = 0;
+        double gout8 = 0;
+        double gout9 = 0;
+        double gout10 = 0;
+        double gout11 = 0;
+        double gout12 = 0;
+        double gout13 = 0;
+        double gout14 = 0;
+        double gout15 = 0;
+        double gout16 = 0;
+        double gout17 = 0;
+        double gout18 = 0;
+        double gout19 = 0;
+        double gout20 = 0;
+        double gout21 = 0;
+        double gout22 = 0;
+        double gout23 = 0;
+        double gout24 = 0;
+        double gout25 = 0;
+        double gout26 = 0;
+        double s0, s1, s2;  // scratch values for the table recurrences
+        for (int ijkp = 0; ijkp < ijkprim; ++ijkp) {  // flattened loop over all (ip, jp, kp) primitive combinations
+            int ijp = ijkp / kprim;
+            int kp = ijkp % kprim;
+            int ip = ijp / jprim;
+            int jp = ijp % jprim;
+            double ai = expi[ip];
+            double aj = expj[jp];
+            double ak = expk[kp];
+            double aij = ai + aj;
+            double aj_aij = aj / aij;
+            __syncthreads();  // barrier before rjri is read and before gy[0] / Rpq are overwritten for this primitive
+            double xij = rjri[0] * aj_aij + ri[0];  // (xij, yij, zij) = ri + (rj - ri) * aj / aij
+            double yij = rjri[64] * aj_aij + ri[1];
+            double zij = rjri[128] * aj_aij + ri[2];
+            double xpq = xij - rk[0];
+            double ypq = yij - rk[1];
+            double zpq = zij - rk[2];
+            if (gout_id == 0) {
+                double cijk = ci[ip] * cj[jp] * ck[kp];
+                double fac = PI_FAC * cijk / (aij*ak*sqrt(aij+ak));
+                double theta_ij = ai * aj_aij;
+                double Kab = theta_ij * rjri[192];
+                gy[0] = fac * exp(-Kab);  // per-primitive prefactor becomes the seed of the y table
+                Rpq[0] = xpq;
+                Rpq[64] = ypq;
+                Rpq[128] = zpq;
+            }
+            double rr = xpq * xpq + ypq * ypq + zpq * zpq;
+            double theta = aij * ak / (aij + ak);
+            double theta_rr = theta * rr;
+            if (omega == 0) {
+                rys_roots(3, theta_rr, rw, 64, gout_id, 4);  // rys_roots is defined elsewhere; presumably fills 3 (root, weight) pairs in rw with the work split over the 4 lanes - TODO confirm
+            } else if (omega > 0) {
+                double omega2 = omega * omega;
+                double theta_fac = omega2 / (omega2 + theta);
+                rys_roots(3, theta_fac*theta_rr, rw, 64, gout_id, 4);
+                __syncthreads();  // rw must be complete before it is rescaled
+                double sqrt_theta_fac = sqrt(theta_fac);
+                for (int irys = gout_id; irys < 3; irys+=4) {  // roots scaled by theta_fac, weights by sqrt(theta_fac)
+                    rw[ irys*2   *64] *= theta_fac;
+                    rw[(irys*2+1)*64] *= sqrt_theta_fac;
+                }
+            } else {
+                double omega2 = omega * omega;
+                double theta_fac = omega2 / (omega2 + theta);
+                double *rw1 = rw + 384;  // second set of 3 (root, weight) pairs, placed 6*64 doubles after the first
+                rys_roots(3, theta_rr, rw1, 64, gout_id, 4);
+                rys_roots(3, theta_fac*theta_rr, rw, 64, gout_id, 4);
+                __syncthreads();
+                double sqrt_theta_fac = -sqrt(theta_fac);  // negative sign: weights of the rescaled set enter with opposite sign
+                for (int irys = gout_id; irys < 3; irys+=4) {
+                    rw[ irys*2   *64] *= theta_fac;
+                    rw[(irys*2+1)*64] *= sqrt_theta_fac;
+                }
+            }
+            for (int irys = 0; irys < nroots; ++irys) {  // NOTE(review): nroots is presumably 3 for omega >= 0 and 6 for omega < 0 - confirm with the launcher
+                __syncthreads();  // reads of the previous root's tables must finish before they are rebuilt
+                double rt = rw[irys*128];
+                double rt_aa = rt / (aij + ak);
+                double rt_aij = rt_aa * ak;
+                double b10 = .5/aij * (1 - rt_aij);
+                double rt_ak = rt_aa * aij;
+                double b00 = .5 * rt_aa;
+                for (int n = gout_id; n < 3; n += 4) {  // lanes 0, 1, 2 build the x, y, z tables; lane 3 skips this loop
+                    if (n == 2) {
+                        gz[0] = rw[irys*128+64];  // weight of this root seeds the z table
+                    }
+                    double *_gx = gx + n * 1152;  // selects gx, gy or gz
+                    double xjxi = rjri[n * 64];
+                    double Rpa = xjxi * aj_aij;
+                    double c0x = Rpa - rt_aij * Rpq[n * 64];
+                    s0 = _gx[0];  // first-index recurrence with c0x and b10: fills entries 64, 128, 192, 256
+                    s1 = c0x * s0;
+                    _gx[64] = s1;
+                    s2 = c0x * s1 + 1 * b10 * s0;
+                    _gx[128] = s2;
+                    s0 = s1;
+                    s1 = s2;
+                    s2 = c0x * s1 + 2 * b10 * s0;
+                    _gx[192] = s2;
+                    s0 = s1;
+                    s1 = s2;
+                    s2 = c0x * s1 + 3 * b10 * s0;
+                    _gx[256] = s2;
+                    double cpx = rt_ak * Rpq[n * 64];
+                    s0 = _gx[0];  // second-index step with cpx and b00: fills entries 576, 640, 704, 768, 832
+                    s1 = cpx * s0;
+                    _gx[576] = s1;
+                    s0 = _gx[64];
+                    s1 = cpx * s0;
+                    s1 += 1 * b00 * _gx[0];
+                    _gx[640] = s1;
+                    s0 = _gx[128];
+                    s1 = cpx * s0;
+                    s1 += 2 * b00 * _gx[64];
+                    _gx[704] = s1;
+                    s0 = _gx[192];
+                    s1 = cpx * s0;
+                    s1 += 3 * b00 * _gx[128];
+                    _gx[768] = s1;
+                    s0 = _gx[256];
+                    s1 = cpx * s0;
+                    s1 += 4 * b00 * _gx[192];
+                    _gx[832] = s1;
+                    s1 = _gx[256];  // in-place difference passes (value - xjxi * lower value), highest entry first so inputs are read before being overwritten
+                    s0 = _gx[192];
+                    _gx[384] = s1 - xjxi * s0;
+                    s1 = s0;
+                    s0 = _gx[128];
+                    _gx[320] = s1 - xjxi * s0;
+                    s1 = s0;
+                    s0 = _gx[64];
+                    _gx[256] = s1 - xjxi * s0;
+                    s1 = s0;
+                    s0 = _gx[0];
+                    _gx[192] = s1 - xjxi * s0;
+                    s1 = _gx[384];  // second difference pass over entries 192..384, writing 384, 448, 512
+                    s0 = _gx[320];
+                    _gx[512] = s1 - xjxi * s0;
+                    s1 = s0;
+                    s0 = _gx[256];
+                    _gx[448] = s1 - xjxi * s0;
+                    s1 = s0;
+                    s0 = _gx[192];
+                    _gx[384] = s1 - xjxi * s0;
+                    s1 = _gx[832];  // same two passes for the block that starts at entry 576
+                    s0 = _gx[768];
+                    _gx[960] = s1 - xjxi * s0;
+                    s1 = s0;
+                    s0 = _gx[704];
+                    _gx[896] = s1 - xjxi * s0;
+                    s1 = s0;
+                    s0 = _gx[640];
+                    _gx[832] = s1 - xjxi * s0;
+                    s1 = s0;
+                    s0 = _gx[576];
+                    _gx[768] = s1 - xjxi * s0;
+                    s1 = _gx[960];
+                    s0 = _gx[896];
+                    _gx[1088] = s1 - xjxi * s0;
+                    s1 = s0;
+                    s0 = _gx[832];
+                    _gx[1024] = s1 - xjxi * s0;
+                    s1 = s0;
+                    s0 = _gx[768];
+                    _gx[960] = s1 - xjxi * s0;
+                }
+                __syncthreads();  // all three tables must be complete before any lane reads them
+                switch (gout_id) {  // each lane accumulates its own 27 products of one gx, one gy and one gz entry
+                case 0:
+                gout0 += gx[1088] * gy[0] * gz[0];
+                gout1 += gx[448] * gy[640] * gz[0];
+                gout2 += gx[448] * gy[0] * gz[640];
+                gout3 += gx[960] * gy[64] * gz[64];
+                gout4 += gx[384] * gy[576] * gz[128];
+                gout5 += gx[320] * gy[192] * gz[576];
+                gout6 += gx[832] * gy[192] * gz[64];
+                gout7 += gx[192] * gy[896] * gz[0];
+                gout8 += gx[192] * gy[256] * gz[640];
+                gout9 += gx[896] * gy[0] * gz[192];
+                gout10 += gx[256] * gy[640] * gz[192];
+                gout11 += gx[256] * gy[0] * gz[832];
+                gout12 += gx[768] * gy[64] * gz[256];
+                gout13 += gx[192] * gy[576] * gz[320];
+                gout14 += gx[128] * gy[384] * gz[576];
+                gout15 += gx[640] * gy[384] * gz[64];
+                gout16 += gx[0] * gy[1088] * gz[0];
+                gout17 += gx[0] * gy[448] * gz[640];
+                gout18 += gx[704] * gy[192] * gz[192];
+                gout19 += gx[64] * gy[832] * gz[192];
+                gout20 += gx[64] * gy[192] * gz[832];
+                gout21 += gx[576] * gy[256] * gz[256];
+                gout22 += gx[0] * gy[768] * gz[320];
+                gout23 += gx[128] * gy[0] * gz[960];
+                gout24 += gx[640] * gy[0] * gz[448];
+                gout25 += gx[0] * gy[704] * gz[384];
+                gout26 += gx[0] * gy[64] * gz[1024];
+                break;
+                case 1:
+                gout0 += gx[512] * gy[576] * gz[0];
+                gout1 += gx[448] * gy[64] * gz[576];
+                gout2 += gx[960] * gy[128] * gz[0];
+                gout3 += gx[384] * gy[640] * gz[64];
+                gout4 += gx[384] * gy[0] * gz[704];
+                gout5 += gx[832] * gy[256] * gz[0];
+                gout6 += gx[256] * gy[768] * gz[64];
+                gout7 += gx[192] * gy[320] * gz[576];
+                gout8 += gx[768] * gy[192] * gz[128];
+                gout9 += gx[320] * gy[576] * gz[192];
+                gout10 += gx[256] * gy[64] * gz[768];
+                gout11 += gx[768] * gy[128] * gz[192];
+                gout12 += gx[192] * gy[640] * gz[256];
+                gout13 += gx[192] * gy[0] * gz[896];
+                gout14 += gx[640] * gy[448] * gz[0];
+                gout15 += gx[64] * gy[960] * gz[64];
+                gout16 += gx[0] * gy[512] * gz[576];
+                gout17 += gx[576] * gy[384] * gz[128];
+                gout18 += gx[128] * gy[768] * gz[192];
+                gout19 += gx[64] * gy[256] * gz[768];
+                gout20 += gx[576] * gy[320] * gz[192];
+                gout21 += gx[0] * gy[832] * gz[256];
+                gout22 += gx[0] * gy[192] * gz[896];
+                gout23 += gx[640] * gy[64] * gz[384];
+                gout24 += gx[64] * gy[576] * gz[448];
+                gout25 += gx[0] * gy[128] * gz[960];
+                gout26 += gx[576] * gy[0] * gz[512];
+                break;
+                case 2:
+                gout0 += gx[512] * gy[0] * gz[576];
+                gout1 += gx[1024] * gy[0] * gz[64];
+                gout2 += gx[384] * gy[704] * gz[0];
+                gout3 += gx[384] * gy[64] * gz[640];
+                gout4 += gx[896] * gy[192] * gz[0];
+                gout5 += gx[256] * gy[832] * gz[0];
+                gout6 += gx[256] * gy[192] * gz[640];
+                gout7 += gx[768] * gy[256] * gz[64];
+                gout8 += gx[192] * gy[768] * gz[128];
+                gout9 += gx[320] * gy[0] * gz[768];
+                gout10 += gx[832] * gy[0] * gz[256];
+                gout11 += gx[192] * gy[704] * gz[192];
+                gout12 += gx[192] * gy[64] * gz[832];
+                gout13 += gx[704] * gy[384] * gz[0];
+                gout14 += gx[64] * gy[1024] * gz[0];
+                gout15 += gx[64] * gy[384] * gz[640];
+                gout16 += gx[576] * gy[448] * gz[64];
+                gout17 += gx[0] * gy[960] * gz[128];
+                gout18 += gx[128] * gy[192] * gz[768];
+                gout19 += gx[640] * gy[192] * gz[256];
+                gout20 += gx[0] * gy[896] * gz[192];
+                gout21 += gx[0] * gy[256] * gz[832];
+                gout22 += gx[704] * gy[0] * gz[384];
+                gout23 += gx[64] * gy[640] * gz[384];
+                gout24 += gx[64] * gy[0] * gz[1024];
+                gout25 += gx[576] * gy[64] * gz[448];
+                gout26 += gx[0] * gy[576] * gz[512];
+                break;
+                case 3:
+                gout0 += gx[1024] * gy[64] * gz[0];
+                gout1 += gx[448] * gy[576] * gz[64];
+                gout2 += gx[384] * gy[128] * gz[576];
+                gout3 += gx[960] * gy[0] * gz[128];
+                gout4 += gx[320] * gy[768] * gz[0];
+                gout5 += gx[256] * gy[256] * gz[576];
+                gout6 += gx[768] * gy[320] * gz[0];
+                gout7 += gx[192] * gy[832] * gz[64];
+                gout8 += gx[192] * gy[192] * gz[704];
+                gout9 += gx[832] * gy[64] * gz[192];
+                gout10 += gx[256] * gy[576] * gz[256];
+                gout11 += gx[192] * gy[128] * gz[768];
+                gout12 += gx[768] * gy[0] * gz[320];
+                gout13 += gx[128] * gy[960] * gz[0];
+                gout14 += gx[64] * gy[448] * gz[576];
+                gout15 += gx[576] * gy[512] * gz[0];
+                gout16 += gx[0] * gy[1024] * gz[64];
+                gout17 += gx[0] * gy[384] * gz[704];
+                gout18 += gx[640] * gy[256] * gz[192];
+                gout19 += gx[64] * gy[768] * gz[256];
+                gout20 += gx[0] * gy[320] * gz[768];
+                gout21 += gx[576] * gy[192] * gz[320];
+                gout22 += gx[128] * gy[576] * gz[384];
+                gout23 += gx[64] * gy[64] * gz[960];
+                gout24 += gx[576] * gy[128] * gz[384];
+                gout25 += gx[0] * gy[640] * gz[448];
+                gout26 += gx[0] * gy[0] * gz[1088];
+                break;
+                }
+            }
+        }
+        if (ijk_idx < nst) {  // padding slots skip the store
+            int naux = bounds.naux;
+            double *eri_tensor = out + shl_pair_idx * 36 * naux + ksh_in_auxmol * 3;  // 36 rows of length naux per shell pair, 3 consecutive columns per k shell
+            switch (gout_id) {  // accumulator m of lane g is flat element e = 4*m + g, stored at row e/3, column e%3
+            case 0:
+            eri_tensor[0*naux + 0] = gout0;
+            eri_tensor[1*naux + 1] = gout1;
+            eri_tensor[2*naux + 2] = gout2;
+            eri_tensor[4*naux + 0] = gout3;
+            eri_tensor[5*naux + 1] = gout4;
+            eri_tensor[6*naux + 2] = gout5;
+            eri_tensor[8*naux + 0] = gout6;
+            eri_tensor[9*naux + 1] = gout7;
+            eri_tensor[10*naux + 2] = gout8;
+            eri_tensor[12*naux + 0] = gout9;
+            eri_tensor[13*naux + 1] = gout10;
+            eri_tensor[14*naux + 2] = gout11;
+            eri_tensor[16*naux + 0] = gout12;
+            eri_tensor[17*naux + 1] = gout13;
+            eri_tensor[18*naux + 2] = gout14;
+            eri_tensor[20*naux + 0] = gout15;
+            eri_tensor[21*naux + 1] = gout16;
+            eri_tensor[22*naux + 2] = gout17;
+            eri_tensor[24*naux + 0] = gout18;
+            eri_tensor[25*naux + 1] = gout19;
+            eri_tensor[26*naux + 2] = gout20;
+            eri_tensor[28*naux + 0] = gout21;
+            eri_tensor[29*naux + 1] = gout22;
+            eri_tensor[30*naux + 2] = gout23;
+            eri_tensor[32*naux + 0] = gout24;
+            eri_tensor[33*naux + 1] = gout25;
+            eri_tensor[34*naux + 2] = gout26;
+            break;
+            case 1:
+            eri_tensor[0*naux + 1] = gout0;
+            eri_tensor[1*naux + 2] = gout1;
+            eri_tensor[3*naux + 0] = gout2;
+            eri_tensor[4*naux + 1] = gout3;
+            eri_tensor[5*naux + 2] = gout4;
+            eri_tensor[7*naux + 0] = gout5;
+            eri_tensor[8*naux + 1] = gout6;
+            eri_tensor[9*naux + 2] = gout7;
+            eri_tensor[11*naux + 0] = gout8;
+            eri_tensor[12*naux + 1] = gout9;
+            eri_tensor[13*naux + 2] = gout10;
+            eri_tensor[15*naux + 0] = gout11;
+            eri_tensor[16*naux + 1] = gout12;
+            eri_tensor[17*naux + 2] = gout13;
+            eri_tensor[19*naux + 0] = gout14;
+            eri_tensor[20*naux + 1] = gout15;
+            eri_tensor[21*naux + 2] = gout16;
+            eri_tensor[23*naux + 0] = gout17;
+            eri_tensor[24*naux + 1] = gout18;
+            eri_tensor[25*naux + 2] = gout19;
+            eri_tensor[27*naux + 0] = gout20;
+            eri_tensor[28*naux + 1] = gout21;
+            eri_tensor[29*naux + 2] = gout22;
+            eri_tensor[31*naux + 0] = gout23;
+            eri_tensor[32*naux + 1] = gout24;
+            eri_tensor[33*naux + 2] = gout25;
+            eri_tensor[35*naux + 0] = gout26;
+            break;
+            case 2:
+            eri_tensor[0*naux + 2] = gout0;
+            eri_tensor[2*naux + 0] = gout1;
+            eri_tensor[3*naux + 1] = gout2;
+            eri_tensor[4*naux + 2] = gout3;
+            eri_tensor[6*naux + 0] = gout4;
+            eri_tensor[7*naux + 1] = gout5;
+            eri_tensor[8*naux + 2] = gout6;
+            eri_tensor[10*naux + 0] = gout7;
+            eri_tensor[11*naux + 1] = gout8;
+            eri_tensor[12*naux + 2] = gout9;
+            eri_tensor[14*naux + 0] = gout10;
+            eri_tensor[15*naux + 1] = gout11;
+            eri_tensor[16*naux + 2] = gout12;
+            eri_tensor[18*naux + 0] = gout13;
+            eri_tensor[19*naux + 1] = gout14;
+            eri_tensor[20*naux + 2] = gout15;
+            eri_tensor[22*naux + 0] = gout16;
+            eri_tensor[23*naux + 1] = gout17;
+            eri_tensor[24*naux + 2] = gout18;
+            eri_tensor[26*naux + 0] = gout19;
+            eri_tensor[27*naux + 1] = gout20;
+            eri_tensor[28*naux + 2] = gout21;
+            eri_tensor[30*naux + 0] = gout22;
+            eri_tensor[31*naux + 1] = gout23;
+            eri_tensor[32*naux + 2] = gout24;
+            eri_tensor[34*naux + 0] = gout25;
+            eri_tensor[35*naux + 1] = gout26;
+            break;
+            case 3:
+            eri_tensor[1*naux + 0] = gout0;
+            eri_tensor[2*naux + 1] = gout1;
+            eri_tensor[3*naux + 2] = gout2;
+            eri_tensor[5*naux + 0] = gout3;
+            eri_tensor[6*naux + 1] = gout4;
+            eri_tensor[7*naux + 2] = gout5;
+            eri_tensor[9*naux + 0] = gout6;
+            eri_tensor[10*naux + 1] = gout7;
+            eri_tensor[11*naux + 2] = gout8;
+            eri_tensor[13*naux + 0] = gout9;
+            eri_tensor[14*naux + 1] = gout10;
+            eri_tensor[15*naux + 2] = gout11;
+            eri_tensor[17*naux + 0] = gout12;
+            eri_tensor[18*naux + 1] = gout13;
+            eri_tensor[19*naux + 2] = gout14;
+            eri_tensor[21*naux + 0] = gout15;
+            eri_tensor[22*naux + 1] = gout16;
+            eri_tensor[23*naux + 2] = gout17;
+            eri_tensor[25*naux + 0] = gout18;
+            eri_tensor[26*naux + 1] = gout19;
+            eri_tensor[27*naux + 2] = gout20;
+            eri_tensor[29*naux + 0] = gout21;
+            eri_tensor[30*naux + 1] = gout22;
+            eri_tensor[31*naux + 2] = gout23;
+            eri_tensor[33*naux + 0] = gout24;
+            eri_tensor[34*naux + 1] = gout25;
+            eri_tensor[35*naux + 2] = gout26;
+            break;
+            }
+        }
+    }
+}
+
+#if CUDA_VERSION >= 12040
+__global__ __maxnreg__(128)  // per-thread register cap, applied only when the CUDA_VERSION guard above passes
+#else
+__global__
+#endif
+void int3c2e_002(double *out, Int3c2eEnvVars envs, Int3c2eBounds bounds)  // Kernel: one thread per (shell-pair, k-shell) task, 6 outputs per task. NOTE(review): name suggests angular momenta (0,0,2) - confirm.
+{
+    int nst_per_block = blockDim.x;  // tasks processed concurrently by one block
+    int st_id = threadIdx.x;
+    int batch_id = blockIdx.x;
+    int iprim = bounds.iprim;
+    int jprim = bounds.jprim;
+    int kprim = bounds.kprim;
+    int ijprim = iprim * jprim;
+    int ijkprim = ijprim * kprim;
+    int *bas = envs.bas;
+    int nbas = envs.nbas;
+    double *env = envs.env;
+    double omega = env[PTR_RANGE_OMEGA];  // omega == 0, > 0 and < 0 select three different root/weight paths below
+    extern __shared__ double rw_cache[];
+    double *rw = rw_cache + st_id;  // this thread's column; consecutive entries are nst_per_block doubles apart
+
+    int nksh = bounds.nksh;
+    int nshl_pair = bounds.nshl_pair;
+    int nst = nksh * nshl_pair;  // total number of tasks
+    int st0 = batch_id * nst_per_block * BATCHES_PER_BLOCK;
+    int st1 = MIN(nst, st0 + nst_per_block * BATCHES_PER_BLOCK);
+    for (int ijk_idx = st0 + st_id; ijk_idx < st1; ijk_idx += nst_per_block) {  // threads past st1 just exit the loop; this body contains no __syncthreads()
+        int ksh_in_auxmol = ijk_idx % nksh;
+        int ksh = ksh_in_auxmol + bounds.ksh0;
+        int shl_pair_idx = ijk_idx / nksh;
+        int bas_ij = bounds.bas_ij_idx[shl_pair_idx];  // packed pair index: ish * nbas + jsh
+        int ish = bas_ij / nbas;
+        int jsh = bas_ij % nbas;
+        double *expi = env + bas[ish*BAS_SLOTS+PTR_EXP];
+        double *expj = env + bas[jsh*BAS_SLOTS+PTR_EXP];
+        double *expk = env + bas[ksh*BAS_SLOTS+PTR_EXP];
+        double *ci = env + bas[ish*BAS_SLOTS+PTR_COEFF];
+        double *cj = env + bas[jsh*BAS_SLOTS+PTR_COEFF];
+        double *ck = env + bas[ksh*BAS_SLOTS+PTR_COEFF];
+        double *ri = env + bas[ish*BAS_SLOTS+PTR_BAS_COORD];
+        double *rj = env + bas[jsh*BAS_SLOTS+PTR_BAS_COORD];
+        double *rk = env + bas[ksh*BAS_SLOTS+PTR_BAS_COORD];
+        double xi = ri[0];
+        double yi = ri[1];
+        double zi = ri[2];
+        double xk = rk[0];
+        double yk = rk[1];
+        double zk = rk[2];
+        double xjxi = rj[0] - xi;
+        double yjyi = rj[1] - yi;
+        double zjzi = rj[2] - zi;
+        double rr_ij = xjxi*xjxi + yjyi*yjyi + zjzi*zjzi;
+        double gout0 = 0;  // six accumulators, written to six consecutive output columns at the end
+        double gout1 = 0;
+        double gout2 = 0;
+        double gout3 = 0;
+        double gout4 = 0;
+        double gout5 = 0;
+        for (int ijkp = 0; ijkp < ijkprim; ++ijkp) {  // flattened loop over all (ip, jp, kp) primitive combinations
+            int ijp = ijkp / kprim;
+            int kp = ijkp % kprim;
+            int ip = ijp / jprim;
+            int jp = ijp % jprim;
+            double ai = expi[ip];
+            double aj = expj[jp];
+            double ak = expk[kp];
+            double aij = ai + aj;
+            double cijk = ci[ip] * cj[jp] * ck[kp];
+            double fac = PI_FAC * cijk / (aij*ak*sqrt(aij+ak));
+            double aj_aij = aj / aij;
+            double theta_ij = ai * aj_aij;
+            double Kab = theta_ij * rr_ij;
+            double fac1 = fac * exp(-Kab);  // per-primitive prefactor, carried by the y factor of every product below
+            double xij = xjxi * aj_aij + xi;  // (xij, yij, zij) = ri + (rj - ri) * aj / aij
+            double yij = yjyi * aj_aij + yi;
+            double zij = zjzi * aj_aij + zi;
+            double xpq = xij - xk;
+            double ypq = yij - yk;
+            double zpq = zij - zk;
+            double rr = xpq * xpq + ypq * ypq + zpq * zpq;
+            double theta = aij * ak / (aij + ak);
+            double theta_rr = theta * rr;
+            if (omega == 0) {
+                rys_roots(2, theta_rr, rw, nst_per_block, 0, 1);  // rys_roots is defined elsewhere; the loop below reads root i at rw[2*i*nst_per_block] and weight i at rw[(2*i+1)*nst_per_block]
+            } else if (omega > 0) {
+                double omega2 = omega * omega;
+                double theta_fac = omega2 / (omega2 + theta);
+                rys_roots(2, theta_fac*theta_rr, rw, nst_per_block, 0, 1);
+                double sqrt_theta_fac = sqrt(theta_fac);
+                for (int irys = 0; irys < 2; ++irys) {  // roots scaled by theta_fac, weights by sqrt(theta_fac)
+                    rw[ irys*2   *nst_per_block] *= theta_fac;
+                    rw[(irys*2+1)*nst_per_block] *= sqrt_theta_fac;
+                }
+            } else {
+                double omega2 = omega * omega;
+                double theta_fac = omega2 / (omega2 + theta);
+                double *rw1 = rw + 4*nst_per_block;  // second set of 2 (root, weight) pairs, stored right after the first
+                rys_roots(2, theta_rr, rw1, nst_per_block, 0, 1);
+                rys_roots(2, theta_fac*theta_rr, rw, nst_per_block, 0, 1);
+                double sqrt_theta_fac = -sqrt(theta_fac);  // negative sign: weights of the rescaled set enter with opposite sign
+                for (int irys = 0; irys < 2; ++irys) {
+                    rw[ irys*2   *nst_per_block] *= theta_fac;
+                    rw[(irys*2+1)*nst_per_block] *= sqrt_theta_fac;
+                }
+            }
+            for (int irys = 0; irys < bounds.nroots; ++irys) {  // NOTE(review): nroots is presumably 2 for omega >= 0 and 4 for omega < 0 - confirm with the launcher
+                double wt = rw[(2*irys+1)*nst_per_block];
+                double rt = rw[ 2*irys   *nst_per_block];
+                double rt_aa = rt / (aij + ak);
+                double rt_ak = rt_aa * aij;
+                double b01 = .5/ak * (1 - rt_ak);
+                double cpx = xpq*rt_ak;
+                double trr_01x = cpx * 1;  // x factor starts from 1, y factor from fac1, z factor from wt
+                double trr_02x = cpx * trr_01x + 1*b01 * 1;
+                gout0 += trr_02x * fac1 * wt;  // orders (x, y, z) = (2, 0, 0)
+                double cpy = ypq*rt_ak;
+                double trr_01y = cpy * fac1;
+                gout1 += trr_01x * trr_01y * wt;  // (1, 1, 0)
+                double cpz = zpq*rt_ak;
+                double trr_01z = cpz * wt;
+                gout2 += trr_01x * fac1 * trr_01z;  // (1, 0, 1)
+                double trr_02y = cpy * trr_01y + 1*b01 * fac1;
+                gout3 += 1 * trr_02y * wt;  // (0, 2, 0)
+                gout4 += 1 * trr_01y * trr_01z;  // (0, 1, 1)
+                double trr_02z = cpz * trr_01z + 1*b01 * wt;
+                gout5 += 1 * fac1 * trr_02z;  // (0, 0, 2)
+            }
+        }
+        int naux = bounds.naux;
+        double *eri_tensor = out + shl_pair_idx * 1 * naux + ksh_in_auxmol * 6;  // one row of length naux per shell pair, 6 consecutive columns per k shell
+        eri_tensor[0*naux + 0] = gout0;
+        eri_tensor[0*naux + 1] = gout1;
+        eri_tensor[0*naux + 2] = gout2;
+        eri_tensor[0*naux + 3] = gout3;
+        eri_tensor[0*naux + 4] = gout4;
+        eri_tensor[0*naux + 5] = gout5;
+    }
+}
+
+#if CUDA_VERSION >= 12040
+__global__ __maxnreg__(128) // CUDA >= 12.4 build: additionally requests a cap of 128 registers per thread
+#else
+__global__
+#endif
+void int3c2e_102(double *out, Int3c2eEnvVars envs, Int3c2eBounds bounds) // writes a 3 x 6 block of accumulated values into out for every (shell pair, aux shell) task; name suggests (li,lj,lk) = (1,0,2) -- TODO confirm
+{
+    int nst_per_block = blockDim.x; // tasks processed side by side in a block: one per thread
+    int st_id = threadIdx.x;
+    int batch_id = blockIdx.x;
+    int iprim = bounds.iprim;
+    int jprim = bounds.jprim;
+    int kprim = bounds.kprim;
+    int ijprim = iprim * jprim;
+    int ijkprim = ijprim * kprim; // total number of primitive triples (ip, jp, kp)
+    int *bas = envs.bas;
+    int nbas = envs.nbas;
+    double *env = envs.env;
+    double omega = env[PTR_RANGE_OMEGA]; // its sign selects one of the three root/weight branches below
+    extern __shared__ double rw_cache[]; // dynamically sized shared buffer; NOTE(review): size is chosen by the launcher, not visible here
+    double *rw = rw_cache + st_id; // this thread's entries are rw[n*nst_per_block]
+
+    int nksh = bounds.nksh;
+    int nshl_pair = bounds.nshl_pair;
+    int nst = nksh * nshl_pair; // total number of (shell pair, aux shell) tasks
+    int st0 = batch_id * nst_per_block * BATCHES_PER_BLOCK; // first task owned by this block
+    int st1 = MIN(nst, st0 + nst_per_block * BATCHES_PER_BLOCK); // one past the last task owned by this block
+    for (int ijk_idx = st0 + st_id; ijk_idx < st1; ijk_idx += nst_per_block) { // each thread takes every nst_per_block-th task of the block's range
+        int ksh_in_auxmol = ijk_idx % nksh; // the aux-shell index varies fastest within ijk_idx
+        int ksh = ksh_in_auxmol + bounds.ksh0;
+        int shl_pair_idx = ijk_idx / nksh;
+        int bas_ij = bounds.bas_ij_idx[shl_pair_idx]; // packed pair, decoded below as ish*nbas + jsh
+        int ish = bas_ij / nbas;
+        int jsh = bas_ij % nbas;
+        double *expi = env + bas[ish*BAS_SLOTS+PTR_EXP];
+        double *expj = env + bas[jsh*BAS_SLOTS+PTR_EXP];
+        double *expk = env + bas[ksh*BAS_SLOTS+PTR_EXP];
+        double *ci = env + bas[ish*BAS_SLOTS+PTR_COEFF];
+        double *cj = env + bas[jsh*BAS_SLOTS+PTR_COEFF];
+        double *ck = env + bas[ksh*BAS_SLOTS+PTR_COEFF];
+        double *ri = env + bas[ish*BAS_SLOTS+PTR_BAS_COORD];
+        double *rj = env + bas[jsh*BAS_SLOTS+PTR_BAS_COORD];
+        double *rk = env + bas[ksh*BAS_SLOTS+PTR_BAS_COORD];
+        double xi = ri[0];
+        double yi = ri[1];
+        double zi = ri[2];
+        double xk = rk[0];
+        double yk = rk[1];
+        double zk = rk[2];
+        double xjxi = rj[0] - xi; // components of rj - ri
+        double yjyi = rj[1] - yi;
+        double zjzi = rj[2] - zi;
+        double rr_ij = xjxi*xjxi + yjyi*yjyi + zjzi*zjzi; // |rj - ri|^2
+        double gout0 = 0; // gout0..gout17 accumulate over all primitive triples and quadrature points
+        double gout1 = 0;
+        double gout2 = 0;
+        double gout3 = 0;
+        double gout4 = 0;
+        double gout5 = 0;
+        double gout6 = 0;
+        double gout7 = 0;
+        double gout8 = 0;
+        double gout9 = 0;
+        double gout10 = 0;
+        double gout11 = 0;
+        double gout12 = 0;
+        double gout13 = 0;
+        double gout14 = 0;
+        double gout15 = 0;
+        double gout16 = 0;
+        double gout17 = 0;
+        for (int ijkp = 0; ijkp < ijkprim; ++ijkp) { // flattened primitive loop: kp fastest, then jp, then ip
+            int ijp = ijkp / kprim;
+            int kp = ijkp % kprim;
+            int ip = ijp / jprim;
+            int jp = ijp % jprim;
+            double ai = expi[ip];
+            double aj = expj[jp];
+            double ak = expk[kp];
+            double aij = ai + aj;
+            double cijk = ci[ip] * cj[jp] * ck[kp]; // product of the three coefficients for this primitive triple
+            double fac = PI_FAC * cijk / (aij*ak*sqrt(aij+ak));
+            double aj_aij = aj / aij;
+            double theta_ij = ai * aj_aij;
+            double Kab = theta_ij * rr_ij;
+            double fac1 = fac * exp(-Kab); // prefactor damped by exp(-ai*aj/aij * |rj - ri|^2)
+            double xij = xjxi * aj_aij + xi; // equals (ai*xi + aj*xj)/aij: exponent-weighted centre of the i/j primitives
+            double yij = yjyi * aj_aij + yi;
+            double zij = zjzi * aj_aij + zi;
+            double xpq = xij - xk; // weighted i/j centre minus the aux-shell centre rk
+            double ypq = yij - yk;
+            double zpq = zij - zk;
+            double rr = xpq * xpq + ypq * ypq + zpq * zpq;
+            double theta = aij * ak / (aij + ak);
+            double theta_rr = theta * rr; // argument passed to rys_roots
+            if (omega == 0) { // omega == 0: the 2 roots/weights are used exactly as rys_roots stores them
+                rys_roots(2, theta_rr, rw, nst_per_block, 0, 1);
+            } else if (omega > 0) { // omega > 0: roots taken at the reduced argument, then rescaled below
+                double omega2 = omega * omega;
+                double theta_fac = omega2 / (omega2 + theta);
+                rys_roots(2, theta_fac*theta_rr, rw, nst_per_block, 0, 1);
+                double sqrt_theta_fac = sqrt(theta_fac);
+                for (int irys = 0; irys < 2; ++irys) {
+                    rw[ irys*2   *nst_per_block] *= theta_fac; // root slot
+                    rw[(irys*2+1)*nst_per_block] *= sqrt_theta_fac; // weight slot
+                }
+            } else { // omega < 0: an unscaled set at rw1 plus a rescaled set at rw whose weights are negated
+                double omega2 = omega * omega;
+                double theta_fac = omega2 / (omega2 + theta);
+                double *rw1 = rw + 4*nst_per_block; // second set starts right after the 2 root/weight pairs of the first
+                rys_roots(2, theta_rr, rw1, nst_per_block, 0, 1);
+                rys_roots(2, theta_fac*theta_rr, rw, nst_per_block, 0, 1);
+                double sqrt_theta_fac = -sqrt(theta_fac); // negative sign makes the rescaled set subtract in the sum below
+                for (int irys = 0; irys < 2; ++irys) {
+                    rw[ irys*2   *nst_per_block] *= theta_fac;
+                    rw[(irys*2+1)*nst_per_block] *= sqrt_theta_fac;
+                }
+            }
+            for (int irys = 0; irys < bounds.nroots; ++irys) { // NOTE(review): nroots is presumably 2 for omega >= 0 and 4 for omega < 0 (both sets) -- confirm in the launcher
+                double wt = rw[(2*irys+1)*nst_per_block]; // weights sit in the odd slots
+                double rt = rw[ 2*irys   *nst_per_block]; // roots sit in the even slots
+                double rt_aa = rt / (aij + ak);
+                double b00 = .5 * rt_aa;
+                double rt_ak = rt_aa * aij;
+                double b01 = .5/ak * (1 - rt_ak);
+                double cpx = xpq*rt_ak;
+                double rt_aij = rt_aa * ak;
+                double c0x = xjxi * aj_aij - xpq*rt_aij;
+                double trr_10x = c0x * 1; // trr_<a><b>{x,y,z}: 1D recurrence terms; a and b look like the powers on centres i and k -- TODO confirm
+                double trr_11x = cpx * trr_10x + 1*b00 * 1;
+                double trr_01x = cpx * 1;
+                double trr_12x = cpx * trr_11x + 1*b01 * trr_10x + 1*b00 * trr_01x;
+                gout0 += trr_12x * fac1 * wt; // a literal fac1 or wt stands in for the order-zero y or z term
+                double trr_02x = cpx * trr_01x + 1*b01 * 1;
+                double c0y = yjyi * aj_aij - ypq*rt_aij;
+                double trr_10y = c0y * fac1; // every y term carries the prefactor fac1
+                gout1 += trr_02x * trr_10y * wt;
+                double c0z = zjzi * aj_aij - zpq*rt_aij;
+                double trr_10z = c0z * wt; // every z term carries the quadrature weight wt
+                gout2 += trr_02x * fac1 * trr_10z;
+                double cpy = ypq*rt_ak;
+                double trr_01y = cpy * fac1;
+                gout3 += trr_11x * trr_01y * wt;
+                double trr_11y = cpy * trr_10y + 1*b00 * fac1;
+                gout4 += trr_01x * trr_11y * wt;
+                gout5 += trr_01x * trr_01y * trr_10z;
+                double cpz = zpq*rt_ak;
+                double trr_01z = cpz * wt;
+                gout6 += trr_11x * fac1 * trr_01z;
+                gout7 += trr_01x * trr_10y * trr_01z;
+                double trr_11z = cpz * trr_10z + 1*b00 * wt;
+                gout8 += trr_01x * fac1 * trr_11z;
+                double trr_02y = cpy * trr_01y + 1*b01 * fac1;
+                gout9 += trr_10x * trr_02y * wt;
+                double trr_12y = cpy * trr_11y + 1*b01 * trr_10y + 1*b00 * trr_01y;
+                gout10 += 1 * trr_12y * wt;
+                gout11 += 1 * trr_02y * trr_10z;
+                gout12 += trr_10x * trr_01y * trr_01z;
+                gout13 += 1 * trr_11y * trr_01z;
+                gout14 += 1 * trr_01y * trr_11z;
+                double trr_02z = cpz * trr_01z + 1*b01 * wt;
+                gout15 += trr_10x * fac1 * trr_02z;
+                gout16 += 1 * trr_10y * trr_02z;
+                double trr_12z = cpz * trr_11z + 1*b01 * trr_10z + 1*b00 * trr_01z;
+                gout17 += 1 * fac1 * trr_12z;
+            }
+        }
+        int naux = bounds.naux;
+        double *eri_tensor = out + shl_pair_idx * 3 * naux + ksh_in_auxmol * 6; // 3 rows of stride naux per shell pair; 6 consecutive columns per aux shell
+        eri_tensor[0*naux + 0] = gout0; // mapping used below: gout[3*col + row] -> eri_tensor[row*naux + col]
+        eri_tensor[0*naux + 1] = gout3;
+        eri_tensor[0*naux + 2] = gout6;
+        eri_tensor[0*naux + 3] = gout9;
+        eri_tensor[0*naux + 4] = gout12;
+        eri_tensor[0*naux + 5] = gout15;
+        eri_tensor[1*naux + 0] = gout1;
+        eri_tensor[1*naux + 1] = gout4;
+        eri_tensor[1*naux + 2] = gout7;
+        eri_tensor[1*naux + 3] = gout10;
+        eri_tensor[1*naux + 4] = gout13;
+        eri_tensor[1*naux + 5] = gout16;
+        eri_tensor[2*naux + 0] = gout2;
+        eri_tensor[2*naux + 1] = gout5;
+        eri_tensor[2*naux + 2] = gout8;
+        eri_tensor[2*naux + 3] = gout11;
+        eri_tensor[2*naux + 4] = gout14;
+        eri_tensor[2*naux + 5] = gout17;
+    }
+}
+
+__global__
+void int3c2e_112(double *out, Int3c2eEnvVars envs, Int3c2eBounds bounds) // writes a 9 x 6 block of accumulated values into out for every (shell pair, aux shell) task; name suggests (li,lj,lk) = (1,1,2) -- TODO confirm
+{
+    int nst_per_block = blockDim.x; // tasks processed side by side in a block: one per thread
+    int st_id = threadIdx.x;
+    int batch_id = blockIdx.x;
+    int iprim = bounds.iprim;
+    int jprim = bounds.jprim;
+    int kprim = bounds.kprim;
+    int ijprim = iprim * jprim;
+    int ijkprim = ijprim * kprim; // total number of primitive triples (ip, jp, kp)
+    int *bas = envs.bas;
+    int nbas = envs.nbas;
+    double *env = envs.env;
+    double omega = env[PTR_RANGE_OMEGA]; // its sign selects one of the three root/weight branches below
+    extern __shared__ double rw_cache[]; // dynamically sized shared buffer; NOTE(review): size is chosen by the launcher, not visible here
+    double *rw = rw_cache + st_id; // this thread's entries are rw[n*nst_per_block]
+
+    int nksh = bounds.nksh;
+    int nshl_pair = bounds.nshl_pair;
+    int nst = nksh * nshl_pair; // total number of (shell pair, aux shell) tasks
+    int st0 = batch_id * nst_per_block * BATCHES_PER_BLOCK; // first task owned by this block
+    int st1 = MIN(nst, st0 + nst_per_block * BATCHES_PER_BLOCK); // one past the last task owned by this block
+    for (int ijk_idx = st0 + st_id; ijk_idx < st1; ijk_idx += nst_per_block) { // each thread takes every nst_per_block-th task of the block's range
+        int ksh_in_auxmol = ijk_idx % nksh; // the aux-shell index varies fastest within ijk_idx
+        int ksh = ksh_in_auxmol + bounds.ksh0;
+        int shl_pair_idx = ijk_idx / nksh;
+        int bas_ij = bounds.bas_ij_idx[shl_pair_idx]; // packed pair, decoded below as ish*nbas + jsh
+        int ish = bas_ij / nbas;
+        int jsh = bas_ij % nbas;
+        double *expi = env + bas[ish*BAS_SLOTS+PTR_EXP];
+        double *expj = env + bas[jsh*BAS_SLOTS+PTR_EXP];
+        double *expk = env + bas[ksh*BAS_SLOTS+PTR_EXP];
+        double *ci = env + bas[ish*BAS_SLOTS+PTR_COEFF];
+        double *cj = env + bas[jsh*BAS_SLOTS+PTR_COEFF];
+        double *ck = env + bas[ksh*BAS_SLOTS+PTR_COEFF];
+        double *ri = env + bas[ish*BAS_SLOTS+PTR_BAS_COORD];
+        double *rj = env + bas[jsh*BAS_SLOTS+PTR_BAS_COORD];
+        double *rk = env + bas[ksh*BAS_SLOTS+PTR_BAS_COORD];
+        double xi = ri[0];
+        double yi = ri[1];
+        double zi = ri[2];
+        double xk = rk[0];
+        double yk = rk[1];
+        double zk = rk[2];
+        double xjxi = rj[0] - xi; // components of rj - ri
+        double yjyi = rj[1] - yi;
+        double zjzi = rj[2] - zi;
+        double rr_ij = xjxi*xjxi + yjyi*yjyi + zjzi*zjzi; // |rj - ri|^2
+        double gout0 = 0; // gout0..gout53 accumulate over all primitive triples and quadrature points
+        double gout1 = 0;
+        double gout2 = 0;
+        double gout3 = 0;
+        double gout4 = 0;
+        double gout5 = 0;
+        double gout6 = 0;
+        double gout7 = 0;
+        double gout8 = 0;
+        double gout9 = 0;
+        double gout10 = 0;
+        double gout11 = 0;
+        double gout12 = 0;
+        double gout13 = 0;
+        double gout14 = 0;
+        double gout15 = 0;
+        double gout16 = 0;
+        double gout17 = 0;
+        double gout18 = 0;
+        double gout19 = 0;
+        double gout20 = 0;
+        double gout21 = 0;
+        double gout22 = 0;
+        double gout23 = 0;
+        double gout24 = 0;
+        double gout25 = 0;
+        double gout26 = 0;
+        double gout27 = 0;
+        double gout28 = 0;
+        double gout29 = 0;
+        double gout30 = 0;
+        double gout31 = 0;
+        double gout32 = 0;
+        double gout33 = 0;
+        double gout34 = 0;
+        double gout35 = 0;
+        double gout36 = 0;
+        double gout37 = 0;
+        double gout38 = 0;
+        double gout39 = 0;
+        double gout40 = 0;
+        double gout41 = 0;
+        double gout42 = 0;
+        double gout43 = 0;
+        double gout44 = 0;
+        double gout45 = 0;
+        double gout46 = 0;
+        double gout47 = 0;
+        double gout48 = 0;
+        double gout49 = 0;
+        double gout50 = 0;
+        double gout51 = 0;
+        double gout52 = 0;
+        double gout53 = 0;
+        for (int ijkp = 0; ijkp < ijkprim; ++ijkp) { // flattened primitive loop: kp fastest, then jp, then ip
+            int ijp = ijkp / kprim;
+            int kp = ijkp % kprim;
+            int ip = ijp / jprim;
+            int jp = ijp % jprim;
+            double ai = expi[ip];
+            double aj = expj[jp];
+            double ak = expk[kp];
+            double aij = ai + aj;
+            double cijk = ci[ip] * cj[jp] * ck[kp]; // product of the three coefficients for this primitive triple
+            double fac = PI_FAC * cijk / (aij*ak*sqrt(aij+ak));
+            double aj_aij = aj / aij;
+            double theta_ij = ai * aj_aij;
+            double Kab = theta_ij * rr_ij;
+            double fac1 = fac * exp(-Kab); // prefactor damped by exp(-ai*aj/aij * |rj - ri|^2)
+            double xij = xjxi * aj_aij + xi; // equals (ai*xi + aj*xj)/aij: exponent-weighted centre of the i/j primitives
+            double yij = yjyi * aj_aij + yi;
+            double zij = zjzi * aj_aij + zi;
+            double xpq = xij - xk; // weighted i/j centre minus the aux-shell centre rk
+            double ypq = yij - yk;
+            double zpq = zij - zk;
+            double rr = xpq * xpq + ypq * ypq + zpq * zpq;
+            double theta = aij * ak / (aij + ak);
+            double theta_rr = theta * rr; // argument passed to rys_roots
+            if (omega == 0) { // omega == 0: the 3 roots/weights are used exactly as rys_roots stores them
+                rys_roots(3, theta_rr, rw, nst_per_block, 0, 1);
+            } else if (omega > 0) { // omega > 0: roots taken at the reduced argument, then rescaled below
+                double omega2 = omega * omega;
+                double theta_fac = omega2 / (omega2 + theta);
+                rys_roots(3, theta_fac*theta_rr, rw, nst_per_block, 0, 1);
+                double sqrt_theta_fac = sqrt(theta_fac);
+                for (int irys = 0; irys < 3; ++irys) {
+                    rw[ irys*2   *nst_per_block] *= theta_fac; // root slot
+                    rw[(irys*2+1)*nst_per_block] *= sqrt_theta_fac; // weight slot
+                }
+            } else { // omega < 0: an unscaled set at rw1 plus a rescaled set at rw whose weights are negated
+                double omega2 = omega * omega;
+                double theta_fac = omega2 / (omega2 + theta);
+                double *rw1 = rw + 6*nst_per_block; // second set starts right after the 3 root/weight pairs of the first
+                rys_roots(3, theta_rr, rw1, nst_per_block, 0, 1);
+                rys_roots(3, theta_fac*theta_rr, rw, nst_per_block, 0, 1);
+                double sqrt_theta_fac = -sqrt(theta_fac); // negative sign makes the rescaled set subtract in the sum below
+                for (int irys = 0; irys < 3; ++irys) {
+                    rw[ irys*2   *nst_per_block] *= theta_fac;
+                    rw[(irys*2+1)*nst_per_block] *= sqrt_theta_fac;
+                }
+            }
+            for (int irys = 0; irys < bounds.nroots; ++irys) { // NOTE(review): nroots is presumably 3 for omega >= 0 and 6 for omega < 0 (both sets) -- confirm in the launcher
+                double wt = rw[(2*irys+1)*nst_per_block]; // weights sit in the odd slots
+                double rt = rw[ 2*irys   *nst_per_block]; // roots sit in the even slots
+                double rt_aa = rt / (aij + ak);
+                double b00 = .5 * rt_aa;
+                double rt_ak = rt_aa * aij;
+                double b01 = .5/ak * (1 - rt_ak);
+                double cpx = xpq*rt_ak;
+                double rt_aij = rt_aa * ak;
+                double b10 = .5/aij * (1 - rt_aij);
+                double c0x = xjxi * aj_aij - xpq*rt_aij;
+                double trr_10x = c0x * 1; // trr_<a><b>{x,y,z}: 1D recurrence terms; a and b look like the powers on centres i and k -- TODO confirm
+                double trr_20x = c0x * trr_10x + 1*b10 * 1;
+                double trr_21x = cpx * trr_20x + 2*b00 * trr_10x;
+                double trr_11x = cpx * trr_10x + 1*b00 * 1;
+                double trr_22x = cpx * trr_21x + 1*b01 * trr_20x + 2*b00 * trr_11x;
+                double trr_01x = cpx * 1;
+                double trr_12x = cpx * trr_11x + 1*b01 * trr_10x + 1*b00 * trr_01x;
+                double hrr_112x = trr_22x - xjxi * trr_12x; // hrr_<a>1<c> = trr_<a+1><c> - (rj - ri) * trr_<a><c>; middle digit presumably the power on centre j -- TODO confirm
+                gout0 += hrr_112x * fac1 * wt; // a literal fac1 or wt stands in for the order-zero y or z term
+                double trr_02x = cpx * trr_01x + 1*b01 * 1;
+                double hrr_012x = trr_12x - xjxi * trr_02x;
+                double c0y = yjyi * aj_aij - ypq*rt_aij;
+                double trr_10y = c0y * fac1; // every y term carries the prefactor fac1
+                gout1 += hrr_012x * trr_10y * wt;
+                double c0z = zjzi * aj_aij - zpq*rt_aij;
+                double trr_10z = c0z * wt; // every z term carries the quadrature weight wt
+                gout2 += hrr_012x * fac1 * trr_10z;
+                double hrr_010y = trr_10y - yjyi * fac1;
+                gout3 += trr_12x * hrr_010y * wt;
+                double trr_20y = c0y * trr_10y + 1*b10 * fac1;
+                double hrr_110y = trr_20y - yjyi * trr_10y;
+                gout4 += trr_02x * hrr_110y * wt;
+                gout5 += trr_02x * hrr_010y * trr_10z;
+                double hrr_010z = trr_10z - zjzi * wt;
+                gout6 += trr_12x * fac1 * hrr_010z;
+                gout7 += trr_02x * trr_10y * hrr_010z;
+                double trr_20z = c0z * trr_10z + 1*b10 * wt;
+                double hrr_110z = trr_20z - zjzi * trr_10z;
+                gout8 += trr_02x * fac1 * hrr_110z;
+                double hrr_111x = trr_21x - xjxi * trr_11x;
+                double cpy = ypq*rt_ak;
+                double trr_01y = cpy * fac1;
+                gout9 += hrr_111x * trr_01y * wt;
+                double hrr_011x = trr_11x - xjxi * trr_01x;
+                double trr_11y = cpy * trr_10y + 1*b00 * fac1;
+                gout10 += hrr_011x * trr_11y * wt;
+                gout11 += hrr_011x * trr_01y * trr_10z;
+                double hrr_011y = trr_11y - yjyi * trr_01y;
+                gout12 += trr_11x * hrr_011y * wt;
+                double trr_21y = cpy * trr_20y + 2*b00 * trr_10y;
+                double hrr_111y = trr_21y - yjyi * trr_11y;
+                gout13 += trr_01x * hrr_111y * wt;
+                gout14 += trr_01x * hrr_011y * trr_10z;
+                gout15 += trr_11x * trr_01y * hrr_010z;
+                gout16 += trr_01x * trr_11y * hrr_010z;
+                gout17 += trr_01x * trr_01y * hrr_110z;
+                double cpz = zpq*rt_ak;
+                double trr_01z = cpz * wt;
+                gout18 += hrr_111x * fac1 * trr_01z;
+                gout19 += hrr_011x * trr_10y * trr_01z;
+                double trr_11z = cpz * trr_10z + 1*b00 * wt;
+                gout20 += hrr_011x * fac1 * trr_11z;
+                gout21 += trr_11x * hrr_010y * trr_01z;
+                gout22 += trr_01x * hrr_110y * trr_01z;
+                gout23 += trr_01x * hrr_010y * trr_11z;
+                double hrr_011z = trr_11z - zjzi * trr_01z;
+                gout24 += trr_11x * fac1 * hrr_011z;
+                gout25 += trr_01x * trr_10y * hrr_011z;
+                double trr_21z = cpz * trr_20z + 2*b00 * trr_10z;
+                double hrr_111z = trr_21z - zjzi * trr_11z;
+                gout26 += trr_01x * fac1 * hrr_111z;
+                double hrr_110x = trr_20x - xjxi * trr_10x;
+                double trr_02y = cpy * trr_01y + 1*b01 * fac1;
+                gout27 += hrr_110x * trr_02y * wt;
+                double hrr_010x = trr_10x - xjxi * 1;
+                double trr_12y = cpy * trr_11y + 1*b01 * trr_10y + 1*b00 * trr_01y;
+                gout28 += hrr_010x * trr_12y * wt;
+                gout29 += hrr_010x * trr_02y * trr_10z;
+                double hrr_012y = trr_12y - yjyi * trr_02y;
+                gout30 += trr_10x * hrr_012y * wt;
+                double trr_22y = cpy * trr_21y + 1*b01 * trr_20y + 2*b00 * trr_11y;
+                double hrr_112y = trr_22y - yjyi * trr_12y;
+                gout31 += 1 * hrr_112y * wt;
+                gout32 += 1 * hrr_012y * trr_10z;
+                gout33 += trr_10x * trr_02y * hrr_010z;
+                gout34 += 1 * trr_12y * hrr_010z;
+                gout35 += 1 * trr_02y * hrr_110z;
+                gout36 += hrr_110x * trr_01y * trr_01z;
+                gout37 += hrr_010x * trr_11y * trr_01z;
+                gout38 += hrr_010x * trr_01y * trr_11z;
+                gout39 += trr_10x * hrr_011y * trr_01z;
+                gout40 += 1 * hrr_111y * trr_01z;
+                gout41 += 1 * hrr_011y * trr_11z;
+                gout42 += trr_10x * trr_01y * hrr_011z;
+                gout43 += 1 * trr_11y * hrr_011z;
+                gout44 += 1 * trr_01y * hrr_111z;
+                double trr_02z = cpz * trr_01z + 1*b01 * wt;
+                gout45 += hrr_110x * fac1 * trr_02z;
+                gout46 += hrr_010x * trr_10y * trr_02z;
+                double trr_12z = cpz * trr_11z + 1*b01 * trr_10z + 1*b00 * trr_01z;
+                gout47 += hrr_010x * fac1 * trr_12z;
+                gout48 += trr_10x * hrr_010y * trr_02z;
+                gout49 += 1 * hrr_110y * trr_02z;
+                gout50 += 1 * hrr_010y * trr_12z;
+                double hrr_012z = trr_12z - zjzi * trr_02z;
+                gout51 += trr_10x * fac1 * hrr_012z;
+                gout52 += 1 * trr_10y * hrr_012z;
+                double trr_22z = cpz * trr_21z + 1*b01 * trr_20z + 2*b00 * trr_11z;
+                double hrr_112z = trr_22z - zjzi * trr_12z;
+                gout53 += 1 * fac1 * hrr_112z;
+            }
+        }
+        int naux = bounds.naux;
+        double *eri_tensor = out + shl_pair_idx * 9 * naux + ksh_in_auxmol * 6; // 9 rows of stride naux per shell pair; 6 consecutive columns per aux shell
+        eri_tensor[0*naux + 0] = gout0; // mapping used below: gout[9*col + row] -> eri_tensor[row*naux + col]
+        eri_tensor[0*naux + 1] = gout9;
+        eri_tensor[0*naux + 2] = gout18;
+        eri_tensor[0*naux + 3] = gout27;
+        eri_tensor[0*naux + 4] = gout36;
+        eri_tensor[0*naux + 5] = gout45;
+        eri_tensor[1*naux + 0] = gout1;
+        eri_tensor[1*naux + 1] = gout10;
+        eri_tensor[1*naux + 2] = gout19;
+        eri_tensor[1*naux + 3] = gout28;
+        eri_tensor[1*naux + 4] = gout37;
+        eri_tensor[1*naux + 5] = gout46;
+        eri_tensor[2*naux + 0] = gout2;
+        eri_tensor[2*naux + 1] = gout11;
+        eri_tensor[2*naux + 2] = gout20;
+        eri_tensor[2*naux + 3] = gout29;
+        eri_tensor[2*naux + 4] = gout38;
+        eri_tensor[2*naux + 5] = gout47;
+        eri_tensor[3*naux + 0] = gout3;
+        eri_tensor[3*naux + 1] = gout12;
+        eri_tensor[3*naux + 2] = gout21;
+        eri_tensor[3*naux + 3] = gout30;
+        eri_tensor[3*naux + 4] = gout39;
+        eri_tensor[3*naux + 5] = gout48;
+        eri_tensor[4*naux + 0] = gout4;
+        eri_tensor[4*naux + 1] = gout13;
+        eri_tensor[4*naux + 2] = gout22;
+        eri_tensor[4*naux + 3] = gout31;
+        eri_tensor[4*naux + 4] = gout40;
+        eri_tensor[4*naux + 5] = gout49;
+        eri_tensor[5*naux + 0] = gout5;
+        eri_tensor[5*naux + 1] = gout14;
+        eri_tensor[5*naux + 2] = gout23;
+        eri_tensor[5*naux + 3] = gout32;
+        eri_tensor[5*naux + 4] = gout41;
+        eri_tensor[5*naux + 5] = gout50;
+        eri_tensor[6*naux + 0] = gout6;
+        eri_tensor[6*naux + 1] = gout15;
+        eri_tensor[6*naux + 2] = gout24;
+        eri_tensor[6*naux + 3] = gout33;
+        eri_tensor[6*naux + 4] = gout42;
+        eri_tensor[6*naux + 5] = gout51;
+        eri_tensor[7*naux + 0] = gout7;
+        eri_tensor[7*naux + 1] = gout16;
+        eri_tensor[7*naux + 2] = gout25;
+        eri_tensor[7*naux + 3] = gout34;
+        eri_tensor[7*naux + 4] = gout43;
+        eri_tensor[7*naux + 5] = gout52;
+        eri_tensor[8*naux + 0] = gout8;
+        eri_tensor[8*naux + 1] = gout17;
+        eri_tensor[8*naux + 2] = gout26;
+        eri_tensor[8*naux + 3] = gout35;
+        eri_tensor[8*naux + 4] = gout44;
+        eri_tensor[8*naux + 5] = gout53;
+    }
+}
+
+__global__
+void int3c2e_202(double *out, Int3c2eEnvVars envs, Int3c2eBounds bounds)
+{
+    int nst_per_block = blockDim.x;
+    int st_id = threadIdx.x;
+    int batch_id = blockIdx.x;
+    int iprim = bounds.iprim;
+    int jprim = bounds.jprim;
+    int kprim = bounds.kprim;
+    int ijprim = iprim * jprim;
+    int ijkprim = ijprim * kprim;
+    int *bas = envs.bas;
+    int nbas = envs.nbas;
+    double *env = envs.env;
+    double omega = env[PTR_RANGE_OMEGA];
+    extern __shared__ double rw_cache[];
+    double *rw = rw_cache + st_id;
+
+    int nksh = bounds.nksh;
+    int nshl_pair = bounds.nshl_pair;
+    int nst = nksh * nshl_pair;
+    int st0 = batch_id * nst_per_block * BATCHES_PER_BLOCK;
+    int st1 = MIN(nst, st0 + nst_per_block * BATCHES_PER_BLOCK);
+    for (int ijk_idx = st0 + st_id; ijk_idx < st1; ijk_idx += nst_per_block) {
+        int ksh_in_auxmol = ijk_idx % nksh;
+        int ksh = ksh_in_auxmol + bounds.ksh0;
+        int shl_pair_idx = ijk_idx / nksh;
+        int bas_ij = bounds.bas_ij_idx[shl_pair_idx];
+        int ish = bas_ij / nbas;
+        int jsh = bas_ij % nbas;
+        double *expi = env + bas[ish*BAS_SLOTS+PTR_EXP];
+        double *expj = env + bas[jsh*BAS_SLOTS+PTR_EXP];
+        double *expk = env + bas[ksh*BAS_SLOTS+PTR_EXP];
+        double *ci = env + bas[ish*BAS_SLOTS+PTR_COEFF];
+        double *cj = env + bas[jsh*BAS_SLOTS+PTR_COEFF];
+        double *ck = env + bas[ksh*BAS_SLOTS+PTR_COEFF];
+        double *ri = env + bas[ish*BAS_SLOTS+PTR_BAS_COORD];
+        double *rj = env + bas[jsh*BAS_SLOTS+PTR_BAS_COORD];
+        double *rk = env + bas[ksh*BAS_SLOTS+PTR_BAS_COORD];
+        double xi = ri[0];
+        double yi = ri[1];
+        double zi = ri[2];
+        double xk = rk[0];
+        double yk = rk[1];
+        double zk = rk[2];
+        double xjxi = rj[0] - xi;
+        double yjyi = rj[1] - yi;
+        double zjzi = rj[2] - zi;
+        double rr_ij = xjxi*xjxi + yjyi*yjyi + zjzi*zjzi;
+        double gout0 = 0;
+        double gout1 = 0;
+        double gout2 = 0;
+        double gout3 = 0;
+        double gout4 = 0;
+        double gout5 = 0;
+        double gout6 = 0;
+        double gout7 = 0;
+        double gout8 = 0;
+        double gout9 = 0;
+        double gout10 = 0;
+        double gout11 = 0;
+        double gout12 = 0;
+        double gout13 = 0;
+        double gout14 = 0;
+        double gout15 = 0;
+        double gout16 = 0;
+        double gout17 = 0;
+        double gout18 = 0;
+        double gout19 = 0;
+        double gout20 = 0;
+        double gout21 = 0;
+        double gout22 = 0;
+        double gout23 = 0;
+        double gout24 = 0;
+        double gout25 = 0;
+        double gout26 = 0;
+        double gout27 = 0;
+        double gout28 = 0;
+        double gout29 = 0;
+        double gout30 = 0;
+        double gout31 = 0;
+        double gout32 = 0;
+        double gout33 = 0;
+        double gout34 = 0;
+        double gout35 = 0;
+        for (int ijkp = 0; ijkp < ijkprim; ++ijkp) {
+            int ijp = ijkp / kprim;
+            int kp = ijkp % kprim;
+            int ip = ijp / jprim;
+            int jp = ijp % jprim;
+            double ai = expi[ip];
+            double aj = expj[jp];
+            double ak = expk[kp];
+            double aij = ai + aj;
+            double cijk = ci[ip] * cj[jp] * ck[kp];
+            double fac = PI_FAC * cijk / (aij*ak*sqrt(aij+ak));
+            double aj_aij = aj / aij;
+            double theta_ij = ai * aj_aij;
+            double Kab = theta_ij * rr_ij;
+            double fac1 = fac * exp(-Kab);
+            double xij = xjxi * aj_aij + xi;
+            double yij = yjyi * aj_aij + yi;
+            double zij = zjzi * aj_aij + zi;
+            double xpq = xij - xk;
+            double ypq = yij - yk;
+            double zpq = zij - zk;
+            double rr = xpq * xpq + ypq * ypq + zpq * zpq;
+            double theta = aij * ak / (aij + ak);
+            double theta_rr = theta * rr;
+            if (omega == 0) {
+                rys_roots(3, theta_rr, rw, nst_per_block, 0, 1);
+            } else if (omega > 0) {
+                double omega2 = omega * omega;
+                double theta_fac = omega2 / (omega2 + theta);
+                rys_roots(3, theta_fac*theta_rr, rw, nst_per_block, 0, 1);
+                double sqrt_theta_fac = sqrt(theta_fac);
+                for (int irys = 0; irys < 3; ++irys) {
+                    rw[ irys*2   *nst_per_block] *= theta_fac;
+                    rw[(irys*2+1)*nst_per_block] *= sqrt_theta_fac;
+                }
+            } else {
+                double omega2 = omega * omega;
+                double theta_fac = omega2 / (omega2 + theta);
+                double *rw1 = rw + 6*nst_per_block;
+                rys_roots(3, theta_rr, rw1, nst_per_block, 0, 1);
+                rys_roots(3, theta_fac*theta_rr, rw, nst_per_block, 0, 1);
+                double sqrt_theta_fac = -sqrt(theta_fac);
+                for (int irys = 0; irys < 3; ++irys) {
+                    rw[ irys*2   *nst_per_block] *= theta_fac;
+                    rw[(irys*2+1)*nst_per_block] *= sqrt_theta_fac;
+                }
+            }
+            for (int irys = 0; irys < bounds.nroots; ++irys) {
+                double wt = rw[(2*irys+1)*nst_per_block];
+                double rt = rw[ 2*irys   *nst_per_block];
+                double rt_aa = rt / (aij + ak);
+                double b00 = .5 * rt_aa;
+                double rt_ak = rt_aa * aij;
+                double b01 = .5/ak * (1 - rt_ak);
+                double cpx = xpq*rt_ak;
+                double rt_aij = rt_aa * ak;
+                double b10 = .5/aij * (1 - rt_aij);
+                double c0x = xjxi * aj_aij - xpq*rt_aij;
+                double trr_10x = c0x * 1;
+                double trr_20x = c0x * trr_10x + 1*b10 * 1;
+                double trr_21x = cpx * trr_20x + 2*b00 * trr_10x;
+                double trr_11x = cpx * trr_10x + 1*b00 * 1;
+                double trr_22x = cpx * trr_21x + 1*b01 * trr_20x + 2*b00 * trr_11x;
+                gout0 += trr_22x * fac1 * wt;
+                double trr_01x = cpx * 1;
+                double trr_12x = cpx * trr_11x + 1*b01 * trr_10x + 1*b00 * trr_01x;
+                double c0y = yjyi * aj_aij - ypq*rt_aij;
+                double trr_10y = c0y * fac1;
+                gout1 += trr_12x * trr_10y * wt;
+                double c0z = zjzi * aj_aij - zpq*rt_aij;
+                double trr_10z = c0z * wt;
+                gout2 += trr_12x * fac1 * trr_10z;
+                double trr_02x = cpx * trr_01x + 1*b01 * 1;
+                double trr_20y = c0y * trr_10y + 1*b10 * fac1;
+                gout3 += trr_02x * trr_20y * wt;
+                gout4 += trr_02x * trr_10y * trr_10z;
+                double trr_20z = c0z * trr_10z + 1*b10 * wt;
+                gout5 += trr_02x * fac1 * trr_20z;
+                double cpy = ypq*rt_ak;
+                double trr_01y = cpy * fac1;
+                gout6 += trr_21x * trr_01y * wt;
+                double trr_11y = cpy * trr_10y + 1*b00 * fac1;
+                gout7 += trr_11x * trr_11y * wt;
+                gout8 += trr_11x * trr_01y * trr_10z;
+                double trr_21y = cpy * trr_20y + 2*b00 * trr_10y;
+                gout9 += trr_01x * trr_21y * wt;
+                gout10 += trr_01x * trr_11y * trr_10z;
+                gout11 += trr_01x * trr_01y * trr_20z;
+                double cpz = zpq*rt_ak;
+                double trr_01z = cpz * wt;
+                gout12 += trr_21x * fac1 * trr_01z;
+                gout13 += trr_11x * trr_10y * trr_01z;
+                double trr_11z = cpz * trr_10z + 1*b00 * wt;
+                gout14 += trr_11x * fac1 * trr_11z;
+                gout15 += trr_01x * trr_20y * trr_01z;
+                gout16 += trr_01x * trr_10y * trr_11z;
+                double trr_21z = cpz * trr_20z + 2*b00 * trr_10z;
+                gout17 += trr_01x * fac1 * trr_21z;
+                double trr_02y = cpy * trr_01y + 1*b01 * fac1;
+                gout18 += trr_20x * trr_02y * wt;
+                double trr_12y = cpy * trr_11y + 1*b01 * trr_10y + 1*b00 * trr_01y;
+                gout19 += trr_10x * trr_12y * wt;
+                gout20 += trr_10x * trr_02y * trr_10z;
+                double trr_22y = cpy * trr_21y + 1*b01 * trr_20y + 2*b00 * trr_11y;
+                gout21 += 1 * trr_22y * wt;
+                gout22 += 1 * trr_12y * trr_10z;
+                gout23 += 1 * trr_02y * trr_20z;
+                gout24 += trr_20x * trr_01y * trr_01z;
+                gout25 += trr_10x * trr_11y * trr_01z;
+                gout26 += trr_10x * trr_01y * trr_11z;
+                gout27 += 1 * trr_21y * trr_01z;
+                gout28 += 1 * trr_11y * trr_11z;
+                gout29 += 1 * trr_01y * trr_21z;
+                double trr_02z = cpz * trr_01z + 1*b01 * wt;
+                gout30 += trr_20x * fac1 * trr_02z;
+                gout31 += trr_10x * trr_10y * trr_02z;
+                double trr_12z = cpz * trr_11z + 1*b01 * trr_10z + 1*b00 * trr_01z;
+                gout32 += trr_10x * fac1 * trr_12z;
+                gout33 += 1 * trr_20y * trr_02z;
+                gout34 += 1 * trr_10y * trr_12z;
+                double trr_22z = cpz * trr_21z + 1*b01 * trr_20z + 2*b00 * trr_11z;
+                gout35 += 1 * fac1 * trr_22z;
+            }
+        }
+        int naux = bounds.naux;
+        double *eri_tensor = out + shl_pair_idx * 6 * naux + ksh_in_auxmol * 6;
+        eri_tensor[0*naux + 0] = gout0;
+        eri_tensor[0*naux + 1] = gout6;
+        eri_tensor[0*naux + 2] = gout12;
+        eri_tensor[0*naux + 3] = gout18;
+        eri_tensor[0*naux + 4] = gout24;
+        eri_tensor[0*naux + 5] = gout30;
+        eri_tensor[1*naux + 0] = gout1;
+        eri_tensor[1*naux + 1] = gout7;
+        eri_tensor[1*naux + 2] = gout13;
+        eri_tensor[1*naux + 3] = gout19;
+        eri_tensor[1*naux + 4] = gout25;
+        eri_tensor[1*naux + 5] = gout31;
+        eri_tensor[2*naux + 0] = gout2;
+        eri_tensor[2*naux + 1] = gout8;
+        eri_tensor[2*naux + 2] = gout14;
+        eri_tensor[2*naux + 3] = gout20;
+        eri_tensor[2*naux + 4] = gout26;
+        eri_tensor[2*naux + 5] = gout32;
+        eri_tensor[3*naux + 0] = gout3;
+        eri_tensor[3*naux + 1] = gout9;
+        eri_tensor[3*naux + 2] = gout15;
+        eri_tensor[3*naux + 3] = gout21;
+        eri_tensor[3*naux + 4] = gout27;
+        eri_tensor[3*naux + 5] = gout33;
+        eri_tensor[4*naux + 0] = gout4;
+        eri_tensor[4*naux + 1] = gout10;
+        eri_tensor[4*naux + 2] = gout16;
+        eri_tensor[4*naux + 3] = gout22;
+        eri_tensor[4*naux + 4] = gout28;
+        eri_tensor[4*naux + 5] = gout34;
+        eri_tensor[5*naux + 0] = gout5;
+        eri_tensor[5*naux + 1] = gout11;
+        eri_tensor[5*naux + 2] = gout17;
+        eri_tensor[5*naux + 3] = gout23;
+        eri_tensor[5*naux + 4] = gout29;
+        eri_tensor[5*naux + 5] = gout35;
+    }
+}
+
+__global__
+void int3c2e_212(double *out, Int3c2eEnvVars envs, Int3c2eBounds bounds) // per (shell pair, aux shell) task, 4 threads accumulate an 18 x 6 tile of out
+{
+    int st_id = threadIdx.x; // task slot inside the block; shared arrays are indexed with stride 64 per task
+    int gout_id = threadIdx.y; // selects which 27 of the 108 tile elements this thread accumulates (cases 0..3 below)
+    int batch_id = blockIdx.x;
+    int iprim = bounds.iprim;
+    int jprim = bounds.jprim;
+    int kprim = bounds.kprim;
+    int ijprim = iprim * jprim;
+    int ijkprim = ijprim * kprim;
+    int nroots = bounds.nroots;
+    int *bas = envs.bas;
+    int nbas = envs.nbas;
+    double *env = envs.env;
+    double omega = env[PTR_RANGE_OMEGA]; // its sign selects one of the three rys_roots branches below
+    extern __shared__ double rw_cache[]; // dynamic shared memory, sized by the launcher
+    double *rw = rw_cache + st_id; // per-task view; consecutive entries of one task are 64 doubles apart
+    double *gx = rw + nroots * 128; // rw spans nroots*2 entries of stride 64
+    double *gy = gx + 1152; // 1152 = 18 entries * 64 tasks per direction
+    double *gz = gy + 1152;
+    double *Rpq = gz + 1152; // 3 entries (x, y, z) of stride 64
+    double *rjri = Rpq + 192; // 4 entries: x, y, z differences and their squared norm
+    if (gout_id == 0) {
+        gx[0] = 1.; // seed of the x recurrence; zeroed below for padding tasks
+    }
+
+    int nksh = bounds.nksh;
+    int nshl_pair = bounds.nshl_pair;
+    int nst = nksh * nshl_pair;
+    int st0 = batch_id * 64 * BATCHES_PER_BLOCK;
+    int st1 = MIN(nst, st0 + 64 * BATCHES_PER_BLOCK);
+    for (int ijk_idx = st0+st_id; ijk_idx < st1+st_id; ijk_idx += 64) { // trip count does not depend on st_id, so every thread reaches the barriers in the body
+        int ksh_in_auxmol = ijk_idx % nksh;
+        int ksh = ksh_in_auxmol + bounds.ksh0;
+        int shl_pair_idx = ijk_idx / nksh;
+        __syncthreads();
+        if (ijk_idx >= nst) { // padding task past the end of the work list
+            shl_pair_idx = st0 / nksh; // fall back to the shell pair of this block's first task
+            if (gout_id == 0) {
+                gx[0] = 0.; // every x term is a multiple of gx[0], so padding tasks only accumulate zeros
+            }
+        }
+        int bas_ij = bounds.bas_ij_idx[shl_pair_idx];
+        int ish = bas_ij / nbas;
+        int jsh = bas_ij % nbas;
+        double *expi = env + bas[ish*BAS_SLOTS+PTR_EXP];
+        double *expj = env + bas[jsh*BAS_SLOTS+PTR_EXP];
+        double *expk = env + bas[ksh*BAS_SLOTS+PTR_EXP];
+        double *ci = env + bas[ish*BAS_SLOTS+PTR_COEFF];
+        double *cj = env + bas[jsh*BAS_SLOTS+PTR_COEFF];
+        double *ck = env + bas[ksh*BAS_SLOTS+PTR_COEFF];
+        double *ri = env + bas[ish*BAS_SLOTS+PTR_BAS_COORD];
+        double *rj = env + bas[jsh*BAS_SLOTS+PTR_BAS_COORD];
+        double *rk = env + bas[ksh*BAS_SLOTS+PTR_BAS_COORD];
+        if (gout_id == 0) { // one thread per task fills the shared rjri entries
+            double xjxi = rj[0] - ri[0];
+            double yjyi = rj[1] - ri[1];
+            double zjzi = rj[2] - ri[2];
+            double rr_ij = xjxi*xjxi + yjyi*yjyi + zjzi*zjzi;
+            rjri[0] = xjxi;
+            rjri[64] = yjyi;
+            rjri[128] = zjzi;
+            rjri[192] = rr_ij;
+        }
+        double gout0 = 0;
+        double gout1 = 0;
+        double gout2 = 0;
+        double gout3 = 0;
+        double gout4 = 0;
+        double gout5 = 0;
+        double gout6 = 0;
+        double gout7 = 0;
+        double gout8 = 0;
+        double gout9 = 0;
+        double gout10 = 0;
+        double gout11 = 0;
+        double gout12 = 0;
+        double gout13 = 0;
+        double gout14 = 0;
+        double gout15 = 0;
+        double gout16 = 0;
+        double gout17 = 0;
+        double gout18 = 0;
+        double gout19 = 0;
+        double gout20 = 0;
+        double gout21 = 0;
+        double gout22 = 0;
+        double gout23 = 0;
+        double gout24 = 0;
+        double gout25 = 0;
+        double gout26 = 0;
+        double s0, s1, s2; // scratch values for the recurrences
+        for (int ijkp = 0; ijkp < ijkprim; ++ijkp) { // flattened loop over primitive triples (ip, jp, kp)
+            int ijp = ijkp / kprim;
+            int kp = ijkp % kprim;
+            int ip = ijp / jprim;
+            int jp = ijp % jprim;
+            double ai = expi[ip];
+            double aj = expj[jp];
+            double ak = expk[kp];
+            double aij = ai + aj;
+            double aj_aij = aj / aij;
+            __syncthreads(); // barrier before rjri is read and gy[0]/Rpq are overwritten
+            double xij = rjri[0] * aj_aij + ri[0];
+            double yij = rjri[64] * aj_aij + ri[1];
+            double zij = rjri[128] * aj_aij + ri[2];
+            double xpq = xij - rk[0];
+            double ypq = yij - rk[1];
+            double zpq = zij - rk[2];
+            if (gout_id == 0) {
+                double cijk = ci[ip] * cj[jp] * ck[kp];
+                double fac = PI_FAC * cijk / (aij*ak*sqrt(aij+ak));
+                double theta_ij = ai * aj_aij;
+                double Kab = theta_ij * rjri[192];
+                gy[0] = fac * exp(-Kab); // seed of the y recurrence carries the primitive prefactor
+                Rpq[0] = xpq;
+                Rpq[64] = ypq;
+                Rpq[128] = zpq;
+            }
+            double rr = xpq * xpq + ypq * ypq + zpq * zpq;
+            double theta = aij * ak / (aij + ak);
+            double theta_rr = theta * rr;
+            if (omega == 0) {
+                rys_roots(3, theta_rr, rw, 64, gout_id, 4); // presumably: 3 roots, stride 64, thread gout_id of 4 -- confirm against rys_roots
+            } else if (omega > 0) { // roots taken at theta_fac*theta_rr, then rescaled in place
+                double omega2 = omega * omega;
+                double theta_fac = omega2 / (omega2 + theta);
+                rys_roots(3, theta_fac*theta_rr, rw, 64, gout_id, 4);
+                __syncthreads();
+                double sqrt_theta_fac = sqrt(theta_fac);
+                for (int irys = gout_id; irys < 3; irys+=4) {
+                    rw[ irys*2   *64] *= theta_fac; // even slot: read later as rt
+                    rw[(irys*2+1)*64] *= sqrt_theta_fac; // odd slot: read later into gz[0]
+                }
+            } else { // omega < 0: two sets, unscaled in rw1 and rescaled with a negated odd slot in rw
+                double omega2 = omega * omega;
+                double theta_fac = omega2 / (omega2 + theta);
+                double *rw1 = rw + 384; // 384 = 3 roots * 2 entries * 64 tasks
+                rys_roots(3, theta_rr, rw1, 64, gout_id, 4);
+                rys_roots(3, theta_fac*theta_rr, rw, 64, gout_id, 4);
+                __syncthreads();
+                double sqrt_theta_fac = -sqrt(theta_fac); // sign flip applies to the odd slots of the rescaled set
+                for (int irys = gout_id; irys < 3; irys+=4) {
+                    rw[ irys*2   *64] *= theta_fac;
+                    rw[(irys*2+1)*64] *= sqrt_theta_fac;
+                }
+            }
+            for (int irys = 0; irys < nroots; ++irys) {
+                __syncthreads();
+                double rt = rw[irys*128]; // even slot of pair irys
+                double rt_aa = rt / (aij + ak);
+                double rt_aij = rt_aa * ak;
+                double b10 = .5/aij * (1 - rt_aij);
+                double rt_ak = rt_aa * aij;
+                double b00 = .5 * rt_aa;
+                double b01 = .5/ak * (1 - rt_ak);
+                for (int n = gout_id; n < 3; n += 4) { // n = 0, 1, 2 selects gx, gy, gz; only gout_id 0..2 enter
+                    if (n == 2) {
+                        gz[0] = rw[irys*128+64]; // seed of the z recurrence is the odd slot of pair irys
+                    }
+                    double *_gx = gx + n * 1152;
+                    double xjxi = rjri[n * 64];
+                    double Rpa = xjxi * aj_aij;
+                    double c0x = Rpa - rt_aij * Rpq[n * 64];
+                    s0 = _gx[0];
+                    s1 = c0x * s0;
+                    _gx[64] = s1;
+                    s2 = c0x * s1 + 1 * b10 * s0;
+                    _gx[128] = s2;
+                    s0 = s1;
+                    s1 = s2;
+                    s2 = c0x * s1 + 2 * b10 * s0;
+                    _gx[192] = s2;
+                    double cpx = rt_ak * Rpq[n * 64];
+                    s0 = _gx[0];
+                    s1 = cpx * s0;
+                    _gx[384] = s1;
+                    s2 = cpx*s1 + 1 * b01 *s0;
+                    _gx[768] = s2;
+                    s0 = _gx[64];
+                    s1 = cpx * s0;
+                    s1 += 1 * b00 * _gx[0];
+                    _gx[448] = s1;
+                    s2 = cpx*s1 + 1 * b01 *s0;
+                    s2 += 1 * b00 * _gx[384];
+                    _gx[832] = s2;
+                    s0 = _gx[128];
+                    s1 = cpx * s0;
+                    s1 += 2 * b00 * _gx[64];
+                    _gx[512] = s1;
+                    s2 = cpx*s1 + 1 * b01 *s0;
+                    s2 += 2 * b00 * _gx[448];
+                    _gx[896] = s2;
+                    s0 = _gx[192];
+                    s1 = cpx * s0;
+                    s1 += 3 * b00 * _gx[128];
+                    _gx[576] = s1;
+                    s2 = cpx*s1 + 1 * b01 *s0;
+                    s2 += 3 * b00 * _gx[512];
+                    _gx[960] = s2;
+                    s1 = _gx[192]; // from here on neighbouring entries are combined as s1 - xjxi*s0; slots 192, 576, 960 are overwritten last
+                    s0 = _gx[128];
+                    _gx[320] = s1 - xjxi * s0;
+                    s1 = s0;
+                    s0 = _gx[64];
+                    _gx[256] = s1 - xjxi * s0;
+                    s1 = s0;
+                    s0 = _gx[0];
+                    _gx[192] = s1 - xjxi * s0;
+                    s1 = _gx[576];
+                    s0 = _gx[512];
+                    _gx[704] = s1 - xjxi * s0;
+                    s1 = s0;
+                    s0 = _gx[448];
+                    _gx[640] = s1 - xjxi * s0;
+                    s1 = s0;
+                    s0 = _gx[384];
+                    _gx[576] = s1 - xjxi * s0;
+                    s1 = _gx[960];
+                    s0 = _gx[896];
+                    _gx[1088] = s1 - xjxi * s0;
+                    s1 = s0;
+                    s0 = _gx[832];
+                    _gx[1024] = s1 - xjxi * s0;
+                    s1 = s0;
+                    s0 = _gx[768];
+                    _gx[960] = s1 - xjxi * s0;
+                }
+                __syncthreads(); // gx, gy and gz are complete before any thread forms products
+                switch (gout_id) { // each thread adds its own 27 gx*gy*gz products
+                case 0:
+                gout0 += gx[1088] * gy[0] * gz[0];
+                gout1 += gx[320] * gy[384] * gz[384];
+                gout2 += gx[640] * gy[64] * gz[384];
+                gout3 += gx[1024] * gy[0] * gz[64];
+                gout4 += gx[256] * gy[384] * gz[448];
+                gout5 += gx[576] * gy[128] * gz[384];
+                gout6 += gx[960] * gy[64] * gz[64];
+                gout7 += gx[192] * gy[448] * gz[448];
+                gout8 += gx[576] * gy[0] * gz[512];
+                gout9 += gx[896] * gy[192] * gz[0];
+                gout10 += gx[128] * gy[576] * gz[384];
+                gout11 += gx[448] * gy[256] * gz[384];
+                gout12 += gx[832] * gy[192] * gz[64];
+                gout13 += gx[64] * gy[576] * gz[448];
+                gout14 += gx[384] * gy[320] * gz[384];
+                gout15 += gx[768] * gy[256] * gz[64];
+                gout16 += gx[0] * gy[640] * gz[448];
+                gout17 += gx[384] * gy[192] * gz[512];
+                gout18 += gx[896] * gy[0] * gz[192];
+                gout19 += gx[128] * gy[384] * gz[576];
+                gout20 += gx[448] * gy[64] * gz[576];
+                gout21 += gx[832] * gy[0] * gz[256];
+                gout22 += gx[64] * gy[384] * gz[640];
+                gout23 += gx[384] * gy[128] * gz[576];
+                gout24 += gx[768] * gy[64] * gz[256];
+                gout25 += gx[0] * gy[448] * gz[640];
+                gout26 += gx[384] * gy[0] * gz[704];
+                break;
+                case 1:
+                gout0 += gx[704] * gy[384] * gz[0];
+                gout1 += gx[320] * gy[0] * gz[768];
+                gout2 += gx[256] * gy[832] * gz[0];
+                gout3 += gx[640] * gy[384] * gz[64];
+                gout4 += gx[256] * gy[0] * gz[832];
+                gout5 += gx[192] * gy[896] * gz[0];
+                gout6 += gx[576] * gy[448] * gz[64];
+                gout7 += gx[192] * gy[64] * gz[832];
+                gout8 += gx[192] * gy[768] * gz[128];
+                gout9 += gx[512] * gy[576] * gz[0];
+                gout10 += gx[128] * gy[192] * gz[768];
+                gout11 += gx[64] * gy[1024] * gz[0];
+                gout12 += gx[448] * gy[576] * gz[64];
+                gout13 += gx[64] * gy[192] * gz[832];
+                gout14 += gx[0] * gy[1088] * gz[0];
+                gout15 += gx[384] * gy[640] * gz[64];
+                gout16 += gx[0] * gy[256] * gz[832];
+                gout17 += gx[0] * gy[960] * gz[128];
+                gout18 += gx[512] * gy[384] * gz[192];
+                gout19 += gx[128] * gy[0] * gz[960];
+                gout20 += gx[64] * gy[832] * gz[192];
+                gout21 += gx[448] * gy[384] * gz[256];
+                gout22 += gx[64] * gy[0] * gz[1024];
+                gout23 += gx[0] * gy[896] * gz[192];
+                gout24 += gx[384] * gy[448] * gz[256];
+                gout25 += gx[0] * gy[64] * gz[1024];
+                gout26 += gx[0] * gy[768] * gz[320];
+                break;
+                case 2:
+                gout0 += gx[704] * gy[0] * gz[384];
+                gout1 += gx[1024] * gy[64] * gz[0];
+                gout2 += gx[256] * gy[448] * gz[384];
+                gout3 += gx[640] * gy[0] * gz[448];
+                gout4 += gx[960] * gy[128] * gz[0];
+                gout5 += gx[192] * gy[512] * gz[384];
+                gout6 += gx[576] * gy[64] * gz[448];
+                gout7 += gx[960] * gy[0] * gz[128];
+                gout8 += gx[192] * gy[384] * gz[512];
+                gout9 += gx[512] * gy[192] * gz[384];
+                gout10 += gx[832] * gy[256] * gz[0];
+                gout11 += gx[64] * gy[640] * gz[384];
+                gout12 += gx[448] * gy[192] * gz[448];
+                gout13 += gx[768] * gy[320] * gz[0];
+                gout14 += gx[0] * gy[704] * gz[384];
+                gout15 += gx[384] * gy[256] * gz[448];
+                gout16 += gx[768] * gy[192] * gz[128];
+                gout17 += gx[0] * gy[576] * gz[512];
+                gout18 += gx[512] * gy[0] * gz[576];
+                gout19 += gx[832] * gy[64] * gz[192];
+                gout20 += gx[64] * gy[448] * gz[576];
+                gout21 += gx[448] * gy[0] * gz[640];
+                gout22 += gx[768] * gy[128] * gz[192];
+                gout23 += gx[0] * gy[512] * gz[576];
+                gout24 += gx[384] * gy[64] * gz[640];
+                gout25 += gx[768] * gy[0] * gz[320];
+                gout26 += gx[0] * gy[384] * gz[704];
+                break;
+                case 3:
+                gout0 += gx[320] * gy[768] * gz[0];
+                gout1 += gx[640] * gy[448] * gz[0];
+                gout2 += gx[256] * gy[64] * gz[768];
+                gout3 += gx[256] * gy[768] * gz[64];
+                gout4 += gx[576] * gy[512] * gz[0];
+                gout5 += gx[192] * gy[128] * gz[768];
+                gout6 += gx[192] * gy[832] * gz[64];
+                gout7 += gx[576] * gy[384] * gz[128];
+                gout8 += gx[192] * gy[0] * gz[896];
+                gout9 += gx[128] * gy[960] * gz[0];
+                gout10 += gx[448] * gy[640] * gz[0];
+                gout11 += gx[64] * gy[256] * gz[768];
+                gout12 += gx[64] * gy[960] * gz[64];
+                gout13 += gx[384] * gy[704] * gz[0];
+                gout14 += gx[0] * gy[320] * gz[768];
+                gout15 += gx[0] * gy[1024] * gz[64];
+                gout16 += gx[384] * gy[576] * gz[128];
+                gout17 += gx[0] * gy[192] * gz[896];
+                gout18 += gx[128] * gy[768] * gz[192];
+                gout19 += gx[448] * gy[448] * gz[192];
+                gout20 += gx[64] * gy[64] * gz[960];
+                gout21 += gx[64] * gy[768] * gz[256];
+                gout22 += gx[384] * gy[512] * gz[192];
+                gout23 += gx[0] * gy[128] * gz[960];
+                gout24 += gx[0] * gy[832] * gz[256];
+                gout25 += gx[384] * gy[384] * gz[320];
+                gout26 += gx[0] * gy[0] * gz[1088];
+                break;
+                }
+            }
+        }
+        if (ijk_idx < nst) { // padding tasks store nothing
+            int naux = bounds.naux;
+            double *eri_tensor = out + shl_pair_idx * 18 * naux + ksh_in_auxmol * 6; // 18 rows of length naux per shell pair, 6 columns per aux shell
+            switch (gout_id) { // thread g stores goutK at flat tile index 4*K + g, i.e. row (4*K+g)/6, column (4*K+g)%6
+            case 0:
+            eri_tensor[0*naux + 0] = gout0;
+            eri_tensor[0*naux + 4] = gout1;
+            eri_tensor[1*naux + 2] = gout2;
+            eri_tensor[2*naux + 0] = gout3;
+            eri_tensor[2*naux + 4] = gout4;
+            eri_tensor[3*naux + 2] = gout5;
+            eri_tensor[4*naux + 0] = gout6;
+            eri_tensor[4*naux + 4] = gout7;
+            eri_tensor[5*naux + 2] = gout8;
+            eri_tensor[6*naux + 0] = gout9;
+            eri_tensor[6*naux + 4] = gout10;
+            eri_tensor[7*naux + 2] = gout11;
+            eri_tensor[8*naux + 0] = gout12;
+            eri_tensor[8*naux + 4] = gout13;
+            eri_tensor[9*naux + 2] = gout14;
+            eri_tensor[10*naux + 0] = gout15;
+            eri_tensor[10*naux + 4] = gout16;
+            eri_tensor[11*naux + 2] = gout17;
+            eri_tensor[12*naux + 0] = gout18;
+            eri_tensor[12*naux + 4] = gout19;
+            eri_tensor[13*naux + 2] = gout20;
+            eri_tensor[14*naux + 0] = gout21;
+            eri_tensor[14*naux + 4] = gout22;
+            eri_tensor[15*naux + 2] = gout23;
+            eri_tensor[16*naux + 0] = gout24;
+            eri_tensor[16*naux + 4] = gout25;
+            eri_tensor[17*naux + 2] = gout26;
+            break;
+            case 1:
+            eri_tensor[0*naux + 1] = gout0;
+            eri_tensor[0*naux + 5] = gout1;
+            eri_tensor[1*naux + 3] = gout2;
+            eri_tensor[2*naux + 1] = gout3;
+            eri_tensor[2*naux + 5] = gout4;
+            eri_tensor[3*naux + 3] = gout5;
+            eri_tensor[4*naux + 1] = gout6;
+            eri_tensor[4*naux + 5] = gout7;
+            eri_tensor[5*naux + 3] = gout8;
+            eri_tensor[6*naux + 1] = gout9;
+            eri_tensor[6*naux + 5] = gout10;
+            eri_tensor[7*naux + 3] = gout11;
+            eri_tensor[8*naux + 1] = gout12;
+            eri_tensor[8*naux + 5] = gout13;
+            eri_tensor[9*naux + 3] = gout14;
+            eri_tensor[10*naux + 1] = gout15;
+            eri_tensor[10*naux + 5] = gout16;
+            eri_tensor[11*naux + 3] = gout17;
+            eri_tensor[12*naux + 1] = gout18;
+            eri_tensor[12*naux + 5] = gout19;
+            eri_tensor[13*naux + 3] = gout20;
+            eri_tensor[14*naux + 1] = gout21;
+            eri_tensor[14*naux + 5] = gout22;
+            eri_tensor[15*naux + 3] = gout23;
+            eri_tensor[16*naux + 1] = gout24;
+            eri_tensor[16*naux + 5] = gout25;
+            eri_tensor[17*naux + 3] = gout26;
+            break;
+            case 2:
+            eri_tensor[0*naux + 2] = gout0;
+            eri_tensor[1*naux + 0] = gout1;
+            eri_tensor[1*naux + 4] = gout2;
+            eri_tensor[2*naux + 2] = gout3;
+            eri_tensor[3*naux + 0] = gout4;
+            eri_tensor[3*naux + 4] = gout5;
+            eri_tensor[4*naux + 2] = gout6;
+            eri_tensor[5*naux + 0] = gout7;
+            eri_tensor[5*naux + 4] = gout8;
+            eri_tensor[6*naux + 2] = gout9;
+            eri_tensor[7*naux + 0] = gout10;
+            eri_tensor[7*naux + 4] = gout11;
+            eri_tensor[8*naux + 2] = gout12;
+            eri_tensor[9*naux + 0] = gout13;
+            eri_tensor[9*naux + 4] = gout14;
+            eri_tensor[10*naux + 2] = gout15;
+            eri_tensor[11*naux + 0] = gout16;
+            eri_tensor[11*naux + 4] = gout17;
+            eri_tensor[12*naux + 2] = gout18;
+            eri_tensor[13*naux + 0] = gout19;
+            eri_tensor[13*naux + 4] = gout20;
+            eri_tensor[14*naux + 2] = gout21;
+            eri_tensor[15*naux + 0] = gout22;
+            eri_tensor[15*naux + 4] = gout23;
+            eri_tensor[16*naux + 2] = gout24;
+            eri_tensor[17*naux + 0] = gout25;
+            eri_tensor[17*naux + 4] = gout26;
+            break;
+            case 3:
+            eri_tensor[0*naux + 3] = gout0;
+            eri_tensor[1*naux + 1] = gout1;
+            eri_tensor[1*naux + 5] = gout2;
+            eri_tensor[2*naux + 3] = gout3;
+            eri_tensor[3*naux + 1] = gout4;
+            eri_tensor[3*naux + 5] = gout5;
+            eri_tensor[4*naux + 3] = gout6;
+            eri_tensor[5*naux + 1] = gout7;
+            eri_tensor[5*naux + 5] = gout8;
+            eri_tensor[6*naux + 3] = gout9;
+            eri_tensor[7*naux + 1] = gout10;
+            eri_tensor[7*naux + 5] = gout11;
+            eri_tensor[8*naux + 3] = gout12;
+            eri_tensor[9*naux + 1] = gout13;
+            eri_tensor[9*naux + 5] = gout14;
+            eri_tensor[10*naux + 3] = gout15;
+            eri_tensor[11*naux + 1] = gout16;
+            eri_tensor[11*naux + 5] = gout17;
+            eri_tensor[12*naux + 3] = gout18;
+            eri_tensor[13*naux + 1] = gout19;
+            eri_tensor[13*naux + 5] = gout20;
+            eri_tensor[14*naux + 3] = gout21;
+            eri_tensor[15*naux + 1] = gout22;
+            eri_tensor[15*naux + 5] = gout23;
+            eri_tensor[16*naux + 3] = gout24;
+            eri_tensor[17*naux + 1] = gout25;
+            eri_tensor[17*naux + 5] = gout26;
+            break;
+            }
+        }
+    }
+}
+
+int int3c2e_unrolled(double *out, Int3c2eEnvVars *envs, Int3c2eBounds *bounds)
+{
+    // Host-side dispatcher for the unrolled int3c2e kernels.
+    //   out    - output buffer, handed to the kernel unchanged
+    //   envs   - environment struct, passed to the kernel by value
+    //   bounds - task description; li, lj and lk select the kernel
+    // Returns 1 after launching a specialised kernel, or 0 (nothing is
+    // launched) when no kernel is listed for this (li, lj, lk).
+    const int kernel_id = bounds->lk*25 + bounds->li*5 + bounds->lj;
+    const int ntasks = bounds->nksh * bounds->nshl_pair;
+
+    // Ids 37 (int3c2e_221) and 61 (int3c2e_212) run 64 tasks per block with
+    // 4 threads along y for each task; every other id uses 256 x 1 threads.
+    const bool split_gout = (kernel_id == 37 || kernel_id == 61);
+    int block_tasks = split_gout ? 64 : 256;
+    const int gout_threads = split_gout ? 4 : 1;
+
+#if CUDA_VERSION >= 12040
+    // From CUDA 12.4 on, the ids below double their tasks per block.
+    switch (kernel_id) {
+    case 0: case 5: case 6: case 10: case 11:
+    case 25: case 30: case 35: case 50: case 55:
+        block_tasks *= 2;
+        break;
+    }
+#endif
+
+    dim3 threads(block_tasks, gout_threads);
+    const int chunk = BATCHES_PER_BLOCK * block_tasks;
+    const int st_blocks = (ntasks + chunk - 1) / chunk;
+
+    // Dynamic shared memory, counted in doubles: nroots*2 slots per task,
+    // plus 3904 extra doubles for the two kernels that split their output.
+    int buflen = bounds->nroots*2 * block_tasks;
+    if (split_gout) {
+        buflen += 3904;
+    }
+
+    switch (kernel_id) {
+    // lk = 0
+    case 0:
+        int3c2e_000<<<st_blocks, threads, buflen*sizeof(double)>>>(out, *envs, *bounds); break;
+    case 5:
+        int3c2e_100<<<st_blocks, threads, buflen*sizeof(double)>>>(out, *envs, *bounds); break;
+    case 6:
+        int3c2e_110<<<st_blocks, threads, buflen*sizeof(double)>>>(out, *envs, *bounds); break;
+    case 10:
+        int3c2e_200<<<st_blocks, threads, buflen*sizeof(double)>>>(out, *envs, *bounds); break;
+    case 11:
+        int3c2e_210<<<st_blocks, threads, buflen*sizeof(double)>>>(out, *envs, *bounds); break;
+    case 12:
+        int3c2e_220<<<st_blocks, threads, buflen*sizeof(double)>>>(out, *envs, *bounds); break;
+    // lk = 1
+    case 25:
+        int3c2e_001<<<st_blocks, threads, buflen*sizeof(double)>>>(out, *envs, *bounds); break;
+    case 30:
+        int3c2e_101<<<st_blocks, threads, buflen*sizeof(double)>>>(out, *envs, *bounds); break;
+    case 31:
+        int3c2e_111<<<st_blocks, threads, buflen*sizeof(double)>>>(out, *envs, *bounds); break;
+    case 35:
+        int3c2e_201<<<st_blocks, threads, buflen*sizeof(double)>>>(out, *envs, *bounds); break;
+    case 36:
+        int3c2e_211<<<st_blocks, threads, buflen*sizeof(double)>>>(out, *envs, *bounds); break;
+    case 37:
+        int3c2e_221<<<st_blocks, threads, buflen*sizeof(double)>>>(out, *envs, *bounds); break;
+    // lk = 2
+    case 50:
+        int3c2e_002<<<st_blocks, threads, buflen*sizeof(double)>>>(out, *envs, *bounds); break;
+    case 55:
+        int3c2e_102<<<st_blocks, threads, buflen*sizeof(double)>>>(out, *envs, *bounds); break;
+    case 56:
+        int3c2e_112<<<st_blocks, threads, buflen*sizeof(double)>>>(out, *envs, *bounds); break;
+    case 60:
+        int3c2e_202<<<st_blocks, threads, buflen*sizeof(double)>>>(out, *envs, *bounds); break;
+    case 61:
+        int3c2e_212<<<st_blocks, threads, buflen*sizeof(double)>>>(out, *envs, *bounds); break;
+    default:
+        return 0;
+    }
+
+    // A kernel launch was issued; no error check is performed here.
+    return 1;
+}
diff --git a/gpu4pyscf/lib/gint-rys/unrolled_int3c2e_bdiv.cu b/gpu4pyscf/lib/gint-rys/unrolled_int3c2e_bdiv.cu
new file mode 100644
index 00000000..e58e4986
--- /dev/null
+++ b/gpu4pyscf/lib/gint-rys/unrolled_int3c2e_bdiv.cu
@@ -0,0 +1,4093 @@
+#include <stdio.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <cuda_runtime.h>
+
+#include "gvhf-rys/vhf.cuh"
+#include "gvhf-rys/rys_roots.cu"
+#include "int3c2e.cuh"
+
+__device__  // Device routine, variant 000: one output value per (shell pair, k shell) task (presumably li=lj=lk=0 — confirm with the generator)
+void int3c2e_bdiv_000(double *out, Int3c2eEnvVars envs, BDiv3c2eBounds bounds)
+{  // out: result buffer written with row stride bounds.naux; envs: bas/env/ao_loc tables; bounds: per-block offset tables
+    // For better load balance, consume blocks in the reversed order
+    int sp_block_id = gridDim.x - blockIdx.x - 1;  // shell-pair block index, from the x grid dimension
+    int ksh_block_id = gridDim.y - blockIdx.y - 1;  // k-shell block index, from the y grid dimension
+    int shl_pair0 = bounds.shl_pair_offsets[sp_block_id];  // [shl_pair0, shl_pair1): shell pairs handled by this block
+    int shl_pair1 = bounds.shl_pair_offsets[sp_block_id+1];
+    int ksh0 = bounds.ksh_offsets[ksh_block_id];  // [ksh0, ksh1): k shells handled by this block
+    int ksh1 = bounds.ksh_offsets[ksh_block_id+1];
+    int nksh = ksh1 - ksh0;
+    int nshl_pair = shl_pair1 - shl_pair0;
+    int bas_ij0 = bounds.bas_ij_idx[shl_pair0];  // packed pair index, decoded below as ish*nbas + jsh
+    int nbas = envs.nbas;
+    int ish0 = bas_ij0 / nbas;
+    int jsh0 = bas_ij0 % nbas;
+
+    int nst_per_block = blockDim.x;
+    int st_id = threadIdx.x;
+    int *bas = envs.bas;
+    int iprim = bas[ish0*BAS_SLOTS+NPRIM_OF];  // primitive counts are read once, from the block's first pair / first k shell,
+    int jprim = bas[jsh0*BAS_SLOTS+NPRIM_OF];  // and reused for every task below (assumes uniform contraction within a block — TODO confirm)
+    int kprim = bas[ksh0*BAS_SLOTS+NPRIM_OF];
+    int ijprim = iprim * jprim;
+    int ijkprim = ijprim * kprim;
+    int nroots = 1;
+    double *env = envs.env;
+    double omega = env[PTR_RANGE_OMEGA];
+    if (omega < 0) {
+        nroots *= 2;  // omega < 0 stores two root/weight sets per primitive (see the final else branch below)
+    }
+    extern __shared__ double rw_buffer[];
+    double *rw = rw_buffer + st_id;  // per-thread view of shared memory; successive entries are nst_per_block apart
+    double *rjri = rw + nst_per_block * nroots*2;  // four more per-thread slots, placed after the nroots*2 root/weight rows
+    int naux = bounds.naux;
+    double *out_local = out + bounds.ao_pair_loc[sp_block_id] * naux;  // first output row of this shell-pair block
+
+    int nst = nshl_pair * nksh;  // number of (shell pair, k shell) tasks in this block
+    for (int ijk_idx = st_id; ijk_idx < nst; ijk_idx += nst_per_block) {  // thread takes tasks st_id, st_id+blockDim.x, ...
+        int shl_pair_in_block = ijk_idx / nksh;
+        int ksh_in_block = ijk_idx % nksh;
+        int ksh = ksh_in_block + ksh0;
+        int bas_ij = bounds.bas_ij_idx[shl_pair_in_block + shl_pair0];
+        int ish = bas_ij / nbas;
+        int jsh = bas_ij % nbas;
+        double *expi = env + bas[ish*BAS_SLOTS+PTR_EXP];
+        double *expj = env + bas[jsh*BAS_SLOTS+PTR_EXP];
+        double *expk = env + bas[ksh*BAS_SLOTS+PTR_EXP];
+        double *ci = env + bas[ish*BAS_SLOTS+PTR_COEFF];
+        double *cj = env + bas[jsh*BAS_SLOTS+PTR_COEFF];
+        double *ck = env + bas[ksh*BAS_SLOTS+PTR_COEFF];
+        double *ri = env + bas[ish*BAS_SLOTS+PTR_BAS_COORD];
+        double *rj = env + bas[jsh*BAS_SLOTS+PTR_BAS_COORD];
+        double *rk = env + bas[ksh*BAS_SLOTS+PTR_BAS_COORD];
+        double xjxi = rj[0] - ri[0];
+        double yjyi = rj[1] - ri[1];
+        double zjzi = rj[2] - ri[2];
+        double rr_ij = xjxi*xjxi + yjyi*yjyi + zjzi*zjzi;  // |rj - ri|^2
+        rjri[0*nst_per_block] = rj[0] - ri[0];  // cache rj - ri and its squared length in this thread's shared slots
+        rjri[1*nst_per_block] = rj[1] - ri[1];
+        rjri[2*nst_per_block] = rj[2] - ri[2];
+        rjri[3*nst_per_block] = rr_ij;
+        double gout0 = 0;
+        for (int ijkp = 0; ijkp < ijkprim; ++ijkp) {  // flattened loop over primitive triples (ip, jp, kp), kp fastest
+            int ijp = ijkp / kprim;
+            int kp = ijkp % kprim;
+            int ip = ijp / jprim;
+            int jp = ijp % jprim;
+            double ai = expi[ip];
+            double aj = expj[jp];
+            double ak = expk[kp];
+            double aij = ai + aj;
+            double cijk = ci[ip] * cj[jp] * ck[kp];
+            double fac = PI_FAC * cijk / (aij*ak*sqrt(aij+ak));
+            double aj_aij = aj / aij;
+            double theta_ij = ai * aj_aij;
+            double Kab = theta_ij * rjri[3*nst_per_block];  // ai*aj/aij * |rj - ri|^2
+            double fac1 = fac * exp(-Kab);
+            double xij = rjri[0*nst_per_block] * aj_aij + ri[0];  // (xij, yij, zij) = ri + aj/aij * (rj - ri)
+            double yij = rjri[1*nst_per_block] * aj_aij + ri[1];
+            double zij = rjri[2*nst_per_block] * aj_aij + ri[2];
+            double xpq = xij - rk[0];  // displacement of that point from rk
+            double ypq = yij - rk[1];
+            double zpq = zij - rk[2];
+            double rr = xpq * xpq + ypq * ypq + zpq * zpq;
+            double theta = aij * ak / (aij + ak);
+            double theta_rr = theta * rr;
+            if (omega == 0) {  // single root/weight set evaluated at theta_rr
+                rys_roots(1, theta_rr, rw, nst_per_block, 0, 1);
+            } else if (omega > 0) {  // set evaluated at theta_fac*theta_rr, then rescaled in place
+                double omega2 = omega * omega;
+                double theta_fac = omega2 / (omega2 + theta);
+                rys_roots(1, theta_fac*theta_rr, rw, nst_per_block, 0, 1);
+                double sqrt_theta_fac = sqrt(theta_fac);
+                for (int irys = 0; irys < 1; ++irys) {
+                    rw[ irys*2   *nst_per_block] *= theta_fac;  // even rows (read as rt elsewhere in this file) scale by theta_fac
+                    rw[(irys*2+1)*nst_per_block] *= sqrt_theta_fac;  // odd rows (read as wt below) scale by sqrt(theta_fac)
+                }
+            } else {  // omega < 0: unscaled set in rw1, rescaled set with negated weight in rw
+                double omega2 = omega * omega;
+                double theta_fac = omega2 / (omega2 + theta);
+                double *rw1 = rw + 2*nst_per_block;  // second set is stored right after the first
+                rys_roots(1, theta_rr, rw1, nst_per_block, 0, 1);
+                rys_roots(1, theta_fac*theta_rr, rw, nst_per_block, 0, 1);
+                double sqrt_theta_fac = -sqrt(theta_fac);  // negative sign: summing over all nroots subtracts the rescaled set
+                for (int irys = 0; irys < 1; ++irys) {
+                    rw[ irys*2   *nst_per_block] *= theta_fac;
+                    rw[(irys*2+1)*nst_per_block] *= sqrt_theta_fac;
+                }
+            }
+            for (int irys = 0; irys < nroots; ++irys) {  // nroots is 1, or 2 when omega < 0
+                double wt = rw[(2*irys+1)*nst_per_block];
+                gout0 += 1 * fac1 * wt;  // only the weight is used in this variant
+            }
+        }
+        int *ao_loc = envs.ao_loc;
+        int k0 = ao_loc[ksh0] - ao_loc[nbas];  // column offset of the block's first k shell, relative to ao_loc[nbas]
+        double *eri_tensor = out_local + k0 + shl_pair_in_block * 1 * naux + ksh_in_block * 1;  // 1 row per shell pair, 1 column per k shell
+        eri_tensor[0*naux + 0] = gout0;  // plain store, not an accumulation
+    }
+}
+
+__device__  // Device routine, variant 100: three output values (x, y, z first-order terms) per (shell pair, k shell) task (presumably li=1, lj=lk=0 — confirm with the generator)
+void int3c2e_bdiv_100(double *out, Int3c2eEnvVars envs, BDiv3c2eBounds bounds)
+{  // out: result buffer written with row stride bounds.naux; envs: bas/env/ao_loc tables; bounds: per-block offset tables
+    // For better load balance, consume blocks in the reversed order
+    int sp_block_id = gridDim.x - blockIdx.x - 1;  // shell-pair block index, from the x grid dimension
+    int ksh_block_id = gridDim.y - blockIdx.y - 1;  // k-shell block index, from the y grid dimension
+    int shl_pair0 = bounds.shl_pair_offsets[sp_block_id];  // [shl_pair0, shl_pair1): shell pairs handled by this block
+    int shl_pair1 = bounds.shl_pair_offsets[sp_block_id+1];
+    int ksh0 = bounds.ksh_offsets[ksh_block_id];  // [ksh0, ksh1): k shells handled by this block
+    int ksh1 = bounds.ksh_offsets[ksh_block_id+1];
+    int nksh = ksh1 - ksh0;
+    int nshl_pair = shl_pair1 - shl_pair0;
+    int bas_ij0 = bounds.bas_ij_idx[shl_pair0];  // packed pair index, decoded below as ish*nbas + jsh
+    int nbas = envs.nbas;
+    int ish0 = bas_ij0 / nbas;
+    int jsh0 = bas_ij0 % nbas;
+
+    int nst_per_block = blockDim.x;
+    int st_id = threadIdx.x;
+    int *bas = envs.bas;
+    int iprim = bas[ish0*BAS_SLOTS+NPRIM_OF];  // primitive counts are read once, from the block's first pair / first k shell,
+    int jprim = bas[jsh0*BAS_SLOTS+NPRIM_OF];  // and reused for every task below (assumes uniform contraction within a block — TODO confirm)
+    int kprim = bas[ksh0*BAS_SLOTS+NPRIM_OF];
+    int ijprim = iprim * jprim;
+    int ijkprim = ijprim * kprim;
+    int nroots = 1;
+    double *env = envs.env;
+    double omega = env[PTR_RANGE_OMEGA];
+    if (omega < 0) {
+        nroots *= 2;  // omega < 0 stores two root/weight sets per primitive (see the final else branch below)
+    }
+    extern __shared__ double rw_buffer[];
+    double *rw = rw_buffer + st_id;  // per-thread view of shared memory; successive entries are nst_per_block apart
+    double *rjri = rw + nst_per_block * nroots*2;  // four more per-thread slots, placed after the nroots*2 root/weight rows
+    int naux = bounds.naux;
+    double *out_local = out + bounds.ao_pair_loc[sp_block_id] * naux;  // first output row of this shell-pair block
+
+    int nst = nshl_pair * nksh;  // number of (shell pair, k shell) tasks in this block
+    for (int ijk_idx = st_id; ijk_idx < nst; ijk_idx += nst_per_block) {  // thread takes tasks st_id, st_id+blockDim.x, ...
+        int shl_pair_in_block = ijk_idx / nksh;
+        int ksh_in_block = ijk_idx % nksh;
+        int ksh = ksh_in_block + ksh0;
+        int bas_ij = bounds.bas_ij_idx[shl_pair_in_block + shl_pair0];
+        int ish = bas_ij / nbas;
+        int jsh = bas_ij % nbas;
+        double *expi = env + bas[ish*BAS_SLOTS+PTR_EXP];
+        double *expj = env + bas[jsh*BAS_SLOTS+PTR_EXP];
+        double *expk = env + bas[ksh*BAS_SLOTS+PTR_EXP];
+        double *ci = env + bas[ish*BAS_SLOTS+PTR_COEFF];
+        double *cj = env + bas[jsh*BAS_SLOTS+PTR_COEFF];
+        double *ck = env + bas[ksh*BAS_SLOTS+PTR_COEFF];
+        double *ri = env + bas[ish*BAS_SLOTS+PTR_BAS_COORD];
+        double *rj = env + bas[jsh*BAS_SLOTS+PTR_BAS_COORD];
+        double *rk = env + bas[ksh*BAS_SLOTS+PTR_BAS_COORD];
+        double xjxi = rj[0] - ri[0];
+        double yjyi = rj[1] - ri[1];
+        double zjzi = rj[2] - ri[2];
+        double rr_ij = xjxi*xjxi + yjyi*yjyi + zjzi*zjzi;  // |rj - ri|^2
+        rjri[0*nst_per_block] = rj[0] - ri[0];  // cache rj - ri and its squared length in this thread's shared slots
+        rjri[1*nst_per_block] = rj[1] - ri[1];
+        rjri[2*nst_per_block] = rj[2] - ri[2];
+        rjri[3*nst_per_block] = rr_ij;
+        double gout0 = 0;  // accumulators for the x, y and z first-order terms
+        double gout1 = 0;
+        double gout2 = 0;
+        for (int ijkp = 0; ijkp < ijkprim; ++ijkp) {  // flattened loop over primitive triples (ip, jp, kp), kp fastest
+            int ijp = ijkp / kprim;
+            int kp = ijkp % kprim;
+            int ip = ijp / jprim;
+            int jp = ijp % jprim;
+            double ai = expi[ip];
+            double aj = expj[jp];
+            double ak = expk[kp];
+            double aij = ai + aj;
+            double cijk = ci[ip] * cj[jp] * ck[kp];
+            double fac = PI_FAC * cijk / (aij*ak*sqrt(aij+ak));
+            double aj_aij = aj / aij;
+            double theta_ij = ai * aj_aij;
+            double Kab = theta_ij * rjri[3*nst_per_block];  // ai*aj/aij * |rj - ri|^2
+            double fac1 = fac * exp(-Kab);
+            double xij = rjri[0*nst_per_block] * aj_aij + ri[0];  // (xij, yij, zij) = ri + aj/aij * (rj - ri)
+            double yij = rjri[1*nst_per_block] * aj_aij + ri[1];
+            double zij = rjri[2*nst_per_block] * aj_aij + ri[2];
+            double xpq = xij - rk[0];  // displacement of that point from rk
+            double ypq = yij - rk[1];
+            double zpq = zij - rk[2];
+            double rr = xpq * xpq + ypq * ypq + zpq * zpq;
+            double theta = aij * ak / (aij + ak);
+            double theta_rr = theta * rr;
+            if (omega == 0) {  // single root/weight set evaluated at theta_rr
+                rys_roots(1, theta_rr, rw, nst_per_block, 0, 1);
+            } else if (omega > 0) {  // set evaluated at theta_fac*theta_rr, then rescaled in place
+                double omega2 = omega * omega;
+                double theta_fac = omega2 / (omega2 + theta);
+                rys_roots(1, theta_fac*theta_rr, rw, nst_per_block, 0, 1);
+                double sqrt_theta_fac = sqrt(theta_fac);
+                for (int irys = 0; irys < 1; ++irys) {
+                    rw[ irys*2   *nst_per_block] *= theta_fac;  // even rows (read as rt below) scale by theta_fac
+                    rw[(irys*2+1)*nst_per_block] *= sqrt_theta_fac;  // odd rows (read as wt below) scale by sqrt(theta_fac)
+                }
+            } else {  // omega < 0: unscaled set in rw1, rescaled set with negated weight in rw
+                double omega2 = omega * omega;
+                double theta_fac = omega2 / (omega2 + theta);
+                double *rw1 = rw + 2*nst_per_block;  // second set is stored right after the first
+                rys_roots(1, theta_rr, rw1, nst_per_block, 0, 1);
+                rys_roots(1, theta_fac*theta_rr, rw, nst_per_block, 0, 1);
+                double sqrt_theta_fac = -sqrt(theta_fac);  // negative sign: summing over all nroots subtracts the rescaled set
+                for (int irys = 0; irys < 1; ++irys) {
+                    rw[ irys*2   *nst_per_block] *= theta_fac;
+                    rw[(irys*2+1)*nst_per_block] *= sqrt_theta_fac;
+                }
+            }
+            for (int irys = 0; irys < nroots; ++irys) {  // nroots is 1, or 2 when omega < 0
+                double wt = rw[(2*irys+1)*nst_per_block];
+                double rt = rw[ 2*irys   *nst_per_block];
+                double rt_aa = rt / (aij + ak);
+                double rt_aij = rt_aa * ak;
+                double c0x = rjri[0*nst_per_block] * aj_aij - xpq*rt_aij;
+                double trr_10x = c0x * 1;  // x factor carries no prefactor
+                gout0 += trr_10x * fac1 * wt;
+                double c0y = rjri[1*nst_per_block] * aj_aij - ypq*rt_aij;
+                double trr_10y = c0y * fac1;  // fac1 is folded into the y factor
+                gout1 += 1 * trr_10y * wt;
+                double c0z = rjri[2*nst_per_block] * aj_aij - zpq*rt_aij;
+                double trr_10z = c0z * wt;  // wt is folded into the z factor, so each x*y*z product carries fac1*wt once
+                gout2 += 1 * fac1 * trr_10z;
+            }
+        }
+        int *ao_loc = envs.ao_loc;
+        int k0 = ao_loc[ksh0] - ao_loc[nbas];  // column offset of the block's first k shell, relative to ao_loc[nbas]
+        double *eri_tensor = out_local + k0 + shl_pair_in_block * 3 * naux + ksh_in_block * 1;  // 3 rows per shell pair, 1 column per k shell
+        eri_tensor[0*naux + 0] = gout0;  // plain stores, not accumulations
+        eri_tensor[1*naux + 0] = gout1;
+        eri_tensor[2*naux + 0] = gout2;
+    }
+}
+
+__device__  // Device routine, variant 110: nine output values per (shell pair, k shell) task (presumably li=lj=1, lk=0 — confirm with the generator)
+void int3c2e_bdiv_110(double *out, Int3c2eEnvVars envs, BDiv3c2eBounds bounds)
+{  // out: result buffer written with row stride bounds.naux; envs: bas/env/ao_loc tables; bounds: per-block offset tables
+    // For better load balance, consume blocks in the reversed order
+    int sp_block_id = gridDim.x - blockIdx.x - 1;  // shell-pair block index, from the x grid dimension
+    int ksh_block_id = gridDim.y - blockIdx.y - 1;  // k-shell block index, from the y grid dimension
+    int shl_pair0 = bounds.shl_pair_offsets[sp_block_id];  // [shl_pair0, shl_pair1): shell pairs handled by this block
+    int shl_pair1 = bounds.shl_pair_offsets[sp_block_id+1];
+    int ksh0 = bounds.ksh_offsets[ksh_block_id];  // [ksh0, ksh1): k shells handled by this block
+    int ksh1 = bounds.ksh_offsets[ksh_block_id+1];
+    int nksh = ksh1 - ksh0;
+    int nshl_pair = shl_pair1 - shl_pair0;
+    int bas_ij0 = bounds.bas_ij_idx[shl_pair0];  // packed pair index, decoded below as ish*nbas + jsh
+    int nbas = envs.nbas;
+    int ish0 = bas_ij0 / nbas;
+    int jsh0 = bas_ij0 % nbas;
+
+    int nst_per_block = blockDim.x;
+    int st_id = threadIdx.x;
+    int *bas = envs.bas;
+    int iprim = bas[ish0*BAS_SLOTS+NPRIM_OF];  // primitive counts are read once, from the block's first pair / first k shell,
+    int jprim = bas[jsh0*BAS_SLOTS+NPRIM_OF];  // and reused for every task below (assumes uniform contraction within a block — TODO confirm)
+    int kprim = bas[ksh0*BAS_SLOTS+NPRIM_OF];
+    int ijprim = iprim * jprim;
+    int ijkprim = ijprim * kprim;
+    int nroots = 2;
+    double *env = envs.env;
+    double omega = env[PTR_RANGE_OMEGA];
+    if (omega < 0) {
+        nroots *= 2;  // omega < 0 stores two root/weight sets per primitive (see the final else branch below)
+    }
+    extern __shared__ double rw_buffer[];
+    double *rw = rw_buffer + st_id;  // per-thread view of shared memory; successive entries are nst_per_block apart
+    double *rjri = rw + nst_per_block * nroots*2;  // four more per-thread slots, placed after the nroots*2 root/weight rows
+    int naux = bounds.naux;
+    double *out_local = out + bounds.ao_pair_loc[sp_block_id] * naux;  // first output row of this shell-pair block
+
+    int nst = nshl_pair * nksh;  // number of (shell pair, k shell) tasks in this block
+    for (int ijk_idx = st_id; ijk_idx < nst; ijk_idx += nst_per_block) {  // thread takes tasks st_id, st_id+blockDim.x, ...
+        int shl_pair_in_block = ijk_idx / nksh;
+        int ksh_in_block = ijk_idx % nksh;
+        int ksh = ksh_in_block + ksh0;
+        int bas_ij = bounds.bas_ij_idx[shl_pair_in_block + shl_pair0];
+        int ish = bas_ij / nbas;
+        int jsh = bas_ij % nbas;
+        double *expi = env + bas[ish*BAS_SLOTS+PTR_EXP];
+        double *expj = env + bas[jsh*BAS_SLOTS+PTR_EXP];
+        double *expk = env + bas[ksh*BAS_SLOTS+PTR_EXP];
+        double *ci = env + bas[ish*BAS_SLOTS+PTR_COEFF];
+        double *cj = env + bas[jsh*BAS_SLOTS+PTR_COEFF];
+        double *ck = env + bas[ksh*BAS_SLOTS+PTR_COEFF];
+        double *ri = env + bas[ish*BAS_SLOTS+PTR_BAS_COORD];
+        double *rj = env + bas[jsh*BAS_SLOTS+PTR_BAS_COORD];
+        double *rk = env + bas[ksh*BAS_SLOTS+PTR_BAS_COORD];
+        double xjxi = rj[0] - ri[0];  // rj - ri components, also used by the hrr_* terms below
+        double yjyi = rj[1] - ri[1];
+        double zjzi = rj[2] - ri[2];
+        double rr_ij = xjxi*xjxi + yjyi*yjyi + zjzi*zjzi;  // |rj - ri|^2
+        rjri[0*nst_per_block] = rj[0] - ri[0];  // cache rj - ri and its squared length in this thread's shared slots
+        rjri[1*nst_per_block] = rj[1] - ri[1];
+        rjri[2*nst_per_block] = rj[2] - ri[2];
+        rjri[3*nst_per_block] = rr_ij;
+        double gout0 = 0;  // gout0..gout8: nine accumulators (presumably i component fastest, then j — confirm)
+        double gout1 = 0;
+        double gout2 = 0;
+        double gout3 = 0;
+        double gout4 = 0;
+        double gout5 = 0;
+        double gout6 = 0;
+        double gout7 = 0;
+        double gout8 = 0;
+        for (int ijkp = 0; ijkp < ijkprim; ++ijkp) {  // flattened loop over primitive triples (ip, jp, kp), kp fastest
+            int ijp = ijkp / kprim;
+            int kp = ijkp % kprim;
+            int ip = ijp / jprim;
+            int jp = ijp % jprim;
+            double ai = expi[ip];
+            double aj = expj[jp];
+            double ak = expk[kp];
+            double aij = ai + aj;
+            double cijk = ci[ip] * cj[jp] * ck[kp];
+            double fac = PI_FAC * cijk / (aij*ak*sqrt(aij+ak));
+            double aj_aij = aj / aij;
+            double theta_ij = ai * aj_aij;
+            double Kab = theta_ij * rjri[3*nst_per_block];  // ai*aj/aij * |rj - ri|^2
+            double fac1 = fac * exp(-Kab);
+            double xij = rjri[0*nst_per_block] * aj_aij + ri[0];  // (xij, yij, zij) = ri + aj/aij * (rj - ri)
+            double yij = rjri[1*nst_per_block] * aj_aij + ri[1];
+            double zij = rjri[2*nst_per_block] * aj_aij + ri[2];
+            double xpq = xij - rk[0];  // displacement of that point from rk
+            double ypq = yij - rk[1];
+            double zpq = zij - rk[2];
+            double rr = xpq * xpq + ypq * ypq + zpq * zpq;
+            double theta = aij * ak / (aij + ak);
+            double theta_rr = theta * rr;
+            if (omega == 0) {  // single two-root set evaluated at theta_rr
+                rys_roots(2, theta_rr, rw, nst_per_block, 0, 1);
+            } else if (omega > 0) {  // set evaluated at theta_fac*theta_rr, then rescaled in place
+                double omega2 = omega * omega;
+                double theta_fac = omega2 / (omega2 + theta);
+                rys_roots(2, theta_fac*theta_rr, rw, nst_per_block, 0, 1);
+                double sqrt_theta_fac = sqrt(theta_fac);
+                for (int irys = 0; irys < 2; ++irys) {
+                    rw[ irys*2   *nst_per_block] *= theta_fac;  // even rows (read as rt below) scale by theta_fac
+                    rw[(irys*2+1)*nst_per_block] *= sqrt_theta_fac;  // odd rows (read as wt below) scale by sqrt(theta_fac)
+                }
+            } else {  // omega < 0: unscaled set in rw1, rescaled set with negated weights in rw
+                double omega2 = omega * omega;
+                double theta_fac = omega2 / (omega2 + theta);
+                double *rw1 = rw + 4*nst_per_block;  // second set is stored right after the first (2 roots * 2 rows)
+                rys_roots(2, theta_rr, rw1, nst_per_block, 0, 1);
+                rys_roots(2, theta_fac*theta_rr, rw, nst_per_block, 0, 1);
+                double sqrt_theta_fac = -sqrt(theta_fac);  // negative sign: summing over all nroots subtracts the rescaled set
+                for (int irys = 0; irys < 2; ++irys) {
+                    rw[ irys*2   *nst_per_block] *= theta_fac;
+                    rw[(irys*2+1)*nst_per_block] *= sqrt_theta_fac;
+                }
+            }
+            for (int irys = 0; irys < nroots; ++irys) {  // nroots is 2, or 4 when omega < 0
+                double wt = rw[(2*irys+1)*nst_per_block];
+                double rt = rw[ 2*irys   *nst_per_block];
+                double rt_aa = rt / (aij + ak);
+                double rt_aij = rt_aa * ak;
+                double b10 = .5/aij * (1 - rt_aij);
+                double c0x = rjri[0*nst_per_block] * aj_aij - xpq*rt_aij;
+                double trr_10x = c0x * 1;  // x factors carry no prefactor
+                double trr_20x = c0x * trr_10x + 1*b10 * 1;  // second-order x term built from c0x and b10
+                double hrr_110x = trr_20x - xjxi * trr_10x;  // hrr_* terms combine trr_* terms with the rj - ri component
+                gout0 += hrr_110x * fac1 * wt;
+                double hrr_010x = trr_10x - xjxi * 1;
+                double c0y = rjri[1*nst_per_block] * aj_aij - ypq*rt_aij;
+                double trr_10y = c0y * fac1;  // fac1 is folded into the y factors
+                gout1 += hrr_010x * trr_10y * wt;
+                double c0z = rjri[2*nst_per_block] * aj_aij - zpq*rt_aij;
+                double trr_10z = c0z * wt;  // wt is folded into the z factors, so each x*y*z product carries fac1*wt once
+                gout2 += hrr_010x * fac1 * trr_10z;
+                double hrr_010y = trr_10y - yjyi * fac1;
+                gout3 += trr_10x * hrr_010y * wt;
+                double trr_20y = c0y * trr_10y + 1*b10 * fac1;
+                double hrr_110y = trr_20y - yjyi * trr_10y;
+                gout4 += 1 * hrr_110y * wt;
+                gout5 += 1 * hrr_010y * trr_10z;
+                double hrr_010z = trr_10z - zjzi * wt;
+                gout6 += trr_10x * fac1 * hrr_010z;
+                gout7 += 1 * trr_10y * hrr_010z;
+                double trr_20z = c0z * trr_10z + 1*b10 * wt;
+                double hrr_110z = trr_20z - zjzi * trr_10z;
+                gout8 += 1 * fac1 * hrr_110z;
+            }
+        }
+        int *ao_loc = envs.ao_loc;
+        int k0 = ao_loc[ksh0] - ao_loc[nbas];  // column offset of the block's first k shell, relative to ao_loc[nbas]
+        double *eri_tensor = out_local + k0 + shl_pair_in_block * 9 * naux + ksh_in_block * 1;  // 9 rows per shell pair, 1 column per k shell
+        eri_tensor[0*naux + 0] = gout0;  // plain stores, not accumulations
+        eri_tensor[1*naux + 0] = gout1;
+        eri_tensor[2*naux + 0] = gout2;
+        eri_tensor[3*naux + 0] = gout3;
+        eri_tensor[4*naux + 0] = gout4;
+        eri_tensor[5*naux + 0] = gout5;
+        eri_tensor[6*naux + 0] = gout6;
+        eri_tensor[7*naux + 0] = gout7;
+        eri_tensor[8*naux + 0] = gout8;
+    }
+}
+
+__device__  // Device routine, variant 200: six output values (xx, xy, xz, yy, yz, zz) per (shell pair, k shell) task (presumably li=2, lj=lk=0 — confirm with the generator)
+void int3c2e_bdiv_200(double *out, Int3c2eEnvVars envs, BDiv3c2eBounds bounds)
+{  // out: result buffer written with row stride bounds.naux; envs: bas/env/ao_loc tables; bounds: per-block offset tables
+    // For better load balance, consume blocks in the reversed order
+    int sp_block_id = gridDim.x - blockIdx.x - 1;  // shell-pair block index, from the x grid dimension
+    int ksh_block_id = gridDim.y - blockIdx.y - 1;  // k-shell block index, from the y grid dimension
+    int shl_pair0 = bounds.shl_pair_offsets[sp_block_id];  // [shl_pair0, shl_pair1): shell pairs handled by this block
+    int shl_pair1 = bounds.shl_pair_offsets[sp_block_id+1];
+    int ksh0 = bounds.ksh_offsets[ksh_block_id];  // [ksh0, ksh1): k shells handled by this block
+    int ksh1 = bounds.ksh_offsets[ksh_block_id+1];
+    int nksh = ksh1 - ksh0;
+    int nshl_pair = shl_pair1 - shl_pair0;
+    int bas_ij0 = bounds.bas_ij_idx[shl_pair0];  // packed pair index, decoded below as ish*nbas + jsh
+    int nbas = envs.nbas;
+    int ish0 = bas_ij0 / nbas;
+    int jsh0 = bas_ij0 % nbas;
+
+    int nst_per_block = blockDim.x;
+    int st_id = threadIdx.x;
+    int *bas = envs.bas;
+    int iprim = bas[ish0*BAS_SLOTS+NPRIM_OF];  // primitive counts are read once, from the block's first pair / first k shell,
+    int jprim = bas[jsh0*BAS_SLOTS+NPRIM_OF];  // and reused for every task below (assumes uniform contraction within a block — TODO confirm)
+    int kprim = bas[ksh0*BAS_SLOTS+NPRIM_OF];
+    int ijprim = iprim * jprim;
+    int ijkprim = ijprim * kprim;
+    int nroots = 2;
+    double *env = envs.env;
+    double omega = env[PTR_RANGE_OMEGA];
+    if (omega < 0) {
+        nroots *= 2;  // omega < 0 stores two root/weight sets per primitive (see the final else branch below)
+    }
+    extern __shared__ double rw_buffer[];
+    double *rw = rw_buffer + st_id;  // per-thread view of shared memory; successive entries are nst_per_block apart
+    double *rjri = rw + nst_per_block * nroots*2;  // four more per-thread slots, placed after the nroots*2 root/weight rows
+    int naux = bounds.naux;
+    double *out_local = out + bounds.ao_pair_loc[sp_block_id] * naux;  // first output row of this shell-pair block
+
+    int nst = nshl_pair * nksh;  // number of (shell pair, k shell) tasks in this block
+    for (int ijk_idx = st_id; ijk_idx < nst; ijk_idx += nst_per_block) {  // thread takes tasks st_id, st_id+blockDim.x, ...
+        int shl_pair_in_block = ijk_idx / nksh;
+        int ksh_in_block = ijk_idx % nksh;
+        int ksh = ksh_in_block + ksh0;
+        int bas_ij = bounds.bas_ij_idx[shl_pair_in_block + shl_pair0];
+        int ish = bas_ij / nbas;
+        int jsh = bas_ij % nbas;
+        double *expi = env + bas[ish*BAS_SLOTS+PTR_EXP];
+        double *expj = env + bas[jsh*BAS_SLOTS+PTR_EXP];
+        double *expk = env + bas[ksh*BAS_SLOTS+PTR_EXP];
+        double *ci = env + bas[ish*BAS_SLOTS+PTR_COEFF];
+        double *cj = env + bas[jsh*BAS_SLOTS+PTR_COEFF];
+        double *ck = env + bas[ksh*BAS_SLOTS+PTR_COEFF];
+        double *ri = env + bas[ish*BAS_SLOTS+PTR_BAS_COORD];
+        double *rj = env + bas[jsh*BAS_SLOTS+PTR_BAS_COORD];
+        double *rk = env + bas[ksh*BAS_SLOTS+PTR_BAS_COORD];
+        double xjxi = rj[0] - ri[0];
+        double yjyi = rj[1] - ri[1];
+        double zjzi = rj[2] - ri[2];
+        double rr_ij = xjxi*xjxi + yjyi*yjyi + zjzi*zjzi;  // |rj - ri|^2
+        rjri[0*nst_per_block] = rj[0] - ri[0];  // cache rj - ri and its squared length in this thread's shared slots
+        rjri[1*nst_per_block] = rj[1] - ri[1];
+        rjri[2*nst_per_block] = rj[2] - ri[2];
+        rjri[3*nst_per_block] = rr_ij;
+        double gout0 = 0;  // gout0..gout5 accumulate the xx, xy, xz, yy, yz, zz terms in that order
+        double gout1 = 0;
+        double gout2 = 0;
+        double gout3 = 0;
+        double gout4 = 0;
+        double gout5 = 0;
+        for (int ijkp = 0; ijkp < ijkprim; ++ijkp) {  // flattened loop over primitive triples (ip, jp, kp), kp fastest
+            int ijp = ijkp / kprim;
+            int kp = ijkp % kprim;
+            int ip = ijp / jprim;
+            int jp = ijp % jprim;
+            double ai = expi[ip];
+            double aj = expj[jp];
+            double ak = expk[kp];
+            double aij = ai + aj;
+            double cijk = ci[ip] * cj[jp] * ck[kp];
+            double fac = PI_FAC * cijk / (aij*ak*sqrt(aij+ak));
+            double aj_aij = aj / aij;
+            double theta_ij = ai * aj_aij;
+            double Kab = theta_ij * rjri[3*nst_per_block];  // ai*aj/aij * |rj - ri|^2
+            double fac1 = fac * exp(-Kab);
+            double xij = rjri[0*nst_per_block] * aj_aij + ri[0];  // (xij, yij, zij) = ri + aj/aij * (rj - ri)
+            double yij = rjri[1*nst_per_block] * aj_aij + ri[1];
+            double zij = rjri[2*nst_per_block] * aj_aij + ri[2];
+            double xpq = xij - rk[0];  // displacement of that point from rk
+            double ypq = yij - rk[1];
+            double zpq = zij - rk[2];
+            double rr = xpq * xpq + ypq * ypq + zpq * zpq;
+            double theta = aij * ak / (aij + ak);
+            double theta_rr = theta * rr;
+            if (omega == 0) {  // single two-root set evaluated at theta_rr
+                rys_roots(2, theta_rr, rw, nst_per_block, 0, 1);
+            } else if (omega > 0) {  // set evaluated at theta_fac*theta_rr, then rescaled in place
+                double omega2 = omega * omega;
+                double theta_fac = omega2 / (omega2 + theta);
+                rys_roots(2, theta_fac*theta_rr, rw, nst_per_block, 0, 1);
+                double sqrt_theta_fac = sqrt(theta_fac);
+                for (int irys = 0; irys < 2; ++irys) {
+                    rw[ irys*2   *nst_per_block] *= theta_fac;  // even rows (read as rt below) scale by theta_fac
+                    rw[(irys*2+1)*nst_per_block] *= sqrt_theta_fac;  // odd rows (read as wt below) scale by sqrt(theta_fac)
+                }
+            } else {  // omega < 0: unscaled set in rw1, rescaled set with negated weights in rw
+                double omega2 = omega * omega;
+                double theta_fac = omega2 / (omega2 + theta);
+                double *rw1 = rw + 4*nst_per_block;  // second set is stored right after the first (2 roots * 2 rows)
+                rys_roots(2, theta_rr, rw1, nst_per_block, 0, 1);
+                rys_roots(2, theta_fac*theta_rr, rw, nst_per_block, 0, 1);
+                double sqrt_theta_fac = -sqrt(theta_fac);  // negative sign: summing over all nroots subtracts the rescaled set
+                for (int irys = 0; irys < 2; ++irys) {
+                    rw[ irys*2   *nst_per_block] *= theta_fac;
+                    rw[(irys*2+1)*nst_per_block] *= sqrt_theta_fac;
+                }
+            }
+            for (int irys = 0; irys < nroots; ++irys) {  // nroots is 2, or 4 when omega < 0
+                double wt = rw[(2*irys+1)*nst_per_block];
+                double rt = rw[ 2*irys   *nst_per_block];
+                double rt_aa = rt / (aij + ak);
+                double rt_aij = rt_aa * ak;
+                double b10 = .5/aij * (1 - rt_aij);
+                double c0x = rjri[0*nst_per_block] * aj_aij - xpq*rt_aij;
+                double trr_10x = c0x * 1;  // x factors carry no prefactor
+                double trr_20x = c0x * trr_10x + 1*b10 * 1;  // second-order x term built from c0x and b10
+                gout0 += trr_20x * fac1 * wt;  // xx
+                double c0y = rjri[1*nst_per_block] * aj_aij - ypq*rt_aij;
+                double trr_10y = c0y * fac1;  // fac1 is folded into the y factors
+                gout1 += trr_10x * trr_10y * wt;  // xy
+                double c0z = rjri[2*nst_per_block] * aj_aij - zpq*rt_aij;
+                double trr_10z = c0z * wt;  // wt is folded into the z factors, so each x*y*z product carries fac1*wt once
+                gout2 += trr_10x * fac1 * trr_10z;  // xz
+                double trr_20y = c0y * trr_10y + 1*b10 * fac1;
+                gout3 += 1 * trr_20y * wt;  // yy
+                gout4 += 1 * trr_10y * trr_10z;  // yz
+                double trr_20z = c0z * trr_10z + 1*b10 * wt;
+                gout5 += 1 * fac1 * trr_20z;  // zz
+            }
+        }
+        int *ao_loc = envs.ao_loc;
+        int k0 = ao_loc[ksh0] - ao_loc[nbas];  // column offset of the block's first k shell, relative to ao_loc[nbas]
+        double *eri_tensor = out_local + k0 + shl_pair_in_block * 6 * naux + ksh_in_block * 1;  // 6 rows per shell pair, 1 column per k shell
+        eri_tensor[0*naux + 0] = gout0;  // plain stores, not accumulations
+        eri_tensor[1*naux + 0] = gout1;
+        eri_tensor[2*naux + 0] = gout2;
+        eri_tensor[3*naux + 0] = gout3;
+        eri_tensor[4*naux + 0] = gout4;
+        eri_tensor[5*naux + 0] = gout5;
+    }
+}
+
+__device__
+void int3c2e_bdiv_210(double *out, Int3c2eEnvVars envs, BDiv3c2eBounds bounds)
+{
+    // For better load balance, consume blocks in the reversed order
+    int sp_block_id = gridDim.x - blockIdx.x - 1;
+    int ksh_block_id = gridDim.y - blockIdx.y - 1;
+    int shl_pair0 = bounds.shl_pair_offsets[sp_block_id];
+    int shl_pair1 = bounds.shl_pair_offsets[sp_block_id+1];
+    int ksh0 = bounds.ksh_offsets[ksh_block_id];
+    int ksh1 = bounds.ksh_offsets[ksh_block_id+1];
+    int nksh = ksh1 - ksh0;
+    int nshl_pair = shl_pair1 - shl_pair0;
+    int bas_ij0 = bounds.bas_ij_idx[shl_pair0];
+    int nbas = envs.nbas;
+    int ish0 = bas_ij0 / nbas;
+    int jsh0 = bas_ij0 % nbas;
+
+    int nst_per_block = blockDim.x;
+    int st_id = threadIdx.x;
+    int *bas = envs.bas;
+    int iprim = bas[ish0*BAS_SLOTS+NPRIM_OF];
+    int jprim = bas[jsh0*BAS_SLOTS+NPRIM_OF];
+    int kprim = bas[ksh0*BAS_SLOTS+NPRIM_OF];
+    int ijprim = iprim * jprim;
+    int ijkprim = ijprim * kprim;
+    int nroots = 2;
+    double *env = envs.env;
+    double omega = env[PTR_RANGE_OMEGA];
+    if (omega < 0) {
+        nroots *= 2;
+    }
+    extern __shared__ double rw_buffer[];
+    double *rw = rw_buffer + st_id;
+    double *rjri = rw + nst_per_block * nroots*2;
+    int naux = bounds.naux;
+    double *out_local = out + bounds.ao_pair_loc[sp_block_id] * naux;
+
+    int nst = nshl_pair * nksh;
+    for (int ijk_idx = st_id; ijk_idx < nst; ijk_idx += nst_per_block) {
+        int shl_pair_in_block = ijk_idx / nksh;
+        int ksh_in_block = ijk_idx % nksh;
+        int ksh = ksh_in_block + ksh0;
+        int bas_ij = bounds.bas_ij_idx[shl_pair_in_block + shl_pair0];
+        int ish = bas_ij / nbas;
+        int jsh = bas_ij % nbas;
+        double *expi = env + bas[ish*BAS_SLOTS+PTR_EXP];
+        double *expj = env + bas[jsh*BAS_SLOTS+PTR_EXP];
+        double *expk = env + bas[ksh*BAS_SLOTS+PTR_EXP];
+        double *ci = env + bas[ish*BAS_SLOTS+PTR_COEFF];
+        double *cj = env + bas[jsh*BAS_SLOTS+PTR_COEFF];
+        double *ck = env + bas[ksh*BAS_SLOTS+PTR_COEFF];
+        double *ri = env + bas[ish*BAS_SLOTS+PTR_BAS_COORD];
+        double *rj = env + bas[jsh*BAS_SLOTS+PTR_BAS_COORD];
+        double *rk = env + bas[ksh*BAS_SLOTS+PTR_BAS_COORD];
+        double xjxi = rj[0] - ri[0];
+        double yjyi = rj[1] - ri[1];
+        double zjzi = rj[2] - ri[2];
+        double rr_ij = xjxi*xjxi + yjyi*yjyi + zjzi*zjzi;
+        rjri[0*nst_per_block] = rj[0] - ri[0];
+        rjri[1*nst_per_block] = rj[1] - ri[1];
+        rjri[2*nst_per_block] = rj[2] - ri[2];
+        rjri[3*nst_per_block] = rr_ij;
+        double gout0 = 0;
+        double gout1 = 0;
+        double gout2 = 0;
+        double gout3 = 0;
+        double gout4 = 0;
+        double gout5 = 0;
+        double gout6 = 0;
+        double gout7 = 0;
+        double gout8 = 0;
+        double gout9 = 0;
+        double gout10 = 0;
+        double gout11 = 0;
+        double gout12 = 0;
+        double gout13 = 0;
+        double gout14 = 0;
+        double gout15 = 0;
+        double gout16 = 0;
+        double gout17 = 0;
+        for (int ijkp = 0; ijkp < ijkprim; ++ijkp) {
+            int ijp = ijkp / kprim;
+            int kp = ijkp % kprim;
+            int ip = ijp / jprim;
+            int jp = ijp % jprim;
+            double ai = expi[ip];
+            double aj = expj[jp];
+            double ak = expk[kp];
+            double aij = ai + aj;
+            double cijk = ci[ip] * cj[jp] * ck[kp];
+            double fac = PI_FAC * cijk / (aij*ak*sqrt(aij+ak));
+            double aj_aij = aj / aij;
+            double theta_ij = ai * aj_aij;
+            double Kab = theta_ij * rjri[3*nst_per_block];
+            double fac1 = fac * exp(-Kab);
+            double xij = rjri[0*nst_per_block] * aj_aij + ri[0];
+            double yij = rjri[1*nst_per_block] * aj_aij + ri[1];
+            double zij = rjri[2*nst_per_block] * aj_aij + ri[2];
+            double xpq = xij - rk[0];
+            double ypq = yij - rk[1];
+            double zpq = zij - rk[2];
+            double rr = xpq * xpq + ypq * ypq + zpq * zpq;
+            double theta = aij * ak / (aij + ak);
+            double theta_rr = theta * rr;
+            if (omega == 0) {
+                rys_roots(2, theta_rr, rw, nst_per_block, 0, 1);
+            } else if (omega > 0) {
+                double omega2 = omega * omega;
+                double theta_fac = omega2 / (omega2 + theta);
+                rys_roots(2, theta_fac*theta_rr, rw, nst_per_block, 0, 1);
+                double sqrt_theta_fac = sqrt(theta_fac);
+                for (int irys = 0; irys < 2; ++irys) {
+                    rw[ irys*2   *nst_per_block] *= theta_fac;
+                    rw[(irys*2+1)*nst_per_block] *= sqrt_theta_fac;
+                }
+            } else {
+                double omega2 = omega * omega;
+                double theta_fac = omega2 / (omega2 + theta);
+                double *rw1 = rw + 4*nst_per_block;
+                rys_roots(2, theta_rr, rw1, nst_per_block, 0, 1);
+                rys_roots(2, theta_fac*theta_rr, rw, nst_per_block, 0, 1);
+                double sqrt_theta_fac = -sqrt(theta_fac);
+                for (int irys = 0; irys < 2; ++irys) {
+                    rw[ irys*2   *nst_per_block] *= theta_fac;
+                    rw[(irys*2+1)*nst_per_block] *= sqrt_theta_fac;
+                }
+            }
+            for (int irys = 0; irys < nroots; ++irys) {
+                double wt = rw[(2*irys+1)*nst_per_block];
+                double rt = rw[ 2*irys   *nst_per_block];
+                double rt_aa = rt / (aij + ak);
+                double rt_aij = rt_aa * ak;
+                double b10 = .5/aij * (1 - rt_aij);
+                double c0x = rjri[0*nst_per_block] * aj_aij - xpq*rt_aij;
+                double trr_10x = c0x * 1;
+                double trr_20x = c0x * trr_10x + 1*b10 * 1;
+                double trr_30x = c0x * trr_20x + 2*b10 * trr_10x;
+                double hrr_210x = trr_30x - xjxi * trr_20x;
+                gout0 += hrr_210x * fac1 * wt;
+                double hrr_110x = trr_20x - xjxi * trr_10x;
+                double c0y = rjri[1*nst_per_block] * aj_aij - ypq*rt_aij;
+                double trr_10y = c0y * fac1;
+                gout1 += hrr_110x * trr_10y * wt;
+                double c0z = rjri[2*nst_per_block] * aj_aij - zpq*rt_aij;
+                double trr_10z = c0z * wt;
+                gout2 += hrr_110x * fac1 * trr_10z;
+                double hrr_010x = trr_10x - xjxi * 1;
+                double trr_20y = c0y * trr_10y + 1*b10 * fac1;
+                gout3 += hrr_010x * trr_20y * wt;
+                gout4 += hrr_010x * trr_10y * trr_10z;
+                double trr_20z = c0z * trr_10z + 1*b10 * wt;
+                gout5 += hrr_010x * fac1 * trr_20z;
+                double hrr_010y = trr_10y - yjyi * fac1;
+                gout6 += trr_20x * hrr_010y * wt;
+                double hrr_110y = trr_20y - yjyi * trr_10y;
+                gout7 += trr_10x * hrr_110y * wt;
+                gout8 += trr_10x * hrr_010y * trr_10z;
+                double trr_30y = c0y * trr_20y + 2*b10 * trr_10y;
+                double hrr_210y = trr_30y - yjyi * trr_20y;
+                gout9 += 1 * hrr_210y * wt;
+                gout10 += 1 * hrr_110y * trr_10z;
+                gout11 += 1 * hrr_010y * trr_20z;
+                double hrr_010z = trr_10z - zjzi * wt;
+                gout12 += trr_20x * fac1 * hrr_010z;
+                gout13 += trr_10x * trr_10y * hrr_010z;
+                double hrr_110z = trr_20z - zjzi * trr_10z;
+                gout14 += trr_10x * fac1 * hrr_110z;
+                gout15 += 1 * trr_20y * hrr_010z;
+                gout16 += 1 * trr_10y * hrr_110z;
+                double trr_30z = c0z * trr_20z + 2*b10 * trr_10z;
+                double hrr_210z = trr_30z - zjzi * trr_20z;
+                gout17 += 1 * fac1 * hrr_210z;
+            }
+        }
+        int *ao_loc = envs.ao_loc;
+        int k0 = ao_loc[ksh0] - ao_loc[nbas];
+        double *eri_tensor = out_local + k0 + shl_pair_in_block * 18 * naux + ksh_in_block * 1;
+        eri_tensor[0*naux + 0] = gout0;
+        eri_tensor[1*naux + 0] = gout1;
+        eri_tensor[2*naux + 0] = gout2;
+        eri_tensor[3*naux + 0] = gout3;
+        eri_tensor[4*naux + 0] = gout4;
+        eri_tensor[5*naux + 0] = gout5;
+        eri_tensor[6*naux + 0] = gout6;
+        eri_tensor[7*naux + 0] = gout7;
+        eri_tensor[8*naux + 0] = gout8;
+        eri_tensor[9*naux + 0] = gout9;
+        eri_tensor[10*naux + 0] = gout10;
+        eri_tensor[11*naux + 0] = gout11;
+        eri_tensor[12*naux + 0] = gout12;
+        eri_tensor[13*naux + 0] = gout13;
+        eri_tensor[14*naux + 0] = gout14;
+        eri_tensor[15*naux + 0] = gout15;
+        eri_tensor[16*naux + 0] = gout16;
+        eri_tensor[17*naux + 0] = gout17;
+    }
+}
+
+__device__
+void int3c2e_bdiv_220(double *out, Int3c2eEnvVars envs, BDiv3c2eBounds bounds) // Fills 36 values per (shell pair, ksh) task of this thread block into `out`; the 220 suffix presumably encodes angular momenta (li, lj, lk) = (2, 2, 0) - TODO confirm against the code generator.
+{
+    // For better load balance, consume blocks in the reversed order
+    int sp_block_id = gridDim.x - blockIdx.x - 1; // grid x selects the shell-pair block
+    int ksh_block_id = gridDim.y - blockIdx.y - 1; // grid y selects the ksh block
+    int shl_pair0 = bounds.shl_pair_offsets[sp_block_id]; // [shl_pair0, shl_pair1): shell pairs handled by this block
+    int shl_pair1 = bounds.shl_pair_offsets[sp_block_id+1];
+    int ksh0 = bounds.ksh_offsets[ksh_block_id]; // [ksh0, ksh1): k shells handled by this block
+    int ksh1 = bounds.ksh_offsets[ksh_block_id+1];
+    int nksh = ksh1 - ksh0;
+    int nshl_pair = shl_pair1 - shl_pair0;
+    int bas_ij0 = bounds.bas_ij_idx[shl_pair0]; // packed pair index, decoded below as ish*nbas + jsh
+    int nbas = envs.nbas;
+    int ish0 = bas_ij0 / nbas;
+    int jsh0 = bas_ij0 % nbas;
+
+    int nst_per_block = blockDim.x;
+    int st_id = threadIdx.x;
+    int *bas = envs.bas;
+    int iprim = bas[ish0*BAS_SLOTS+NPRIM_OF]; // primitive counts are read once from the block's first pair / first ksh and reused for every task,
+    int jprim = bas[jsh0*BAS_SLOTS+NPRIM_OF]; // so this presumably relies on all shells of a block sharing them - TODO confirm with the host-side blocking
+    int kprim = bas[ksh0*BAS_SLOTS+NPRIM_OF];
+    int ijprim = iprim * jprim;
+    int ijkprim = ijprim * kprim;
+    int nroots = 3;
+    double *env = envs.env;
+    double omega = env[PTR_RANGE_OMEGA];
+    if (omega < 0) {
+        nroots *= 2; // negative omega keeps two root/weight sets (rw and rw1 below); both are summed in the root loop
+    }
+    extern __shared__ double rw_buffer[];
+    double *rw = rw_buffer + st_id; // per-thread slots in shared memory; consecutive entries of one thread are nst_per_block apart
+    double *rjri = rw + nst_per_block * nroots*2; // 4 further per-thread slots placed after the nroots (root, weight) pairs
+    int naux = bounds.naux;
+    double *out_local = out + bounds.ao_pair_loc[sp_block_id] * naux; // first output row of this shell-pair block; rows are naux doubles apart
+
+    int nst = nshl_pair * nksh;
+    for (int ijk_idx = st_id; ijk_idx < nst; ijk_idx += nst_per_block) { // each thread takes tasks st_id, st_id + blockDim.x, ...
+        int shl_pair_in_block = ijk_idx / nksh; // task index = shl_pair_in_block * nksh + ksh_in_block
+        int ksh_in_block = ijk_idx % nksh;
+        int ksh = ksh_in_block + ksh0;
+        int bas_ij = bounds.bas_ij_idx[shl_pair_in_block + shl_pair0];
+        int ish = bas_ij / nbas;
+        int jsh = bas_ij % nbas;
+        double *expi = env + bas[ish*BAS_SLOTS+PTR_EXP];
+        double *expj = env + bas[jsh*BAS_SLOTS+PTR_EXP];
+        double *expk = env + bas[ksh*BAS_SLOTS+PTR_EXP];
+        double *ci = env + bas[ish*BAS_SLOTS+PTR_COEFF];
+        double *cj = env + bas[jsh*BAS_SLOTS+PTR_COEFF];
+        double *ck = env + bas[ksh*BAS_SLOTS+PTR_COEFF];
+        double *ri = env + bas[ish*BAS_SLOTS+PTR_BAS_COORD];
+        double *rj = env + bas[jsh*BAS_SLOTS+PTR_BAS_COORD];
+        double *rk = env + bas[ksh*BAS_SLOTS+PTR_BAS_COORD];
+        double xjxi = rj[0] - ri[0];
+        double yjyi = rj[1] - ri[1];
+        double zjzi = rj[2] - ri[2];
+        double rr_ij = xjxi*xjxi + yjyi*yjyi + zjzi*zjzi;
+        rjri[0*nst_per_block] = rj[0] - ri[0]; // rjri[0..2] = rj - ri, rjri[3] = |rj - ri|^2
+        rjri[1*nst_per_block] = rj[1] - ri[1];
+        rjri[2*nst_per_block] = rj[2] - ri[2];
+        rjri[3*nst_per_block] = rr_ij;
+        double gout0 = 0; // gout0..gout35: one accumulator per output row written at the end of the task
+        double gout1 = 0;
+        double gout2 = 0;
+        double gout3 = 0;
+        double gout4 = 0;
+        double gout5 = 0;
+        double gout6 = 0;
+        double gout7 = 0;
+        double gout8 = 0;
+        double gout9 = 0;
+        double gout10 = 0;
+        double gout11 = 0;
+        double gout12 = 0;
+        double gout13 = 0;
+        double gout14 = 0;
+        double gout15 = 0;
+        double gout16 = 0;
+        double gout17 = 0;
+        double gout18 = 0;
+        double gout19 = 0;
+        double gout20 = 0;
+        double gout21 = 0;
+        double gout22 = 0;
+        double gout23 = 0;
+        double gout24 = 0;
+        double gout25 = 0;
+        double gout26 = 0;
+        double gout27 = 0;
+        double gout28 = 0;
+        double gout29 = 0;
+        double gout30 = 0;
+        double gout31 = 0;
+        double gout32 = 0;
+        double gout33 = 0;
+        double gout34 = 0;
+        double gout35 = 0;
+        for (int ijkp = 0; ijkp < ijkprim; ++ijkp) { // flattened loop over primitive triples (ip, jp, kp), kp fastest
+            int ijp = ijkp / kprim;
+            int kp = ijkp % kprim;
+            int ip = ijp / jprim;
+            int jp = ijp % jprim;
+            double ai = expi[ip];
+            double aj = expj[jp];
+            double ak = expk[kp];
+            double aij = ai + aj;
+            double cijk = ci[ip] * cj[jp] * ck[kp];
+            double fac = PI_FAC * cijk / (aij*ak*sqrt(aij+ak));
+            double aj_aij = aj / aij;
+            double theta_ij = ai * aj_aij;
+            double Kab = theta_ij * rjri[3*nst_per_block];
+            double fac1 = fac * exp(-Kab); // prefactor damped by exp(-ai*aj/aij * |rj - ri|^2)
+            double xij = rjri[0*nst_per_block] * aj_aij + ri[0]; // (xij, yij, zij) = ri + aj/aij * (rj - ri), the exponent-weighted center of ri and rj
+            double yij = rjri[1*nst_per_block] * aj_aij + ri[1];
+            double zij = rjri[2*nst_per_block] * aj_aij + ri[2];
+            double xpq = xij - rk[0]; // offset of that center from rk
+            double ypq = yij - rk[1];
+            double zpq = zij - rk[2];
+            double rr = xpq * xpq + ypq * ypq + zpq * zpq;
+            double theta = aij * ak / (aij + ak);
+            double theta_rr = theta * rr;
+            if (omega == 0) {
+                rys_roots(3, theta_rr, rw, nst_per_block, 0, 1); // NOTE(review): rys_roots is defined elsewhere; assumed to fill rw with (root, weight) pairs at stride nst_per_block, matching how rw is read below
+            } else if (omega > 0) {
+                double omega2 = omega * omega;
+                double theta_fac = omega2 / (omega2 + theta);
+                rys_roots(3, theta_fac*theta_rr, rw, nst_per_block, 0, 1);
+                double sqrt_theta_fac = sqrt(theta_fac);
+                for (int irys = 0; irys < 3; ++irys) {
+                    rw[ irys*2   *nst_per_block] *= theta_fac; // even slots (read as roots below) scaled by theta_fac
+                    rw[(irys*2+1)*nst_per_block] *= sqrt_theta_fac; // odd slots (read as weights below) scaled by sqrt(theta_fac)
+                }
+            } else {
+                double omega2 = omega * omega;
+                double theta_fac = omega2 / (omega2 + theta);
+                double *rw1 = rw + 6*nst_per_block; // second set stored right after the first 3 (root, weight) pairs
+                rys_roots(3, theta_rr, rw1, nst_per_block, 0, 1);
+                rys_roots(3, theta_fac*theta_rr, rw, nst_per_block, 0, 1);
+                double sqrt_theta_fac = -sqrt(theta_fac); // negated weights: the omega-scaled set is subtracted from the unscaled set held in rw1
+                for (int irys = 0; irys < 3; ++irys) {
+                    rw[ irys*2   *nst_per_block] *= theta_fac;
+                    rw[(irys*2+1)*nst_per_block] *= sqrt_theta_fac;
+                }
+            }
+            for (int irys = 0; irys < nroots; ++irys) { // covers both root sets when omega < 0
+                double wt = rw[(2*irys+1)*nst_per_block];
+                double rt = rw[ 2*irys   *nst_per_block];
+                double rt_aa = rt / (aij + ak);
+                double rt_aij = rt_aa * ak;
+                double b10 = .5/aij * (1 - rt_aij);
+                double c0x = rjri[0*nst_per_block] * aj_aij - xpq*rt_aij;
+                double trr_10x = c0x * 1; // x factors start from 1; fac1 is folded into the y factors and wt into the z factors
+                double trr_20x = c0x * trr_10x + 1*b10 * 1; // recurrence used throughout: trr_(n+1)0 = c0 * trr_n0 + n*b10 * trr_(n-1)0
+                double trr_30x = c0x * trr_20x + 2*b10 * trr_10x;
+                double trr_40x = c0x * trr_30x + 3*b10 * trr_20x;
+                double hrr_310x = trr_40x - xjxi * trr_30x; // transfer used throughout: hrr_(i)(j+1) = hrr_(i+1)(j) - xjxi * hrr_(i)(j)
+                double hrr_210x = trr_30x - xjxi * trr_20x;
+                double hrr_220x = hrr_310x - xjxi * hrr_210x;
+                gout0 += hrr_220x * fac1 * wt; // every gout term is a product of one x, one y and one z factor
+                double hrr_110x = trr_20x - xjxi * trr_10x;
+                double hrr_120x = hrr_210x - xjxi * hrr_110x;
+                double c0y = rjri[1*nst_per_block] * aj_aij - ypq*rt_aij;
+                double trr_10y = c0y * fac1;
+                gout1 += hrr_120x * trr_10y * wt;
+                double c0z = rjri[2*nst_per_block] * aj_aij - zpq*rt_aij;
+                double trr_10z = c0z * wt;
+                gout2 += hrr_120x * fac1 * trr_10z;
+                double hrr_010x = trr_10x - xjxi * 1;
+                double hrr_020x = hrr_110x - xjxi * hrr_010x;
+                double trr_20y = c0y * trr_10y + 1*b10 * fac1;
+                gout3 += hrr_020x * trr_20y * wt;
+                gout4 += hrr_020x * trr_10y * trr_10z;
+                double trr_20z = c0z * trr_10z + 1*b10 * wt;
+                gout5 += hrr_020x * fac1 * trr_20z;
+                double hrr_010y = trr_10y - yjyi * fac1;
+                gout6 += hrr_210x * hrr_010y * wt;
+                double hrr_110y = trr_20y - yjyi * trr_10y;
+                gout7 += hrr_110x * hrr_110y * wt;
+                gout8 += hrr_110x * hrr_010y * trr_10z;
+                double trr_30y = c0y * trr_20y + 2*b10 * trr_10y;
+                double hrr_210y = trr_30y - yjyi * trr_20y;
+                gout9 += hrr_010x * hrr_210y * wt;
+                gout10 += hrr_010x * hrr_110y * trr_10z;
+                gout11 += hrr_010x * hrr_010y * trr_20z;
+                double hrr_010z = trr_10z - zjzi * wt;
+                gout12 += hrr_210x * fac1 * hrr_010z;
+                gout13 += hrr_110x * trr_10y * hrr_010z;
+                double hrr_110z = trr_20z - zjzi * trr_10z;
+                gout14 += hrr_110x * fac1 * hrr_110z;
+                gout15 += hrr_010x * trr_20y * hrr_010z;
+                gout16 += hrr_010x * trr_10y * hrr_110z;
+                double trr_30z = c0z * trr_20z + 2*b10 * trr_10z;
+                double hrr_210z = trr_30z - zjzi * trr_20z;
+                gout17 += hrr_010x * fac1 * hrr_210z;
+                double hrr_020y = hrr_110y - yjyi * hrr_010y;
+                gout18 += trr_20x * hrr_020y * wt;
+                double hrr_120y = hrr_210y - yjyi * hrr_110y;
+                gout19 += trr_10x * hrr_120y * wt;
+                gout20 += trr_10x * hrr_020y * trr_10z;
+                double trr_40y = c0y * trr_30y + 3*b10 * trr_20y;
+                double hrr_310y = trr_40y - yjyi * trr_30y;
+                double hrr_220y = hrr_310y - yjyi * hrr_210y;
+                gout21 += 1 * hrr_220y * wt;
+                gout22 += 1 * hrr_120y * trr_10z;
+                gout23 += 1 * hrr_020y * trr_20z;
+                gout24 += trr_20x * hrr_010y * hrr_010z;
+                gout25 += trr_10x * hrr_110y * hrr_010z;
+                gout26 += trr_10x * hrr_010y * hrr_110z;
+                gout27 += 1 * hrr_210y * hrr_010z;
+                gout28 += 1 * hrr_110y * hrr_110z;
+                gout29 += 1 * hrr_010y * hrr_210z;
+                double hrr_020z = hrr_110z - zjzi * hrr_010z;
+                gout30 += trr_20x * fac1 * hrr_020z;
+                gout31 += trr_10x * trr_10y * hrr_020z;
+                double hrr_120z = hrr_210z - zjzi * hrr_110z;
+                gout32 += trr_10x * fac1 * hrr_120z;
+                gout33 += 1 * trr_20y * hrr_020z;
+                gout34 += 1 * trr_10y * hrr_120z;
+                double trr_40z = c0z * trr_30z + 3*b10 * trr_20z;
+                double hrr_310z = trr_40z - zjzi * trr_30z;
+                double hrr_220z = hrr_310z - zjzi * hrr_210z;
+                gout35 += 1 * fac1 * hrr_220z;
+            }
+        }
+        int *ao_loc = envs.ao_loc;
+        int k0 = ao_loc[ksh0] - ao_loc[nbas]; // column offset of ksh0 measured from shell nbas; presumably the k shells are stored after the nbas orbital shells - TODO confirm
+        double *eri_tensor = out_local + k0 + shl_pair_in_block * 36 * naux + ksh_in_block * 1; // 36 rows per shell pair, 1 column per ksh
+        eri_tensor[0*naux + 0] = gout0; // results overwrite (not accumulate into) the output
+        eri_tensor[1*naux + 0] = gout1;
+        eri_tensor[2*naux + 0] = gout2;
+        eri_tensor[3*naux + 0] = gout3;
+        eri_tensor[4*naux + 0] = gout4;
+        eri_tensor[5*naux + 0] = gout5;
+        eri_tensor[6*naux + 0] = gout6;
+        eri_tensor[7*naux + 0] = gout7;
+        eri_tensor[8*naux + 0] = gout8;
+        eri_tensor[9*naux + 0] = gout9;
+        eri_tensor[10*naux + 0] = gout10;
+        eri_tensor[11*naux + 0] = gout11;
+        eri_tensor[12*naux + 0] = gout12;
+        eri_tensor[13*naux + 0] = gout13;
+        eri_tensor[14*naux + 0] = gout14;
+        eri_tensor[15*naux + 0] = gout15;
+        eri_tensor[16*naux + 0] = gout16;
+        eri_tensor[17*naux + 0] = gout17;
+        eri_tensor[18*naux + 0] = gout18;
+        eri_tensor[19*naux + 0] = gout19;
+        eri_tensor[20*naux + 0] = gout20;
+        eri_tensor[21*naux + 0] = gout21;
+        eri_tensor[22*naux + 0] = gout22;
+        eri_tensor[23*naux + 0] = gout23;
+        eri_tensor[24*naux + 0] = gout24;
+        eri_tensor[25*naux + 0] = gout25;
+        eri_tensor[26*naux + 0] = gout26;
+        eri_tensor[27*naux + 0] = gout27;
+        eri_tensor[28*naux + 0] = gout28;
+        eri_tensor[29*naux + 0] = gout29;
+        eri_tensor[30*naux + 0] = gout30;
+        eri_tensor[31*naux + 0] = gout31;
+        eri_tensor[32*naux + 0] = gout32;
+        eri_tensor[33*naux + 0] = gout33;
+        eri_tensor[34*naux + 0] = gout34;
+        eri_tensor[35*naux + 0] = gout35;
+    }
+}
+
+__device__
+void int3c2e_bdiv_001(double *out, Int3c2eEnvVars envs, BDiv3c2eBounds bounds) // Fills 3 values per (shell pair, ksh) task of this thread block into `out`; the 001 suffix presumably encodes angular momenta (li, lj, lk) = (0, 0, 1) - TODO confirm against the code generator.
+{
+    // For better load balance, consume blocks in the reversed order
+    int sp_block_id = gridDim.x - blockIdx.x - 1; // grid x selects the shell-pair block
+    int ksh_block_id = gridDim.y - blockIdx.y - 1; // grid y selects the ksh block
+    int shl_pair0 = bounds.shl_pair_offsets[sp_block_id]; // [shl_pair0, shl_pair1): shell pairs handled by this block
+    int shl_pair1 = bounds.shl_pair_offsets[sp_block_id+1];
+    int ksh0 = bounds.ksh_offsets[ksh_block_id]; // [ksh0, ksh1): k shells handled by this block
+    int ksh1 = bounds.ksh_offsets[ksh_block_id+1];
+    int nksh = ksh1 - ksh0;
+    int nshl_pair = shl_pair1 - shl_pair0;
+    int bas_ij0 = bounds.bas_ij_idx[shl_pair0]; // packed pair index, decoded below as ish*nbas + jsh
+    int nbas = envs.nbas;
+    int ish0 = bas_ij0 / nbas;
+    int jsh0 = bas_ij0 % nbas;
+
+    int nst_per_block = blockDim.x;
+    int st_id = threadIdx.x;
+    int *bas = envs.bas;
+    int iprim = bas[ish0*BAS_SLOTS+NPRIM_OF]; // primitive counts are read once from the block's first pair / first ksh and reused for every task,
+    int jprim = bas[jsh0*BAS_SLOTS+NPRIM_OF]; // so this presumably relies on all shells of a block sharing them - TODO confirm with the host-side blocking
+    int kprim = bas[ksh0*BAS_SLOTS+NPRIM_OF];
+    int ijprim = iprim * jprim;
+    int ijkprim = ijprim * kprim;
+    int nroots = 1;
+    double *env = envs.env;
+    double omega = env[PTR_RANGE_OMEGA];
+    if (omega < 0) {
+        nroots *= 2; // negative omega keeps two root/weight sets (rw and rw1 below); both are summed in the root loop
+    }
+    extern __shared__ double rw_buffer[];
+    double *rw = rw_buffer + st_id; // per-thread slots in shared memory; consecutive entries of one thread are nst_per_block apart
+    double *rjri = rw + nst_per_block * nroots*2; // 4 further per-thread slots placed after the nroots (root, weight) pairs
+    int naux = bounds.naux;
+    double *out_local = out + bounds.ao_pair_loc[sp_block_id] * naux; // first output row of this shell-pair block; rows are naux doubles apart
+
+    int nst = nshl_pair * nksh;
+    for (int ijk_idx = st_id; ijk_idx < nst; ijk_idx += nst_per_block) { // each thread takes tasks st_id, st_id + blockDim.x, ...
+        int shl_pair_in_block = ijk_idx / nksh; // task index = shl_pair_in_block * nksh + ksh_in_block
+        int ksh_in_block = ijk_idx % nksh;
+        int ksh = ksh_in_block + ksh0;
+        int bas_ij = bounds.bas_ij_idx[shl_pair_in_block + shl_pair0];
+        int ish = bas_ij / nbas;
+        int jsh = bas_ij % nbas;
+        double *expi = env + bas[ish*BAS_SLOTS+PTR_EXP];
+        double *expj = env + bas[jsh*BAS_SLOTS+PTR_EXP];
+        double *expk = env + bas[ksh*BAS_SLOTS+PTR_EXP];
+        double *ci = env + bas[ish*BAS_SLOTS+PTR_COEFF];
+        double *cj = env + bas[jsh*BAS_SLOTS+PTR_COEFF];
+        double *ck = env + bas[ksh*BAS_SLOTS+PTR_COEFF];
+        double *ri = env + bas[ish*BAS_SLOTS+PTR_BAS_COORD];
+        double *rj = env + bas[jsh*BAS_SLOTS+PTR_BAS_COORD];
+        double *rk = env + bas[ksh*BAS_SLOTS+PTR_BAS_COORD];
+        double xjxi = rj[0] - ri[0];
+        double yjyi = rj[1] - ri[1];
+        double zjzi = rj[2] - ri[2];
+        double rr_ij = xjxi*xjxi + yjyi*yjyi + zjzi*zjzi;
+        rjri[0*nst_per_block] = rj[0] - ri[0]; // rjri[0..2] = rj - ri, rjri[3] = |rj - ri|^2
+        rjri[1*nst_per_block] = rj[1] - ri[1];
+        rjri[2*nst_per_block] = rj[2] - ri[2];
+        rjri[3*nst_per_block] = rr_ij;
+        double gout0 = 0; // gout0..gout2: one accumulator per output column written at the end of the task
+        double gout1 = 0;
+        double gout2 = 0;
+        for (int ijkp = 0; ijkp < ijkprim; ++ijkp) { // flattened loop over primitive triples (ip, jp, kp), kp fastest
+            int ijp = ijkp / kprim;
+            int kp = ijkp % kprim;
+            int ip = ijp / jprim;
+            int jp = ijp % jprim;
+            double ai = expi[ip];
+            double aj = expj[jp];
+            double ak = expk[kp];
+            double aij = ai + aj;
+            double cijk = ci[ip] * cj[jp] * ck[kp];
+            double fac = PI_FAC * cijk / (aij*ak*sqrt(aij+ak));
+            double aj_aij = aj / aij;
+            double theta_ij = ai * aj_aij;
+            double Kab = theta_ij * rjri[3*nst_per_block];
+            double fac1 = fac * exp(-Kab); // prefactor damped by exp(-ai*aj/aij * |rj - ri|^2)
+            double xij = rjri[0*nst_per_block] * aj_aij + ri[0]; // (xij, yij, zij) = ri + aj/aij * (rj - ri), the exponent-weighted center of ri and rj
+            double yij = rjri[1*nst_per_block] * aj_aij + ri[1];
+            double zij = rjri[2*nst_per_block] * aj_aij + ri[2];
+            double xpq = xij - rk[0]; // offset of that center from rk
+            double ypq = yij - rk[1];
+            double zpq = zij - rk[2];
+            double rr = xpq * xpq + ypq * ypq + zpq * zpq;
+            double theta = aij * ak / (aij + ak);
+            double theta_rr = theta * rr;
+            if (omega == 0) {
+                rys_roots(1, theta_rr, rw, nst_per_block, 0, 1); // NOTE(review): rys_roots is defined elsewhere; assumed to fill rw with (root, weight) pairs at stride nst_per_block, matching how rw is read below
+            } else if (omega > 0) {
+                double omega2 = omega * omega;
+                double theta_fac = omega2 / (omega2 + theta);
+                rys_roots(1, theta_fac*theta_rr, rw, nst_per_block, 0, 1);
+                double sqrt_theta_fac = sqrt(theta_fac);
+                for (int irys = 0; irys < 1; ++irys) {
+                    rw[ irys*2   *nst_per_block] *= theta_fac; // even slots (read as roots below) scaled by theta_fac
+                    rw[(irys*2+1)*nst_per_block] *= sqrt_theta_fac; // odd slots (read as weights below) scaled by sqrt(theta_fac)
+                }
+            } else {
+                double omega2 = omega * omega;
+                double theta_fac = omega2 / (omega2 + theta);
+                double *rw1 = rw + 2*nst_per_block; // second set stored right after the first (root, weight) pair
+                rys_roots(1, theta_rr, rw1, nst_per_block, 0, 1);
+                rys_roots(1, theta_fac*theta_rr, rw, nst_per_block, 0, 1);
+                double sqrt_theta_fac = -sqrt(theta_fac); // negated weights: the omega-scaled set is subtracted from the unscaled set held in rw1
+                for (int irys = 0; irys < 1; ++irys) {
+                    rw[ irys*2   *nst_per_block] *= theta_fac;
+                    rw[(irys*2+1)*nst_per_block] *= sqrt_theta_fac;
+                }
+            }
+            for (int irys = 0; irys < nroots; ++irys) { // covers both root sets when omega < 0
+                double wt = rw[(2*irys+1)*nst_per_block];
+                double rt = rw[ 2*irys   *nst_per_block];
+                double rt_aa = rt / (aij + ak);
+                double rt_ak = rt_aa * aij;
+                double cpx = xpq*rt_ak;
+                double trr_01x = cpx * 1; // x factors start from 1; fac1 is folded into the y factors and wt into the z factors
+                gout0 += trr_01x * fac1 * wt; // every gout term is a product of one x, one y and one z factor
+                double cpy = ypq*rt_ak;
+                double trr_01y = cpy * fac1;
+                gout1 += 1 * trr_01y * wt;
+                double cpz = zpq*rt_ak;
+                double trr_01z = cpz * wt;
+                gout2 += 1 * fac1 * trr_01z;
+            }
+        }
+        int *ao_loc = envs.ao_loc;
+        int k0 = ao_loc[ksh0] - ao_loc[nbas]; // column offset of ksh0 measured from shell nbas; presumably the k shells are stored after the nbas orbital shells - TODO confirm
+        double *eri_tensor = out_local + k0 + shl_pair_in_block * 1 * naux + ksh_in_block * 3; // 1 row per shell pair, 3 consecutive columns per ksh
+        eri_tensor[0*naux + 0] = gout0; // results overwrite (not accumulate into) the output
+        eri_tensor[0*naux + 1] = gout1;
+        eri_tensor[0*naux + 2] = gout2;
+    }
+}
+
+__device__
+void int3c2e_bdiv_101(double *out, Int3c2eEnvVars envs, BDiv3c2eBounds bounds) // Fills a 3x3 tile per (shell pair, ksh) task of this thread block into `out`; the 101 suffix presumably encodes angular momenta (li, lj, lk) = (1, 0, 1) - TODO confirm against the code generator.
+{
+    // For better load balance, consume blocks in the reversed order
+    int sp_block_id = gridDim.x - blockIdx.x - 1; // grid x selects the shell-pair block
+    int ksh_block_id = gridDim.y - blockIdx.y - 1; // grid y selects the ksh block
+    int shl_pair0 = bounds.shl_pair_offsets[sp_block_id]; // [shl_pair0, shl_pair1): shell pairs handled by this block
+    int shl_pair1 = bounds.shl_pair_offsets[sp_block_id+1];
+    int ksh0 = bounds.ksh_offsets[ksh_block_id]; // [ksh0, ksh1): k shells handled by this block
+    int ksh1 = bounds.ksh_offsets[ksh_block_id+1];
+    int nksh = ksh1 - ksh0;
+    int nshl_pair = shl_pair1 - shl_pair0;
+    int bas_ij0 = bounds.bas_ij_idx[shl_pair0]; // packed pair index, decoded below as ish*nbas + jsh
+    int nbas = envs.nbas;
+    int ish0 = bas_ij0 / nbas;
+    int jsh0 = bas_ij0 % nbas;
+
+    int nst_per_block = blockDim.x;
+    int st_id = threadIdx.x;
+    int *bas = envs.bas;
+    int iprim = bas[ish0*BAS_SLOTS+NPRIM_OF]; // primitive counts are read once from the block's first pair / first ksh and reused for every task,
+    int jprim = bas[jsh0*BAS_SLOTS+NPRIM_OF]; // so this presumably relies on all shells of a block sharing them - TODO confirm with the host-side blocking
+    int kprim = bas[ksh0*BAS_SLOTS+NPRIM_OF];
+    int ijprim = iprim * jprim;
+    int ijkprim = ijprim * kprim;
+    int nroots = 2;
+    double *env = envs.env;
+    double omega = env[PTR_RANGE_OMEGA];
+    if (omega < 0) {
+        nroots *= 2; // negative omega keeps two root/weight sets (rw and rw1 below); both are summed in the root loop
+    }
+    extern __shared__ double rw_buffer[];
+    double *rw = rw_buffer + st_id; // per-thread slots in shared memory; consecutive entries of one thread are nst_per_block apart
+    double *rjri = rw + nst_per_block * nroots*2; // 4 further per-thread slots placed after the nroots (root, weight) pairs
+    int naux = bounds.naux;
+    double *out_local = out + bounds.ao_pair_loc[sp_block_id] * naux; // first output row of this shell-pair block; rows are naux doubles apart
+
+    int nst = nshl_pair * nksh;
+    for (int ijk_idx = st_id; ijk_idx < nst; ijk_idx += nst_per_block) { // each thread takes tasks st_id, st_id + blockDim.x, ...
+        int shl_pair_in_block = ijk_idx / nksh; // task index = shl_pair_in_block * nksh + ksh_in_block
+        int ksh_in_block = ijk_idx % nksh;
+        int ksh = ksh_in_block + ksh0;
+        int bas_ij = bounds.bas_ij_idx[shl_pair_in_block + shl_pair0];
+        int ish = bas_ij / nbas;
+        int jsh = bas_ij % nbas;
+        double *expi = env + bas[ish*BAS_SLOTS+PTR_EXP];
+        double *expj = env + bas[jsh*BAS_SLOTS+PTR_EXP];
+        double *expk = env + bas[ksh*BAS_SLOTS+PTR_EXP];
+        double *ci = env + bas[ish*BAS_SLOTS+PTR_COEFF];
+        double *cj = env + bas[jsh*BAS_SLOTS+PTR_COEFF];
+        double *ck = env + bas[ksh*BAS_SLOTS+PTR_COEFF];
+        double *ri = env + bas[ish*BAS_SLOTS+PTR_BAS_COORD];
+        double *rj = env + bas[jsh*BAS_SLOTS+PTR_BAS_COORD];
+        double *rk = env + bas[ksh*BAS_SLOTS+PTR_BAS_COORD];
+        double xjxi = rj[0] - ri[0];
+        double yjyi = rj[1] - ri[1];
+        double zjzi = rj[2] - ri[2];
+        double rr_ij = xjxi*xjxi + yjyi*yjyi + zjzi*zjzi;
+        rjri[0*nst_per_block] = rj[0] - ri[0]; // rjri[0..2] = rj - ri, rjri[3] = |rj - ri|^2
+        rjri[1*nst_per_block] = rj[1] - ri[1];
+        rjri[2*nst_per_block] = rj[2] - ri[2];
+        rjri[3*nst_per_block] = rr_ij;
+        double gout0 = 0; // gout0..gout8: one accumulator per element of the 3x3 output tile written at the end of the task
+        double gout1 = 0;
+        double gout2 = 0;
+        double gout3 = 0;
+        double gout4 = 0;
+        double gout5 = 0;
+        double gout6 = 0;
+        double gout7 = 0;
+        double gout8 = 0;
+        for (int ijkp = 0; ijkp < ijkprim; ++ijkp) { // flattened loop over primitive triples (ip, jp, kp), kp fastest
+            int ijp = ijkp / kprim;
+            int kp = ijkp % kprim;
+            int ip = ijp / jprim;
+            int jp = ijp % jprim;
+            double ai = expi[ip];
+            double aj = expj[jp];
+            double ak = expk[kp];
+            double aij = ai + aj;
+            double cijk = ci[ip] * cj[jp] * ck[kp];
+            double fac = PI_FAC * cijk / (aij*ak*sqrt(aij+ak));
+            double aj_aij = aj / aij;
+            double theta_ij = ai * aj_aij;
+            double Kab = theta_ij * rjri[3*nst_per_block];
+            double fac1 = fac * exp(-Kab); // prefactor damped by exp(-ai*aj/aij * |rj - ri|^2)
+            double xij = rjri[0*nst_per_block] * aj_aij + ri[0]; // (xij, yij, zij) = ri + aj/aij * (rj - ri), the exponent-weighted center of ri and rj
+            double yij = rjri[1*nst_per_block] * aj_aij + ri[1];
+            double zij = rjri[2*nst_per_block] * aj_aij + ri[2];
+            double xpq = xij - rk[0]; // offset of that center from rk
+            double ypq = yij - rk[1];
+            double zpq = zij - rk[2];
+            double rr = xpq * xpq + ypq * ypq + zpq * zpq;
+            double theta = aij * ak / (aij + ak);
+            double theta_rr = theta * rr;
+            if (omega == 0) {
+                rys_roots(2, theta_rr, rw, nst_per_block, 0, 1); // NOTE(review): rys_roots is defined elsewhere; assumed to fill rw with (root, weight) pairs at stride nst_per_block, matching how rw is read below
+            } else if (omega > 0) {
+                double omega2 = omega * omega;
+                double theta_fac = omega2 / (omega2 + theta);
+                rys_roots(2, theta_fac*theta_rr, rw, nst_per_block, 0, 1);
+                double sqrt_theta_fac = sqrt(theta_fac);
+                for (int irys = 0; irys < 2; ++irys) {
+                    rw[ irys*2   *nst_per_block] *= theta_fac; // even slots (read as roots below) scaled by theta_fac
+                    rw[(irys*2+1)*nst_per_block] *= sqrt_theta_fac; // odd slots (read as weights below) scaled by sqrt(theta_fac)
+                }
+            } else {
+                double omega2 = omega * omega;
+                double theta_fac = omega2 / (omega2 + theta);
+                double *rw1 = rw + 4*nst_per_block; // second set stored right after the first 2 (root, weight) pairs
+                rys_roots(2, theta_rr, rw1, nst_per_block, 0, 1);
+                rys_roots(2, theta_fac*theta_rr, rw, nst_per_block, 0, 1);
+                double sqrt_theta_fac = -sqrt(theta_fac); // negated weights: the omega-scaled set is subtracted from the unscaled set held in rw1
+                for (int irys = 0; irys < 2; ++irys) {
+                    rw[ irys*2   *nst_per_block] *= theta_fac;
+                    rw[(irys*2+1)*nst_per_block] *= sqrt_theta_fac;
+                }
+            }
+            for (int irys = 0; irys < nroots; ++irys) { // covers both root sets when omega < 0
+                double wt = rw[(2*irys+1)*nst_per_block];
+                double rt = rw[ 2*irys   *nst_per_block];
+                double rt_aa = rt / (aij + ak);
+                double b00 = .5 * rt_aa;
+                double rt_ak = rt_aa * aij;
+                double cpx = xpq*rt_ak;
+                double rt_aij = rt_aa * ak;
+                double c0x = rjri[0*nst_per_block] * aj_aij - xpq*rt_aij;
+                double trr_10x = c0x * 1; // x factors start from 1; fac1 is folded into the y factors and wt into the z factors
+                double trr_11x = cpx * trr_10x + 1*b00 * 1; // mixed term: trr_11 = cp * trr_10 + b00 * trr_00
+                gout0 += trr_11x * fac1 * wt; // every gout term is a product of one x, one y and one z factor
+                double trr_01x = cpx * 1;
+                double c0y = rjri[1*nst_per_block] * aj_aij - ypq*rt_aij;
+                double trr_10y = c0y * fac1;
+                gout1 += trr_01x * trr_10y * wt;
+                double c0z = rjri[2*nst_per_block] * aj_aij - zpq*rt_aij;
+                double trr_10z = c0z * wt;
+                gout2 += trr_01x * fac1 * trr_10z;
+                double cpy = ypq*rt_ak;
+                double trr_01y = cpy * fac1;
+                gout3 += trr_10x * trr_01y * wt;
+                double trr_11y = cpy * trr_10y + 1*b00 * fac1;
+                gout4 += 1 * trr_11y * wt;
+                gout5 += 1 * trr_01y * trr_10z;
+                double cpz = zpq*rt_ak;
+                double trr_01z = cpz * wt;
+                gout6 += trr_10x * fac1 * trr_01z;
+                gout7 += 1 * trr_10y * trr_01z;
+                double trr_11z = cpz * trr_10z + 1*b00 * wt;
+                gout8 += 1 * fac1 * trr_11z;
+            }
+        }
+        int *ao_loc = envs.ao_loc;
+        int k0 = ao_loc[ksh0] - ao_loc[nbas]; // column offset of ksh0 measured from shell nbas; presumably the k shells are stored after the nbas orbital shells - TODO confirm
+        double *eri_tensor = out_local + k0 + shl_pair_in_block * 3 * naux + ksh_in_block * 3; // 3 rows per shell pair, 3 consecutive columns per ksh
+        eri_tensor[0*naux + 0] = gout0; // tile is stored transposed relative to gout numbering: element (row r, column c) receives gout[3*c + r]
+        eri_tensor[0*naux + 1] = gout3;
+        eri_tensor[0*naux + 2] = gout6;
+        eri_tensor[1*naux + 0] = gout1;
+        eri_tensor[1*naux + 1] = gout4;
+        eri_tensor[1*naux + 2] = gout7;
+        eri_tensor[2*naux + 0] = gout2;
+        eri_tensor[2*naux + 1] = gout5;
+        eri_tensor[2*naux + 2] = gout8;
+    }
+}
+
+__device__
+void int3c2e_bdiv_111(double *out, Int3c2eEnvVars envs, BDiv3c2eBounds bounds)
+{
+    // For better load balance, consume blocks in the reversed order
+    int sp_block_id = gridDim.x - blockIdx.x - 1;
+    int ksh_block_id = gridDim.y - blockIdx.y - 1;
+    int shl_pair0 = bounds.shl_pair_offsets[sp_block_id];
+    int shl_pair1 = bounds.shl_pair_offsets[sp_block_id+1];
+    int ksh0 = bounds.ksh_offsets[ksh_block_id];
+    int ksh1 = bounds.ksh_offsets[ksh_block_id+1];
+    int nksh = ksh1 - ksh0;
+    int nshl_pair = shl_pair1 - shl_pair0;
+    int bas_ij0 = bounds.bas_ij_idx[shl_pair0];
+    int nbas = envs.nbas;
+    int ish0 = bas_ij0 / nbas;
+    int jsh0 = bas_ij0 % nbas;
+
+    int nst_per_block = blockDim.x;
+    int st_id = threadIdx.x;
+    int *bas = envs.bas;
+    int iprim = bas[ish0*BAS_SLOTS+NPRIM_OF];
+    int jprim = bas[jsh0*BAS_SLOTS+NPRIM_OF];
+    int kprim = bas[ksh0*BAS_SLOTS+NPRIM_OF];
+    int ijprim = iprim * jprim;
+    int ijkprim = ijprim * kprim;
+    int nroots = 2;
+    double *env = envs.env;
+    double omega = env[PTR_RANGE_OMEGA];
+    if (omega < 0) {
+        nroots *= 2;
+    }
+    extern __shared__ double rw_buffer[];
+    double *rw = rw_buffer + st_id;
+    double *rjri = rw + nst_per_block * nroots*2;
+    int naux = bounds.naux;
+    double *out_local = out + bounds.ao_pair_loc[sp_block_id] * naux;
+
+    int nst = nshl_pair * nksh;
+    for (int ijk_idx = st_id; ijk_idx < nst; ijk_idx += nst_per_block) {
+        int shl_pair_in_block = ijk_idx / nksh;
+        int ksh_in_block = ijk_idx % nksh;
+        int ksh = ksh_in_block + ksh0;
+        int bas_ij = bounds.bas_ij_idx[shl_pair_in_block + shl_pair0];
+        int ish = bas_ij / nbas;
+        int jsh = bas_ij % nbas;
+        double *expi = env + bas[ish*BAS_SLOTS+PTR_EXP];
+        double *expj = env + bas[jsh*BAS_SLOTS+PTR_EXP];
+        double *expk = env + bas[ksh*BAS_SLOTS+PTR_EXP];
+        double *ci = env + bas[ish*BAS_SLOTS+PTR_COEFF];
+        double *cj = env + bas[jsh*BAS_SLOTS+PTR_COEFF];
+        double *ck = env + bas[ksh*BAS_SLOTS+PTR_COEFF];
+        double *ri = env + bas[ish*BAS_SLOTS+PTR_BAS_COORD];
+        double *rj = env + bas[jsh*BAS_SLOTS+PTR_BAS_COORD];
+        double *rk = env + bas[ksh*BAS_SLOTS+PTR_BAS_COORD];
+        double xjxi = rj[0] - ri[0];
+        double yjyi = rj[1] - ri[1];
+        double zjzi = rj[2] - ri[2];
+        double rr_ij = xjxi*xjxi + yjyi*yjyi + zjzi*zjzi;
+        rjri[0*nst_per_block] = rj[0] - ri[0];
+        rjri[1*nst_per_block] = rj[1] - ri[1];
+        rjri[2*nst_per_block] = rj[2] - ri[2];
+        rjri[3*nst_per_block] = rr_ij;
+        double gout0 = 0;
+        double gout1 = 0;
+        double gout2 = 0;
+        double gout3 = 0;
+        double gout4 = 0;
+        double gout5 = 0;
+        double gout6 = 0;
+        double gout7 = 0;
+        double gout8 = 0;
+        double gout9 = 0;
+        double gout10 = 0;
+        double gout11 = 0;
+        double gout12 = 0;
+        double gout13 = 0;
+        double gout14 = 0;
+        double gout15 = 0;
+        double gout16 = 0;
+        double gout17 = 0;
+        double gout18 = 0;
+        double gout19 = 0;
+        double gout20 = 0;
+        double gout21 = 0;
+        double gout22 = 0;
+        double gout23 = 0;
+        double gout24 = 0;
+        double gout25 = 0;
+        double gout26 = 0;
+        for (int ijkp = 0; ijkp < ijkprim; ++ijkp) {
+            int ijp = ijkp / kprim;
+            int kp = ijkp % kprim;
+            int ip = ijp / jprim;
+            int jp = ijp % jprim;
+            double ai = expi[ip];
+            double aj = expj[jp];
+            double ak = expk[kp];
+            double aij = ai + aj;
+            double cijk = ci[ip] * cj[jp] * ck[kp];
+            double fac = PI_FAC * cijk / (aij*ak*sqrt(aij+ak));
+            double aj_aij = aj / aij;
+            double theta_ij = ai * aj_aij;
+            double Kab = theta_ij * rjri[3*nst_per_block];
+            double fac1 = fac * exp(-Kab);
+            double xij = rjri[0*nst_per_block] * aj_aij + ri[0];
+            double yij = rjri[1*nst_per_block] * aj_aij + ri[1];
+            double zij = rjri[2*nst_per_block] * aj_aij + ri[2];
+            double xpq = xij - rk[0];
+            double ypq = yij - rk[1];
+            double zpq = zij - rk[2];
+            double rr = xpq * xpq + ypq * ypq + zpq * zpq;
+            double theta = aij * ak / (aij + ak);
+            double theta_rr = theta * rr;
+            if (omega == 0) {
+                rys_roots(2, theta_rr, rw, nst_per_block, 0, 1);
+            } else if (omega > 0) {
+                double omega2 = omega * omega;
+                double theta_fac = omega2 / (omega2 + theta);
+                rys_roots(2, theta_fac*theta_rr, rw, nst_per_block, 0, 1);
+                double sqrt_theta_fac = sqrt(theta_fac);
+                for (int irys = 0; irys < 2; ++irys) {
+                    rw[ irys*2   *nst_per_block] *= theta_fac;
+                    rw[(irys*2+1)*nst_per_block] *= sqrt_theta_fac;
+                }
+            } else {
+                double omega2 = omega * omega;
+                double theta_fac = omega2 / (omega2 + theta);
+                double *rw1 = rw + 4*nst_per_block;
+                rys_roots(2, theta_rr, rw1, nst_per_block, 0, 1);
+                rys_roots(2, theta_fac*theta_rr, rw, nst_per_block, 0, 1);
+                double sqrt_theta_fac = -sqrt(theta_fac);
+                for (int irys = 0; irys < 2; ++irys) {
+                    rw[ irys*2   *nst_per_block] *= theta_fac;
+                    rw[(irys*2+1)*nst_per_block] *= sqrt_theta_fac;
+                }
+            }
+            for (int irys = 0; irys < nroots; ++irys) {
+                double wt = rw[(2*irys+1)*nst_per_block];
+                double rt = rw[ 2*irys   *nst_per_block];
+                double rt_aa = rt / (aij + ak);
+                double b00 = .5 * rt_aa;
+                double rt_ak = rt_aa * aij;
+                double cpx = xpq*rt_ak;
+                double rt_aij = rt_aa * ak;
+                double b10 = .5/aij * (1 - rt_aij);
+                double c0x = rjri[0*nst_per_block] * aj_aij - xpq*rt_aij;
+                double trr_10x = c0x * 1;
+                double trr_20x = c0x * trr_10x + 1*b10 * 1;
+                double trr_21x = cpx * trr_20x + 2*b00 * trr_10x;
+                double trr_11x = cpx * trr_10x + 1*b00 * 1;
+                double hrr_111x = trr_21x - xjxi * trr_11x;
+                gout0 += hrr_111x * fac1 * wt;
+                double trr_01x = cpx * 1;
+                double hrr_011x = trr_11x - xjxi * trr_01x;
+                double c0y = rjri[1*nst_per_block] * aj_aij - ypq*rt_aij;
+                double trr_10y = c0y * fac1;
+                gout1 += hrr_011x * trr_10y * wt;
+                double c0z = rjri[2*nst_per_block] * aj_aij - zpq*rt_aij;
+                double trr_10z = c0z * wt;
+                gout2 += hrr_011x * fac1 * trr_10z;
+                double hrr_010y = trr_10y - yjyi * fac1;
+                gout3 += trr_11x * hrr_010y * wt;
+                double trr_20y = c0y * trr_10y + 1*b10 * fac1;
+                double hrr_110y = trr_20y - yjyi * trr_10y;
+                gout4 += trr_01x * hrr_110y * wt;
+                gout5 += trr_01x * hrr_010y * trr_10z;
+                double hrr_010z = trr_10z - zjzi * wt;
+                gout6 += trr_11x * fac1 * hrr_010z;
+                gout7 += trr_01x * trr_10y * hrr_010z;
+                double trr_20z = c0z * trr_10z + 1*b10 * wt;
+                double hrr_110z = trr_20z - zjzi * trr_10z;
+                gout8 += trr_01x * fac1 * hrr_110z;
+                double hrr_110x = trr_20x - xjxi * trr_10x;
+                double cpy = ypq*rt_ak;
+                double trr_01y = cpy * fac1;
+                gout9 += hrr_110x * trr_01y * wt;
+                double hrr_010x = trr_10x - xjxi * 1;
+                double trr_11y = cpy * trr_10y + 1*b00 * fac1;
+                gout10 += hrr_010x * trr_11y * wt;
+                gout11 += hrr_010x * trr_01y * trr_10z;
+                double hrr_011y = trr_11y - yjyi * trr_01y;
+                gout12 += trr_10x * hrr_011y * wt;
+                double trr_21y = cpy * trr_20y + 2*b00 * trr_10y;
+                double hrr_111y = trr_21y - yjyi * trr_11y;
+                gout13 += 1 * hrr_111y * wt;
+                gout14 += 1 * hrr_011y * trr_10z;
+                gout15 += trr_10x * trr_01y * hrr_010z;
+                gout16 += 1 * trr_11y * hrr_010z;
+                gout17 += 1 * trr_01y * hrr_110z;
+                double cpz = zpq*rt_ak;
+                double trr_01z = cpz * wt;
+                gout18 += hrr_110x * fac1 * trr_01z;
+                gout19 += hrr_010x * trr_10y * trr_01z;
+                double trr_11z = cpz * trr_10z + 1*b00 * wt;
+                gout20 += hrr_010x * fac1 * trr_11z;
+                gout21 += trr_10x * hrr_010y * trr_01z;
+                gout22 += 1 * hrr_110y * trr_01z;
+                gout23 += 1 * hrr_010y * trr_11z;
+                double hrr_011z = trr_11z - zjzi * trr_01z;
+                gout24 += trr_10x * fac1 * hrr_011z;
+                gout25 += 1 * trr_10y * hrr_011z;
+                double trr_21z = cpz * trr_20z + 2*b00 * trr_10z;
+                double hrr_111z = trr_21z - zjzi * trr_11z;
+                gout26 += 1 * fac1 * hrr_111z;
+            }
+        }
+        int *ao_loc = envs.ao_loc;
+        int k0 = ao_loc[ksh0] - ao_loc[nbas];
+        double *eri_tensor = out_local + k0 + shl_pair_in_block * 9 * naux + ksh_in_block * 3;
+        eri_tensor[0*naux + 0] = gout0;
+        eri_tensor[0*naux + 1] = gout9;
+        eri_tensor[0*naux + 2] = gout18;
+        eri_tensor[1*naux + 0] = gout1;
+        eri_tensor[1*naux + 1] = gout10;
+        eri_tensor[1*naux + 2] = gout19;
+        eri_tensor[2*naux + 0] = gout2;
+        eri_tensor[2*naux + 1] = gout11;
+        eri_tensor[2*naux + 2] = gout20;
+        eri_tensor[3*naux + 0] = gout3;
+        eri_tensor[3*naux + 1] = gout12;
+        eri_tensor[3*naux + 2] = gout21;
+        eri_tensor[4*naux + 0] = gout4;
+        eri_tensor[4*naux + 1] = gout13;
+        eri_tensor[4*naux + 2] = gout22;
+        eri_tensor[5*naux + 0] = gout5;
+        eri_tensor[5*naux + 1] = gout14;
+        eri_tensor[5*naux + 2] = gout23;
+        eri_tensor[6*naux + 0] = gout6;
+        eri_tensor[6*naux + 1] = gout15;
+        eri_tensor[6*naux + 2] = gout24;
+        eri_tensor[7*naux + 0] = gout7;
+        eri_tensor[7*naux + 1] = gout16;
+        eri_tensor[7*naux + 2] = gout25;
+        eri_tensor[8*naux + 0] = gout8;
+        eri_tensor[8*naux + 1] = gout17;
+        eri_tensor[8*naux + 2] = gout26;
+    }
+}
+
+__device__ // Device routine: each thread accumulates an 18-value tile (6 rows x 3 columns) per (shell pair, k shell) task and stores it into `out`.
+void int3c2e_bdiv_201(double *out, Int3c2eEnvVars envs, BDiv3c2eBounds bounds) // NOTE(review): the name suggests a 3-center integral kernel for (li,lj,lk) = (2,0,1) -- confirm with the code generator
+{
+    // For better load balance, consume blocks in the reversed order
+    int sp_block_id = gridDim.x - blockIdx.x - 1; // grid x selects the shell-pair block
+    int ksh_block_id = gridDim.y - blockIdx.y - 1; // grid y selects the k-shell block
+    int shl_pair0 = bounds.shl_pair_offsets[sp_block_id]; // this block handles shell pairs [shl_pair0, shl_pair1)
+    int shl_pair1 = bounds.shl_pair_offsets[sp_block_id+1];
+    int ksh0 = bounds.ksh_offsets[ksh_block_id]; // ... combined with k shells [ksh0, ksh1)
+    int ksh1 = bounds.ksh_offsets[ksh_block_id+1];
+    int nksh = ksh1 - ksh0;
+    int nshl_pair = shl_pair1 - shl_pair0;
+    int bas_ij0 = bounds.bas_ij_idx[shl_pair0]; // packed pair index, decoded below as ish*nbas + jsh
+    int nbas = envs.nbas;
+    int ish0 = bas_ij0 / nbas;
+    int jsh0 = bas_ij0 % nbas;
+
+    int nst_per_block = blockDim.x; // number of tasks in flight, one per thread
+    int st_id = threadIdx.x;
+    int *bas = envs.bas;
+    int iprim = bas[ish0*BAS_SLOTS+NPRIM_OF]; // primitive counts are read once, from the first shell pair / first k shell of the block,
+    int jprim = bas[jsh0*BAS_SLOTS+NPRIM_OF]; // and reused for every task processed below
+    int kprim = bas[ksh0*BAS_SLOTS+NPRIM_OF]; // NOTE(review): assumes all shells grouped into one block share these counts -- confirm with the host-side batching
+    int ijprim = iprim * jprim;
+    int ijkprim = ijprim * kprim;
+    int nroots = 2;
+    double *env = envs.env;
+    double omega = env[PTR_RANGE_OMEGA];
+    if (omega < 0) {
+        nroots *= 2; // the omega < 0 branch below stores two sets of (root, weight) pairs
+    }
+    extern __shared__ double rw_buffer[];
+    double *rw = rw_buffer + st_id; // per-thread view of the shared buffer; consecutive entries are nst_per_block apart
+    double *rjri = rw + nst_per_block * nroots*2; // four more per-thread slots, placed right after the nroots (root, weight) pairs
+    int naux = bounds.naux; // row stride of the output (see the stores at the end)
+    double *out_local = out + bounds.ao_pair_loc[sp_block_id] * naux; // first output row belonging to this shell-pair block
+
+    int nst = nshl_pair * nksh; // number of (shell pair, k shell) tasks in this block
+    for (int ijk_idx = st_id; ijk_idx < nst; ijk_idx += nst_per_block) { // thread st_id takes tasks st_id, st_id + blockDim.x, ...
+        int shl_pair_in_block = ijk_idx / nksh; // the k shell is the fastest-varying part of the task index
+        int ksh_in_block = ijk_idx % nksh;
+        int ksh = ksh_in_block + ksh0;
+        int bas_ij = bounds.bas_ij_idx[shl_pair_in_block + shl_pair0];
+        int ish = bas_ij / nbas;
+        int jsh = bas_ij % nbas;
+        double *expi = env + bas[ish*BAS_SLOTS+PTR_EXP]; // per-primitive arrays located through the PTR_EXP slot (indexed by ip/jp/kp below)
+        double *expj = env + bas[jsh*BAS_SLOTS+PTR_EXP];
+        double *expk = env + bas[ksh*BAS_SLOTS+PTR_EXP];
+        double *ci = env + bas[ish*BAS_SLOTS+PTR_COEFF]; // per-primitive arrays located through the PTR_COEFF slot
+        double *cj = env + bas[jsh*BAS_SLOTS+PTR_COEFF];
+        double *ck = env + bas[ksh*BAS_SLOTS+PTR_COEFF];
+        double *ri = env + bas[ish*BAS_SLOTS+PTR_BAS_COORD]; // three coordinates are read from each of ri, rj, rk
+        double *rj = env + bas[jsh*BAS_SLOTS+PTR_BAS_COORD];
+        double *rk = env + bas[ksh*BAS_SLOTS+PTR_BAS_COORD];
+        double xjxi = rj[0] - ri[0];
+        double yjyi = rj[1] - ri[1];
+        double zjzi = rj[2] - ri[2];
+        double rr_ij = xjxi*xjxi + yjyi*yjyi + zjzi*zjzi; // squared distance between ri and rj
+        rjri[0*nst_per_block] = rj[0] - ri[0]; // same values as xjxi/yjyi/zjzi, kept in this thread's shared-memory slots
+        rjri[1*nst_per_block] = rj[1] - ri[1];
+        rjri[2*nst_per_block] = rj[2] - ri[2];
+        rjri[3*nst_per_block] = rr_ij;
+        double gout0 = 0; // gout0..gout17: accumulators for the 6 x 3 output tile of this task
+        double gout1 = 0;
+        double gout2 = 0;
+        double gout3 = 0;
+        double gout4 = 0;
+        double gout5 = 0;
+        double gout6 = 0;
+        double gout7 = 0;
+        double gout8 = 0;
+        double gout9 = 0;
+        double gout10 = 0;
+        double gout11 = 0;
+        double gout12 = 0;
+        double gout13 = 0;
+        double gout14 = 0;
+        double gout15 = 0;
+        double gout16 = 0;
+        double gout17 = 0;
+        for (int ijkp = 0; ijkp < ijkprim; ++ijkp) { // flattened loop over primitive triples (ip, jp, kp)
+            int ijp = ijkp / kprim;
+            int kp = ijkp % kprim; // kp varies fastest, then jp, then ip
+            int ip = ijp / jprim;
+            int jp = ijp % jprim;
+            double ai = expi[ip];
+            double aj = expj[jp];
+            double ak = expk[kp];
+            double aij = ai + aj;
+            double cijk = ci[ip] * cj[jp] * ck[kp];
+            double fac = PI_FAC * cijk / (aij*ak*sqrt(aij+ak)); // prefactor of this primitive triple
+            double aj_aij = aj / aij;
+            double theta_ij = ai * aj_aij;
+            double Kab = theta_ij * rjri[3*nst_per_block]; // ai*aj/(ai+aj) * |rj - ri|^2
+            double fac1 = fac * exp(-Kab);
+            double xij = rjri[0*nst_per_block] * aj_aij + ri[0]; // equals (ai*ri + aj*rj)/(ai+aj), the ai/aj-weighted center of i and j
+            double yij = rjri[1*nst_per_block] * aj_aij + ri[1];
+            double zij = rjri[2*nst_per_block] * aj_aij + ri[2];
+            double xpq = xij - rk[0]; // weighted i-j center minus rk
+            double ypq = yij - rk[1];
+            double zpq = zij - rk[2];
+            double rr = xpq * xpq + ypq * ypq + zpq * zpq;
+            double theta = aij * ak / (aij + ak);
+            double theta_rr = theta * rr; // argument handed to rys_roots
+            if (omega == 0) {
+                rys_roots(2, theta_rr, rw, nst_per_block, 0, 1); // the root loop below reads rw as (root, weight) pairs with entries nst_per_block apart
+            } else if (omega > 0) {
+                double omega2 = omega * omega;
+                double theta_fac = omega2 / (omega2 + theta);
+                rys_roots(2, theta_fac*theta_rr, rw, nst_per_block, 0, 1); // roots/weights for the argument scaled by theta_fac
+                double sqrt_theta_fac = sqrt(theta_fac);
+                for (int irys = 0; irys < 2; ++irys) {
+                    rw[ irys*2   *nst_per_block] *= theta_fac; // even entry (read as rt below) is scaled by theta_fac
+                    rw[(irys*2+1)*nst_per_block] *= sqrt_theta_fac; // odd entry (read as wt below) is scaled by sqrt(theta_fac)
+                }
+            } else {
+                double omega2 = omega * omega;
+                double theta_fac = omega2 / (omega2 + theta);
+                double *rw1 = rw + 4*nst_per_block; // second set of 2 (root, weight) pairs, stored after the first
+                rys_roots(2, theta_rr, rw1, nst_per_block, 0, 1); // set for the unscaled argument, used as returned
+                rys_roots(2, theta_fac*theta_rr, rw, nst_per_block, 0, 1); // set for the scaled argument
+                double sqrt_theta_fac = -sqrt(theta_fac); // negative sign: the scaled set is subtracted when both sets are summed in the root loop
+                for (int irys = 0; irys < 2; ++irys) {
+                    rw[ irys*2   *nst_per_block] *= theta_fac;
+                    rw[(irys*2+1)*nst_per_block] *= sqrt_theta_fac;
+                }
+            }
+            for (int irys = 0; irys < nroots; ++irys) { // nroots is 4 when omega < 0, so both stored sets are accumulated
+                double wt = rw[(2*irys+1)*nst_per_block];
+                double rt = rw[ 2*irys   *nst_per_block];
+                double rt_aa = rt / (aij + ak);
+                double b00 = .5 * rt_aa; // b00, b10, c0*, cp*: coefficients of the recurrences below
+                double rt_ak = rt_aa * aij;
+                double cpx = xpq*rt_ak;
+                double rt_aij = rt_aa * ak;
+                double b10 = .5/aij * (1 - rt_aij);
+                double c0x = rjri[0*nst_per_block] * aj_aij - xpq*rt_aij;
+                double trr_10x = c0x * 1; // trr_ab<axis>: index a is raised with c0/b10, index b with cp/b00; the x series starts from 1
+                double trr_20x = c0x * trr_10x + 1*b10 * 1;
+                double trr_21x = cpx * trr_20x + 2*b00 * trr_10x;
+                gout0 += trr_21x * fac1 * wt; // every gout term is an (x factor) * (y factor) * (z factor) product
+                double trr_11x = cpx * trr_10x + 1*b00 * 1;
+                double c0y = rjri[1*nst_per_block] * aj_aij - ypq*rt_aij;
+                double trr_10y = c0y * fac1; // the y series starts from fac1, so fac1 is carried by every y factor
+                gout1 += trr_11x * trr_10y * wt;
+                double c0z = rjri[2*nst_per_block] * aj_aij - zpq*rt_aij;
+                double trr_10z = c0z * wt; // the z series starts from wt, so the weight is carried by every z factor
+                gout2 += trr_11x * fac1 * trr_10z;
+                double trr_01x = cpx * 1;
+                double trr_20y = c0y * trr_10y + 1*b10 * fac1;
+                gout3 += trr_01x * trr_20y * wt;
+                gout4 += trr_01x * trr_10y * trr_10z;
+                double trr_20z = c0z * trr_10z + 1*b10 * wt;
+                gout5 += trr_01x * fac1 * trr_20z;
+                double cpy = ypq*rt_ak;
+                double trr_01y = cpy * fac1;
+                gout6 += trr_20x * trr_01y * wt;
+                double trr_11y = cpy * trr_10y + 1*b00 * fac1;
+                gout7 += trr_10x * trr_11y * wt;
+                gout8 += trr_10x * trr_01y * trr_10z;
+                double trr_21y = cpy * trr_20y + 2*b00 * trr_10y;
+                gout9 += 1 * trr_21y * wt;
+                gout10 += 1 * trr_11y * trr_10z;
+                gout11 += 1 * trr_01y * trr_20z;
+                double cpz = zpq*rt_ak;
+                double trr_01z = cpz * wt;
+                gout12 += trr_20x * fac1 * trr_01z;
+                gout13 += trr_10x * trr_10y * trr_01z;
+                double trr_11z = cpz * trr_10z + 1*b00 * wt;
+                gout14 += trr_10x * fac1 * trr_11z;
+                gout15 += 1 * trr_20y * trr_01z;
+                gout16 += 1 * trr_10y * trr_11z;
+                double trr_21z = cpz * trr_20z + 2*b00 * trr_10z;
+                gout17 += 1 * fac1 * trr_21z;
+            }
+        }
+        int *ao_loc = envs.ao_loc;
+        int k0 = ao_loc[ksh0] - ao_loc[nbas]; // column offset of ksh0 measured from ao_loc[nbas]; NOTE(review): presumably the k shells are stored after the first nbas shells -- confirm
+        double *eri_tensor = out_local + k0 + shl_pair_in_block * 6 * naux + ksh_in_block * 3; // 6 rows per shell pair, 3 columns per k shell
+        eri_tensor[0*naux + 0] = gout0; // accumulator gout[r + 6*c] is stored at row r, column c
+        eri_tensor[0*naux + 1] = gout6;
+        eri_tensor[0*naux + 2] = gout12;
+        eri_tensor[1*naux + 0] = gout1;
+        eri_tensor[1*naux + 1] = gout7;
+        eri_tensor[1*naux + 2] = gout13;
+        eri_tensor[2*naux + 0] = gout2;
+        eri_tensor[2*naux + 1] = gout8;
+        eri_tensor[2*naux + 2] = gout14;
+        eri_tensor[3*naux + 0] = gout3;
+        eri_tensor[3*naux + 1] = gout9;
+        eri_tensor[3*naux + 2] = gout15;
+        eri_tensor[4*naux + 0] = gout4;
+        eri_tensor[4*naux + 1] = gout10;
+        eri_tensor[4*naux + 2] = gout16;
+        eri_tensor[5*naux + 0] = gout5;
+        eri_tensor[5*naux + 1] = gout11;
+        eri_tensor[5*naux + 2] = gout17;
+    }
+}
+
+__device__ // Device routine: each thread accumulates a 54-value tile (18 rows x 3 columns) per (shell pair, k shell) task and stores it into `out`.
+void int3c2e_bdiv_211(double *out, Int3c2eEnvVars envs, BDiv3c2eBounds bounds) // NOTE(review): the name suggests a 3-center integral kernel for (li,lj,lk) = (2,1,1) -- confirm with the code generator
+{
+    // For better load balance, consume blocks in the reversed order
+    int sp_block_id = gridDim.x - blockIdx.x - 1; // grid x selects the shell-pair block
+    int ksh_block_id = gridDim.y - blockIdx.y - 1; // grid y selects the k-shell block
+    int shl_pair0 = bounds.shl_pair_offsets[sp_block_id]; // this block handles shell pairs [shl_pair0, shl_pair1)
+    int shl_pair1 = bounds.shl_pair_offsets[sp_block_id+1];
+    int ksh0 = bounds.ksh_offsets[ksh_block_id]; // ... combined with k shells [ksh0, ksh1)
+    int ksh1 = bounds.ksh_offsets[ksh_block_id+1];
+    int nksh = ksh1 - ksh0;
+    int nshl_pair = shl_pair1 - shl_pair0;
+    int bas_ij0 = bounds.bas_ij_idx[shl_pair0]; // packed pair index, decoded below as ish*nbas + jsh
+    int nbas = envs.nbas;
+    int ish0 = bas_ij0 / nbas;
+    int jsh0 = bas_ij0 % nbas;
+
+    int nst_per_block = blockDim.x; // number of tasks in flight, one per thread
+    int st_id = threadIdx.x;
+    int *bas = envs.bas;
+    int iprim = bas[ish0*BAS_SLOTS+NPRIM_OF]; // primitive counts are read once, from the first shell pair / first k shell of the block,
+    int jprim = bas[jsh0*BAS_SLOTS+NPRIM_OF]; // and reused for every task processed below
+    int kprim = bas[ksh0*BAS_SLOTS+NPRIM_OF]; // NOTE(review): assumes all shells grouped into one block share these counts -- confirm with the host-side batching
+    int ijprim = iprim * jprim;
+    int ijkprim = ijprim * kprim;
+    int nroots = 3;
+    double *env = envs.env;
+    double omega = env[PTR_RANGE_OMEGA];
+    if (omega < 0) {
+        nroots *= 2; // the omega < 0 branch below stores two sets of (root, weight) pairs
+    }
+    extern __shared__ double rw_buffer[];
+    double *rw = rw_buffer + st_id; // per-thread view of the shared buffer; consecutive entries are nst_per_block apart
+    double *rjri = rw + nst_per_block * nroots*2; // four more per-thread slots, placed right after the nroots (root, weight) pairs
+    int naux = bounds.naux; // row stride of the output (see the stores at the end)
+    double *out_local = out + bounds.ao_pair_loc[sp_block_id] * naux; // first output row belonging to this shell-pair block
+
+    int nst = nshl_pair * nksh; // number of (shell pair, k shell) tasks in this block
+    for (int ijk_idx = st_id; ijk_idx < nst; ijk_idx += nst_per_block) { // thread st_id takes tasks st_id, st_id + blockDim.x, ...
+        int shl_pair_in_block = ijk_idx / nksh; // the k shell is the fastest-varying part of the task index
+        int ksh_in_block = ijk_idx % nksh;
+        int ksh = ksh_in_block + ksh0;
+        int bas_ij = bounds.bas_ij_idx[shl_pair_in_block + shl_pair0];
+        int ish = bas_ij / nbas;
+        int jsh = bas_ij % nbas;
+        double *expi = env + bas[ish*BAS_SLOTS+PTR_EXP]; // per-primitive arrays located through the PTR_EXP slot (indexed by ip/jp/kp below)
+        double *expj = env + bas[jsh*BAS_SLOTS+PTR_EXP];
+        double *expk = env + bas[ksh*BAS_SLOTS+PTR_EXP];
+        double *ci = env + bas[ish*BAS_SLOTS+PTR_COEFF]; // per-primitive arrays located through the PTR_COEFF slot
+        double *cj = env + bas[jsh*BAS_SLOTS+PTR_COEFF];
+        double *ck = env + bas[ksh*BAS_SLOTS+PTR_COEFF];
+        double *ri = env + bas[ish*BAS_SLOTS+PTR_BAS_COORD]; // three coordinates are read from each of ri, rj, rk
+        double *rj = env + bas[jsh*BAS_SLOTS+PTR_BAS_COORD];
+        double *rk = env + bas[ksh*BAS_SLOTS+PTR_BAS_COORD];
+        double xjxi = rj[0] - ri[0]; // rj - ri, also used by the hrr_* transfer steps below
+        double yjyi = rj[1] - ri[1];
+        double zjzi = rj[2] - ri[2];
+        double rr_ij = xjxi*xjxi + yjyi*yjyi + zjzi*zjzi; // squared distance between ri and rj
+        rjri[0*nst_per_block] = rj[0] - ri[0]; // same values as xjxi/yjyi/zjzi, kept in this thread's shared-memory slots
+        rjri[1*nst_per_block] = rj[1] - ri[1];
+        rjri[2*nst_per_block] = rj[2] - ri[2];
+        rjri[3*nst_per_block] = rr_ij;
+        double gout0 = 0; // gout0..gout53: accumulators for the 18 x 3 output tile of this task
+        double gout1 = 0;
+        double gout2 = 0;
+        double gout3 = 0;
+        double gout4 = 0;
+        double gout5 = 0;
+        double gout6 = 0;
+        double gout7 = 0;
+        double gout8 = 0;
+        double gout9 = 0;
+        double gout10 = 0;
+        double gout11 = 0;
+        double gout12 = 0;
+        double gout13 = 0;
+        double gout14 = 0;
+        double gout15 = 0;
+        double gout16 = 0;
+        double gout17 = 0;
+        double gout18 = 0;
+        double gout19 = 0;
+        double gout20 = 0;
+        double gout21 = 0;
+        double gout22 = 0;
+        double gout23 = 0;
+        double gout24 = 0;
+        double gout25 = 0;
+        double gout26 = 0;
+        double gout27 = 0;
+        double gout28 = 0;
+        double gout29 = 0;
+        double gout30 = 0;
+        double gout31 = 0;
+        double gout32 = 0;
+        double gout33 = 0;
+        double gout34 = 0;
+        double gout35 = 0;
+        double gout36 = 0;
+        double gout37 = 0;
+        double gout38 = 0;
+        double gout39 = 0;
+        double gout40 = 0;
+        double gout41 = 0;
+        double gout42 = 0;
+        double gout43 = 0;
+        double gout44 = 0;
+        double gout45 = 0;
+        double gout46 = 0;
+        double gout47 = 0;
+        double gout48 = 0;
+        double gout49 = 0;
+        double gout50 = 0;
+        double gout51 = 0;
+        double gout52 = 0;
+        double gout53 = 0;
+        for (int ijkp = 0; ijkp < ijkprim; ++ijkp) { // flattened loop over primitive triples (ip, jp, kp)
+            int ijp = ijkp / kprim;
+            int kp = ijkp % kprim; // kp varies fastest, then jp, then ip
+            int ip = ijp / jprim;
+            int jp = ijp % jprim;
+            double ai = expi[ip];
+            double aj = expj[jp];
+            double ak = expk[kp];
+            double aij = ai + aj;
+            double cijk = ci[ip] * cj[jp] * ck[kp];
+            double fac = PI_FAC * cijk / (aij*ak*sqrt(aij+ak)); // prefactor of this primitive triple
+            double aj_aij = aj / aij;
+            double theta_ij = ai * aj_aij;
+            double Kab = theta_ij * rjri[3*nst_per_block]; // ai*aj/(ai+aj) * |rj - ri|^2
+            double fac1 = fac * exp(-Kab);
+            double xij = rjri[0*nst_per_block] * aj_aij + ri[0]; // equals (ai*ri + aj*rj)/(ai+aj), the ai/aj-weighted center of i and j
+            double yij = rjri[1*nst_per_block] * aj_aij + ri[1];
+            double zij = rjri[2*nst_per_block] * aj_aij + ri[2];
+            double xpq = xij - rk[0]; // weighted i-j center minus rk
+            double ypq = yij - rk[1];
+            double zpq = zij - rk[2];
+            double rr = xpq * xpq + ypq * ypq + zpq * zpq;
+            double theta = aij * ak / (aij + ak);
+            double theta_rr = theta * rr; // argument handed to rys_roots
+            if (omega == 0) {
+                rys_roots(3, theta_rr, rw, nst_per_block, 0, 1); // the root loop below reads rw as (root, weight) pairs with entries nst_per_block apart
+            } else if (omega > 0) {
+                double omega2 = omega * omega;
+                double theta_fac = omega2 / (omega2 + theta);
+                rys_roots(3, theta_fac*theta_rr, rw, nst_per_block, 0, 1); // roots/weights for the argument scaled by theta_fac
+                double sqrt_theta_fac = sqrt(theta_fac);
+                for (int irys = 0; irys < 3; ++irys) {
+                    rw[ irys*2   *nst_per_block] *= theta_fac; // even entry (read as rt below) is scaled by theta_fac
+                    rw[(irys*2+1)*nst_per_block] *= sqrt_theta_fac; // odd entry (read as wt below) is scaled by sqrt(theta_fac)
+                }
+            } else {
+                double omega2 = omega * omega;
+                double theta_fac = omega2 / (omega2 + theta);
+                double *rw1 = rw + 6*nst_per_block; // second set of 3 (root, weight) pairs, stored after the first
+                rys_roots(3, theta_rr, rw1, nst_per_block, 0, 1); // set for the unscaled argument, used as returned
+                rys_roots(3, theta_fac*theta_rr, rw, nst_per_block, 0, 1); // set for the scaled argument
+                double sqrt_theta_fac = -sqrt(theta_fac); // negative sign: the scaled set is subtracted when both sets are summed in the root loop
+                for (int irys = 0; irys < 3; ++irys) {
+                    rw[ irys*2   *nst_per_block] *= theta_fac;
+                    rw[(irys*2+1)*nst_per_block] *= sqrt_theta_fac;
+                }
+            }
+            for (int irys = 0; irys < nroots; ++irys) { // nroots is 6 when omega < 0, so both stored sets are accumulated
+                double wt = rw[(2*irys+1)*nst_per_block];
+                double rt = rw[ 2*irys   *nst_per_block];
+                double rt_aa = rt / (aij + ak);
+                double b00 = .5 * rt_aa; // b00, b10, c0*, cp*: coefficients of the recurrences below
+                double rt_ak = rt_aa * aij;
+                double cpx = xpq*rt_ak;
+                double rt_aij = rt_aa * ak;
+                double b10 = .5/aij * (1 - rt_aij);
+                double c0x = rjri[0*nst_per_block] * aj_aij - xpq*rt_aij;
+                double trr_10x = c0x * 1; // trr_ab<axis>: index a is raised with c0/b10, index b with cp/b00; the x series starts from 1
+                double trr_20x = c0x * trr_10x + 1*b10 * 1;
+                double trr_30x = c0x * trr_20x + 2*b10 * trr_10x;
+                double trr_31x = cpx * trr_30x + 3*b00 * trr_20x;
+                double trr_21x = cpx * trr_20x + 2*b00 * trr_10x;
+                double hrr_211x = trr_31x - xjxi * trr_21x; // hrr_a1c<axis> = trr_(a+1)c - (rj - ri) * trr_ac: moves one unit from the first index to the middle one
+                gout0 += hrr_211x * fac1 * wt; // every gout term is an (x factor) * (y factor) * (z factor) product
+                double trr_11x = cpx * trr_10x + 1*b00 * 1;
+                double hrr_111x = trr_21x - xjxi * trr_11x;
+                double c0y = rjri[1*nst_per_block] * aj_aij - ypq*rt_aij;
+                double trr_10y = c0y * fac1; // the y series starts from fac1, so fac1 is carried by every y factor
+                gout1 += hrr_111x * trr_10y * wt;
+                double c0z = rjri[2*nst_per_block] * aj_aij - zpq*rt_aij;
+                double trr_10z = c0z * wt; // the z series starts from wt, so the weight is carried by every z factor
+                gout2 += hrr_111x * fac1 * trr_10z;
+                double trr_01x = cpx * 1;
+                double hrr_011x = trr_11x - xjxi * trr_01x;
+                double trr_20y = c0y * trr_10y + 1*b10 * fac1;
+                gout3 += hrr_011x * trr_20y * wt;
+                gout4 += hrr_011x * trr_10y * trr_10z;
+                double trr_20z = c0z * trr_10z + 1*b10 * wt;
+                gout5 += hrr_011x * fac1 * trr_20z;
+                double hrr_010y = trr_10y - yjyi * fac1; // fac1 plays the role of trr_00y here
+                gout6 += trr_21x * hrr_010y * wt;
+                double hrr_110y = trr_20y - yjyi * trr_10y;
+                gout7 += trr_11x * hrr_110y * wt;
+                gout8 += trr_11x * hrr_010y * trr_10z;
+                double trr_30y = c0y * trr_20y + 2*b10 * trr_10y;
+                double hrr_210y = trr_30y - yjyi * trr_20y;
+                gout9 += trr_01x * hrr_210y * wt;
+                gout10 += trr_01x * hrr_110y * trr_10z;
+                gout11 += trr_01x * hrr_010y * trr_20z;
+                double hrr_010z = trr_10z - zjzi * wt; // wt plays the role of trr_00z here
+                gout12 += trr_21x * fac1 * hrr_010z;
+                gout13 += trr_11x * trr_10y * hrr_010z;
+                double hrr_110z = trr_20z - zjzi * trr_10z;
+                gout14 += trr_11x * fac1 * hrr_110z;
+                gout15 += trr_01x * trr_20y * hrr_010z;
+                gout16 += trr_01x * trr_10y * hrr_110z;
+                double trr_30z = c0z * trr_20z + 2*b10 * trr_10z;
+                double hrr_210z = trr_30z - zjzi * trr_20z;
+                gout17 += trr_01x * fac1 * hrr_210z;
+                double hrr_210x = trr_30x - xjxi * trr_20x;
+                double cpy = ypq*rt_ak;
+                double trr_01y = cpy * fac1;
+                gout18 += hrr_210x * trr_01y * wt;
+                double hrr_110x = trr_20x - xjxi * trr_10x;
+                double trr_11y = cpy * trr_10y + 1*b00 * fac1;
+                gout19 += hrr_110x * trr_11y * wt;
+                gout20 += hrr_110x * trr_01y * trr_10z;
+                double hrr_010x = trr_10x - xjxi * 1; // 1 plays the role of trr_00x here
+                double trr_21y = cpy * trr_20y + 2*b00 * trr_10y;
+                gout21 += hrr_010x * trr_21y * wt;
+                gout22 += hrr_010x * trr_11y * trr_10z;
+                gout23 += hrr_010x * trr_01y * trr_20z;
+                double hrr_011y = trr_11y - yjyi * trr_01y;
+                gout24 += trr_20x * hrr_011y * wt;
+                double hrr_111y = trr_21y - yjyi * trr_11y;
+                gout25 += trr_10x * hrr_111y * wt;
+                gout26 += trr_10x * hrr_011y * trr_10z;
+                double trr_31y = cpy * trr_30y + 3*b00 * trr_20y;
+                double hrr_211y = trr_31y - yjyi * trr_21y;
+                gout27 += 1 * hrr_211y * wt;
+                gout28 += 1 * hrr_111y * trr_10z;
+                gout29 += 1 * hrr_011y * trr_20z;
+                gout30 += trr_20x * trr_01y * hrr_010z;
+                gout31 += trr_10x * trr_11y * hrr_010z;
+                gout32 += trr_10x * trr_01y * hrr_110z;
+                gout33 += 1 * trr_21y * hrr_010z;
+                gout34 += 1 * trr_11y * hrr_110z;
+                gout35 += 1 * trr_01y * hrr_210z;
+                double cpz = zpq*rt_ak;
+                double trr_01z = cpz * wt;
+                gout36 += hrr_210x * fac1 * trr_01z;
+                gout37 += hrr_110x * trr_10y * trr_01z;
+                double trr_11z = cpz * trr_10z + 1*b00 * wt;
+                gout38 += hrr_110x * fac1 * trr_11z;
+                gout39 += hrr_010x * trr_20y * trr_01z;
+                gout40 += hrr_010x * trr_10y * trr_11z;
+                double trr_21z = cpz * trr_20z + 2*b00 * trr_10z;
+                gout41 += hrr_010x * fac1 * trr_21z;
+                gout42 += trr_20x * hrr_010y * trr_01z;
+                gout43 += trr_10x * hrr_110y * trr_01z;
+                gout44 += trr_10x * hrr_010y * trr_11z;
+                gout45 += 1 * hrr_210y * trr_01z;
+                gout46 += 1 * hrr_110y * trr_11z;
+                gout47 += 1 * hrr_010y * trr_21z;
+                double hrr_011z = trr_11z - zjzi * trr_01z;
+                gout48 += trr_20x * fac1 * hrr_011z;
+                gout49 += trr_10x * trr_10y * hrr_011z;
+                double hrr_111z = trr_21z - zjzi * trr_11z;
+                gout50 += trr_10x * fac1 * hrr_111z;
+                gout51 += 1 * trr_20y * hrr_011z;
+                gout52 += 1 * trr_10y * hrr_111z;
+                double trr_31z = cpz * trr_30z + 3*b00 * trr_20z;
+                double hrr_211z = trr_31z - zjzi * trr_21z;
+                gout53 += 1 * fac1 * hrr_211z;
+            }
+        }
+        int *ao_loc = envs.ao_loc;
+        int k0 = ao_loc[ksh0] - ao_loc[nbas]; // column offset of ksh0 measured from ao_loc[nbas]; NOTE(review): presumably the k shells are stored after the first nbas shells -- confirm
+        double *eri_tensor = out_local + k0 + shl_pair_in_block * 18 * naux + ksh_in_block * 3; // 18 rows per shell pair, 3 columns per k shell
+        eri_tensor[0*naux + 0] = gout0; // accumulator gout[r + 18*c] is stored at row r, column c
+        eri_tensor[0*naux + 1] = gout18;
+        eri_tensor[0*naux + 2] = gout36;
+        eri_tensor[1*naux + 0] = gout1;
+        eri_tensor[1*naux + 1] = gout19;
+        eri_tensor[1*naux + 2] = gout37;
+        eri_tensor[2*naux + 0] = gout2;
+        eri_tensor[2*naux + 1] = gout20;
+        eri_tensor[2*naux + 2] = gout38;
+        eri_tensor[3*naux + 0] = gout3;
+        eri_tensor[3*naux + 1] = gout21;
+        eri_tensor[3*naux + 2] = gout39;
+        eri_tensor[4*naux + 0] = gout4;
+        eri_tensor[4*naux + 1] = gout22;
+        eri_tensor[4*naux + 2] = gout40;
+        eri_tensor[5*naux + 0] = gout5;
+        eri_tensor[5*naux + 1] = gout23;
+        eri_tensor[5*naux + 2] = gout41;
+        eri_tensor[6*naux + 0] = gout6;
+        eri_tensor[6*naux + 1] = gout24;
+        eri_tensor[6*naux + 2] = gout42;
+        eri_tensor[7*naux + 0] = gout7;
+        eri_tensor[7*naux + 1] = gout25;
+        eri_tensor[7*naux + 2] = gout43;
+        eri_tensor[8*naux + 0] = gout8;
+        eri_tensor[8*naux + 1] = gout26;
+        eri_tensor[8*naux + 2] = gout44;
+        eri_tensor[9*naux + 0] = gout9;
+        eri_tensor[9*naux + 1] = gout27;
+        eri_tensor[9*naux + 2] = gout45;
+        eri_tensor[10*naux + 0] = gout10;
+        eri_tensor[10*naux + 1] = gout28;
+        eri_tensor[10*naux + 2] = gout46;
+        eri_tensor[11*naux + 0] = gout11;
+        eri_tensor[11*naux + 1] = gout29;
+        eri_tensor[11*naux + 2] = gout47;
+        eri_tensor[12*naux + 0] = gout12;
+        eri_tensor[12*naux + 1] = gout30;
+        eri_tensor[12*naux + 2] = gout48;
+        eri_tensor[13*naux + 0] = gout13;
+        eri_tensor[13*naux + 1] = gout31;
+        eri_tensor[13*naux + 2] = gout49;
+        eri_tensor[14*naux + 0] = gout14;
+        eri_tensor[14*naux + 1] = gout32;
+        eri_tensor[14*naux + 2] = gout50;
+        eri_tensor[15*naux + 0] = gout15;
+        eri_tensor[15*naux + 1] = gout33;
+        eri_tensor[15*naux + 2] = gout51;
+        eri_tensor[16*naux + 0] = gout16;
+        eri_tensor[16*naux + 1] = gout34;
+        eri_tensor[16*naux + 2] = gout52;
+        eri_tensor[17*naux + 0] = gout17;
+        eri_tensor[17*naux + 1] = gout35;
+        eri_tensor[17*naux + 2] = gout53;
+    }
+}
+
+__device__  // Kernel body for one class of 3-center 2-electron integrals; "221" presumably encodes (li, lj, lk) = (2, 2, 1) -- confirm against the generator
+void int3c2e_bdiv_221(double *out, Int3c2eEnvVars envs, BDiv3c2eBounds bounds)  // out: result buffer, row stride bounds.naux
+{  // Thread layout: 64 tasks per block (st_id) x 4 cooperating threads per task (gout_id); assumes blockDim.x == 256 -- TODO confirm at launch site
+    int sp_block_id = gridDim.x - blockIdx.x - 1;  // blocks are consumed in reversed order
+    int ksh_block_id = gridDim.y - blockIdx.y - 1;
+    int shl_pair0 = bounds.shl_pair_offsets[sp_block_id];  // [shl_pair0, shl_pair1): shell pairs handled by this block
+    int shl_pair1 = bounds.shl_pair_offsets[sp_block_id+1];
+    int ksh0 = bounds.ksh_offsets[ksh_block_id];  // [ksh0, ksh1): k shells handled by this block
+    int ksh1 = bounds.ksh_offsets[ksh_block_id+1];
+    int nksh = ksh1 - ksh0;
+    int nshl_pair = shl_pair1 - shl_pair0;
+    int bas_ij0 = bounds.bas_ij_idx[shl_pair0];  // packed pair index, decoded below as ish * nbas + jsh
+    int nbas = envs.nbas;
+    int ish0 = bas_ij0 / nbas;
+    int jsh0 = bas_ij0 % nbas;
+
+    int thread_id = threadIdx.x;
+    int st_id = thread_id % 64;  // task slot inside the block
+    int gout_id = thread_id / 64;  // which of the 4 threads of a task this is
+    int *bas = envs.bas;
+    int iprim = bas[ish0*BAS_SLOTS+NPRIM_OF];  // primitive counts are read once, from the first pair / first k shell of the block;
+    int jprim = bas[jsh0*BAS_SLOTS+NPRIM_OF];  // presumably all shells of a block share the same contraction length -- verify against caller
+    int kprim = bas[ksh0*BAS_SLOTS+NPRIM_OF];
+    int ijprim = iprim * jprim;
+    int ijkprim = ijprim * kprim;
+    int nroots = 3;
+    double *env = envs.env;
+    double omega = env[PTR_RANGE_OMEGA];  // selects one of three branches below (== 0, > 0, < 0)
+    extern __shared__ double rw_cache[];  // dynamic shared memory; every per-task array below is strided by 64
+    double *rw = rw_cache + st_id;  // read below as (root, weight) pairs: rw[irys*128] and rw[irys*128+64]
+    double *gx = rw + nroots * 128;  // gx, gy, gz: 18 slots per task each (18 * 64 = 1152)
+    double *gy = gx + 1152;
+    double *gz = gy + 1152;
+    double *Rpq = gz + 1152;  // 3 slots: xpq, ypq, zpq
+    double *rjri = Rpq + 192;  // 4 slots: rj - ri (x, y, z) and its squared norm
+    int naux = bounds.naux;
+    double *out_local = out + bounds.ao_pair_loc[sp_block_id] * naux;
+
+    if (gout_id == 0) {
+        gx[0] = 1.;  // seed of the x recurrence; gy[0] and gz[0] are seeded per primitive / per root below
+    }
+
+    int nst = nshl_pair * nksh;  // number of (shell pair, k shell) tasks in this block
+    for (int ijk_idx = st_id; ijk_idx < nst+st_id; ijk_idx += 64) {  // same trip count for every thread, so each __syncthreads() below is reached uniformly
+        int shl_pair_in_block = ijk_idx / nksh;
+        int ksh_in_block = ijk_idx % nksh;
+        if (ijk_idx >= nst) {  // padding task: reuse pair 0 and zero the x seed so every accumulated product vanishes; nothing is stored for it
+            shl_pair_in_block = 0;
+            if (gout_id == 0) {
+                gx[0] = 0.;
+            }
+        }
+        int ksh = ksh_in_block + ksh0;
+        int bas_ij = bounds.bas_ij_idx[shl_pair_in_block + shl_pair0];
+        int ish = bas_ij / nbas;
+        int jsh = bas_ij % nbas;
+        double *expi = env + bas[ish*BAS_SLOTS+PTR_EXP];
+        double *expj = env + bas[jsh*BAS_SLOTS+PTR_EXP];
+        double *expk = env + bas[ksh*BAS_SLOTS+PTR_EXP];
+        double *ci = env + bas[ish*BAS_SLOTS+PTR_COEFF];
+        double *cj = env + bas[jsh*BAS_SLOTS+PTR_COEFF];
+        double *ck = env + bas[ksh*BAS_SLOTS+PTR_COEFF];
+        double *ri = env + bas[ish*BAS_SLOTS+PTR_BAS_COORD];
+        double *rj = env + bas[jsh*BAS_SLOTS+PTR_BAS_COORD];
+        double *rk = env + bas[ksh*BAS_SLOTS+PTR_BAS_COORD];
+        if (gout_id == 0) {  // one thread per task publishes rj - ri for the other three
+            double xjxi = rj[0] - ri[0];
+            double yjyi = rj[1] - ri[1];
+            double zjzi = rj[2] - ri[2];
+            double rr_ij = xjxi*xjxi + yjyi*yjyi + zjzi*zjzi;
+            rjri[0] = xjxi;
+            rjri[64] = yjyi;
+            rjri[128] = zjzi;
+            rjri[192] = rr_ij;
+        }
+        double gout0 = 0;  // 27 per-thread accumulators; 4 threads x 27 = 108 = 36 rows x 3 columns stored below
+        double gout1 = 0;
+        double gout2 = 0;
+        double gout3 = 0;
+        double gout4 = 0;
+        double gout5 = 0;
+        double gout6 = 0;
+        double gout7 = 0;
+        double gout8 = 0;
+        double gout9 = 0;
+        double gout10 = 0;
+        double gout11 = 0;
+        double gout12 = 0;
+        double gout13 = 0;
+        double gout14 = 0;
+        double gout15 = 0;
+        double gout16 = 0;
+        double gout17 = 0;
+        double gout18 = 0;
+        double gout19 = 0;
+        double gout20 = 0;
+        double gout21 = 0;
+        double gout22 = 0;
+        double gout23 = 0;
+        double gout24 = 0;
+        double gout25 = 0;
+        double gout26 = 0;
+        double s0, s1, s2;  // scratch values for the recurrences
+        for (int ijkp = 0; ijkp < ijkprim; ++ijkp) {  // flattened loop over primitive triples (ip, jp, kp)
+            int ijp = ijkp / kprim;
+            int kp = ijkp % kprim;
+            int ip = ijp / jprim;
+            int jp = ijp % jprim;
+            double ai = expi[ip];
+            double aj = expj[jp];
+            double ak = expk[kp];
+            double aij = ai + aj;
+            double aj_aij = aj / aij;
+            __syncthreads();  // makes rjri visible before it is read; also separates the previous primitive's reads of gy[0] from the write below
+            double xij = rjri[0] * aj_aij + ri[0];
+            double yij = rjri[64] * aj_aij + ri[1];
+            double zij = rjri[128] * aj_aij + ri[2];
+            double xpq = xij - rk[0];
+            double ypq = yij - rk[1];
+            double zpq = zij - rk[2];
+            if (gout_id == 0) {
+                double cijk = ci[ip] * cj[jp] * ck[kp];
+                double fac = PI_FAC * cijk / (aij*ak*sqrt(aij+ak));
+                double theta_ij = ai * aj_aij;
+                double Kab = theta_ij * rjri[192];
+                gy[0] = fac * exp(-Kab);  // the primitive prefactor is carried by the y seed
+                Rpq[0] = xpq;
+                Rpq[64] = ypq;
+                Rpq[128] = zpq;
+            }
+            double rr = xpq * xpq + ypq * ypq + zpq * zpq;
+            double theta = aij * ak / (aij + ak);
+            double theta_rr = theta * rr;
+            if (omega == 0) {
+                rys_roots(3, theta_rr, rw, 64, gout_id, 4);  // presumably fills 3 (root, weight) pairs at stride 64, work split over the 4 threads -- see rys_roots
+            } else if (omega > 0) {
+                double omega2 = omega * omega;
+                double theta_fac = omega2 / (omega2 + theta);
+                rys_roots(3, theta_fac*theta_rr, rw, 64, gout_id, 4);
+                __syncthreads();  // wait for the rys_roots output before rescaling it
+                double sqrt_theta_fac = sqrt(theta_fac);
+                for (int irys = gout_id; irys < 3; irys+=4) {  // gout_id 0..2 each rescale one pair
+                    rw[ irys*2   *64] *= theta_fac;  // root
+                    rw[(irys*2+1)*64] *= sqrt_theta_fac;  // weight
+                }
+            } else {
+                double omega2 = omega * omega;
+                double theta_fac = omega2 / (omega2 + theta);
+                double *rw1 = rw + 384;  // NOTE(review): with nroots == 3 this is the same address as gx, and the root loop below visits only 3 pairs; sibling kernels double nroots when omega < 0 -- confirm this branch is unreachable here or fix
+                rys_roots(3, theta_rr, rw1, 64, gout_id, 4);
+                rys_roots(3, theta_fac*theta_rr, rw, 64, gout_id, 4);
+                __syncthreads();
+                double sqrt_theta_fac = -sqrt(theta_fac);  // weights of this root set are negated
+                for (int irys = gout_id; irys < 3; irys+=4) {
+                    rw[ irys*2   *64] *= theta_fac;
+                    rw[(irys*2+1)*64] *= sqrt_theta_fac;
+                }
+            }
+            for (int irys = 0; irys < nroots; ++irys) {
+                __syncthreads();  // roots/weights, gy[0] and Rpq are complete; previous root's reads of gx/gy/gz are finished
+                double rt = rw[irys*128];
+                double rt_aa = rt / (aij + ak);
+                double rt_aij = rt_aa * ak;
+                double b10 = .5/aij * (1 - rt_aij);
+                double rt_ak = rt_aa * aij;
+                double b00 = .5 * rt_aa;
+                for (int n = gout_id; n < 3; n += 4) {  // n = 0, 1, 2 selects x, y, z; gout_id 3 skips this loop
+                    if (n == 2) {
+                        gz[0] = rw[irys*128+64];  // the quadrature weight is carried by the z seed
+                    }
+                    double *_gx = gx + n * 1152;  // gx, gy or gz
+                    double xjxi = rjri[n * 64];
+                    double Rpa = xjxi * aj_aij;
+                    double c0x = Rpa - rt_aij * Rpq[n * 64];
+                    s0 = _gx[0];  // three-term recurrence in c0x / b10 fills slots 1..4 (offsets 64..256) from the seed
+                    s1 = c0x * s0;
+                    _gx[64] = s1;
+                    s2 = c0x * s1 + 1 * b10 * s0;
+                    _gx[128] = s2;
+                    s0 = s1;
+                    s1 = s2;
+                    s2 = c0x * s1 + 2 * b10 * s0;
+                    _gx[192] = s2;
+                    s0 = s1;
+                    s1 = s2;
+                    s2 = c0x * s1 + 3 * b10 * s0;
+                    _gx[256] = s2;
+                    double cpx = rt_ak * Rpq[n * 64];  // slots 9..13 (offsets 576..832): one cpx / b00 step applied to slots 0..4
+                    s0 = _gx[0];
+                    s1 = cpx * s0;
+                    _gx[576] = s1;
+                    s0 = _gx[64];
+                    s1 = cpx * s0;
+                    s1 += 1 * b00 * _gx[0];
+                    _gx[640] = s1;
+                    s0 = _gx[128];
+                    s1 = cpx * s0;
+                    s1 += 2 * b00 * _gx[64];
+                    _gx[704] = s1;
+                    s0 = _gx[192];
+                    s1 = cpx * s0;
+                    s1 += 3 * b00 * _gx[128];
+                    _gx[768] = s1;
+                    s0 = _gx[256];
+                    s1 = cpx * s0;
+                    s1 += 4 * b00 * _gx[192];
+                    _gx[832] = s1;
+                    s1 = _gx[256];  // from here on slots are rebuilt in place as differences s1 - xjxi * s0 (presumably the i -> j transfer step); statement order matters
+                    s0 = _gx[192];
+                    _gx[384] = s1 - xjxi * s0;
+                    s1 = s0;
+                    s0 = _gx[128];
+                    _gx[320] = s1 - xjxi * s0;
+                    s1 = s0;
+                    s0 = _gx[64];
+                    _gx[256] = s1 - xjxi * s0;
+                    s1 = s0;
+                    s0 = _gx[0];
+                    _gx[192] = s1 - xjxi * s0;
+                    s1 = _gx[384];
+                    s0 = _gx[320];
+                    _gx[512] = s1 - xjxi * s0;
+                    s1 = s0;
+                    s0 = _gx[256];
+                    _gx[448] = s1 - xjxi * s0;
+                    s1 = s0;
+                    s0 = _gx[192];
+                    _gx[384] = s1 - xjxi * s0;
+                    s1 = _gx[832];  // same difference scheme for the slots at offsets 576..1088
+                    s0 = _gx[768];
+                    _gx[960] = s1 - xjxi * s0;
+                    s1 = s0;
+                    s0 = _gx[704];
+                    _gx[896] = s1 - xjxi * s0;
+                    s1 = s0;
+                    s0 = _gx[640];
+                    _gx[832] = s1 - xjxi * s0;
+                    s1 = s0;
+                    s0 = _gx[576];
+                    _gx[768] = s1 - xjxi * s0;
+                    s1 = _gx[960];
+                    s0 = _gx[896];
+                    _gx[1088] = s1 - xjxi * s0;
+                    s1 = s0;
+                    s0 = _gx[832];
+                    _gx[1024] = s1 - xjxi * s0;
+                    s1 = s0;
+                    s0 = _gx[768];
+                    _gx[960] = s1 - xjxi * s0;
+                }
+                __syncthreads();  // gx, gy, gz tables are complete before any thread combines them
+                switch (gout_id) {  // each thread accumulates its own 27 products gx * gy * gz
+                case 0:
+                gout0 += gx[1088] * gy[0] * gz[0];
+                gout1 += gx[448] * gy[640] * gz[0];
+                gout2 += gx[448] * gy[0] * gz[640];
+                gout3 += gx[960] * gy[64] * gz[64];
+                gout4 += gx[384] * gy[576] * gz[128];
+                gout5 += gx[320] * gy[192] * gz[576];
+                gout6 += gx[832] * gy[192] * gz[64];
+                gout7 += gx[192] * gy[896] * gz[0];
+                gout8 += gx[192] * gy[256] * gz[640];
+                gout9 += gx[896] * gy[0] * gz[192];
+                gout10 += gx[256] * gy[640] * gz[192];
+                gout11 += gx[256] * gy[0] * gz[832];
+                gout12 += gx[768] * gy[64] * gz[256];
+                gout13 += gx[192] * gy[576] * gz[320];
+                gout14 += gx[128] * gy[384] * gz[576];
+                gout15 += gx[640] * gy[384] * gz[64];
+                gout16 += gx[0] * gy[1088] * gz[0];
+                gout17 += gx[0] * gy[448] * gz[640];
+                gout18 += gx[704] * gy[192] * gz[192];
+                gout19 += gx[64] * gy[832] * gz[192];
+                gout20 += gx[64] * gy[192] * gz[832];
+                gout21 += gx[576] * gy[256] * gz[256];
+                gout22 += gx[0] * gy[768] * gz[320];
+                gout23 += gx[128] * gy[0] * gz[960];
+                gout24 += gx[640] * gy[0] * gz[448];
+                gout25 += gx[0] * gy[704] * gz[384];
+                gout26 += gx[0] * gy[64] * gz[1024];
+                break;
+                case 1:
+                gout0 += gx[512] * gy[576] * gz[0];
+                gout1 += gx[448] * gy[64] * gz[576];
+                gout2 += gx[960] * gy[128] * gz[0];
+                gout3 += gx[384] * gy[640] * gz[64];
+                gout4 += gx[384] * gy[0] * gz[704];
+                gout5 += gx[832] * gy[256] * gz[0];
+                gout6 += gx[256] * gy[768] * gz[64];
+                gout7 += gx[192] * gy[320] * gz[576];
+                gout8 += gx[768] * gy[192] * gz[128];
+                gout9 += gx[320] * gy[576] * gz[192];
+                gout10 += gx[256] * gy[64] * gz[768];
+                gout11 += gx[768] * gy[128] * gz[192];
+                gout12 += gx[192] * gy[640] * gz[256];
+                gout13 += gx[192] * gy[0] * gz[896];
+                gout14 += gx[640] * gy[448] * gz[0];
+                gout15 += gx[64] * gy[960] * gz[64];
+                gout16 += gx[0] * gy[512] * gz[576];
+                gout17 += gx[576] * gy[384] * gz[128];
+                gout18 += gx[128] * gy[768] * gz[192];
+                gout19 += gx[64] * gy[256] * gz[768];
+                gout20 += gx[576] * gy[320] * gz[192];
+                gout21 += gx[0] * gy[832] * gz[256];
+                gout22 += gx[0] * gy[192] * gz[896];
+                gout23 += gx[640] * gy[64] * gz[384];
+                gout24 += gx[64] * gy[576] * gz[448];
+                gout25 += gx[0] * gy[128] * gz[960];
+                gout26 += gx[576] * gy[0] * gz[512];
+                break;
+                case 2:
+                gout0 += gx[512] * gy[0] * gz[576];
+                gout1 += gx[1024] * gy[0] * gz[64];
+                gout2 += gx[384] * gy[704] * gz[0];
+                gout3 += gx[384] * gy[64] * gz[640];
+                gout4 += gx[896] * gy[192] * gz[0];
+                gout5 += gx[256] * gy[832] * gz[0];
+                gout6 += gx[256] * gy[192] * gz[640];
+                gout7 += gx[768] * gy[256] * gz[64];
+                gout8 += gx[192] * gy[768] * gz[128];
+                gout9 += gx[320] * gy[0] * gz[768];
+                gout10 += gx[832] * gy[0] * gz[256];
+                gout11 += gx[192] * gy[704] * gz[192];
+                gout12 += gx[192] * gy[64] * gz[832];
+                gout13 += gx[704] * gy[384] * gz[0];
+                gout14 += gx[64] * gy[1024] * gz[0];
+                gout15 += gx[64] * gy[384] * gz[640];
+                gout16 += gx[576] * gy[448] * gz[64];
+                gout17 += gx[0] * gy[960] * gz[128];
+                gout18 += gx[128] * gy[192] * gz[768];
+                gout19 += gx[640] * gy[192] * gz[256];
+                gout20 += gx[0] * gy[896] * gz[192];
+                gout21 += gx[0] * gy[256] * gz[832];
+                gout22 += gx[704] * gy[0] * gz[384];
+                gout23 += gx[64] * gy[640] * gz[384];
+                gout24 += gx[64] * gy[0] * gz[1024];
+                gout25 += gx[576] * gy[64] * gz[448];
+                gout26 += gx[0] * gy[576] * gz[512];
+                break;
+                case 3:
+                gout0 += gx[1024] * gy[64] * gz[0];
+                gout1 += gx[448] * gy[576] * gz[64];
+                gout2 += gx[384] * gy[128] * gz[576];
+                gout3 += gx[960] * gy[0] * gz[128];
+                gout4 += gx[320] * gy[768] * gz[0];
+                gout5 += gx[256] * gy[256] * gz[576];
+                gout6 += gx[768] * gy[320] * gz[0];
+                gout7 += gx[192] * gy[832] * gz[64];
+                gout8 += gx[192] * gy[192] * gz[704];
+                gout9 += gx[832] * gy[64] * gz[192];
+                gout10 += gx[256] * gy[576] * gz[256];
+                gout11 += gx[192] * gy[128] * gz[768];
+                gout12 += gx[768] * gy[0] * gz[320];
+                gout13 += gx[128] * gy[960] * gz[0];
+                gout14 += gx[64] * gy[448] * gz[576];
+                gout15 += gx[576] * gy[512] * gz[0];
+                gout16 += gx[0] * gy[1024] * gz[64];
+                gout17 += gx[0] * gy[384] * gz[704];
+                gout18 += gx[640] * gy[256] * gz[192];
+                gout19 += gx[64] * gy[768] * gz[256];
+                gout20 += gx[0] * gy[320] * gz[768];
+                gout21 += gx[576] * gy[192] * gz[320];
+                gout22 += gx[128] * gy[576] * gz[384];
+                gout23 += gx[64] * gy[64] * gz[960];
+                gout24 += gx[576] * gy[128] * gz[384];
+                gout25 += gx[0] * gy[640] * gz[448];
+                gout26 += gx[0] * gy[0] * gz[1088];
+                break;
+                }
+            }
+        }
+        if (ijk_idx < nst) {  // padding tasks store nothing
+            int *ao_loc = envs.ao_loc;
+            int k0 = ao_loc[ksh0] - ao_loc[nbas];  // column offset relative to ao_loc[nbas]; presumably the k shells are stored after the nbas orbital shells -- verify
+            double *eri_tensor = out_local + shl_pair_in_block * 36 * naux + k0 + ksh_in_block * 3;  // 36 rows per shell pair, 3 columns per k shell
+            switch (gout_id) {  // thread g stores the entries whose flat index row*3 + col is congruent to g modulo 4
+            case 0:
+            eri_tensor[0*naux + 0] = gout0;
+            eri_tensor[1*naux + 1] = gout1;
+            eri_tensor[2*naux + 2] = gout2;
+            eri_tensor[4*naux + 0] = gout3;
+            eri_tensor[5*naux + 1] = gout4;
+            eri_tensor[6*naux + 2] = gout5;
+            eri_tensor[8*naux + 0] = gout6;
+            eri_tensor[9*naux + 1] = gout7;
+            eri_tensor[10*naux + 2] = gout8;
+            eri_tensor[12*naux + 0] = gout9;
+            eri_tensor[13*naux + 1] = gout10;
+            eri_tensor[14*naux + 2] = gout11;
+            eri_tensor[16*naux + 0] = gout12;
+            eri_tensor[17*naux + 1] = gout13;
+            eri_tensor[18*naux + 2] = gout14;
+            eri_tensor[20*naux + 0] = gout15;
+            eri_tensor[21*naux + 1] = gout16;
+            eri_tensor[22*naux + 2] = gout17;
+            eri_tensor[24*naux + 0] = gout18;
+            eri_tensor[25*naux + 1] = gout19;
+            eri_tensor[26*naux + 2] = gout20;
+            eri_tensor[28*naux + 0] = gout21;
+            eri_tensor[29*naux + 1] = gout22;
+            eri_tensor[30*naux + 2] = gout23;
+            eri_tensor[32*naux + 0] = gout24;
+            eri_tensor[33*naux + 1] = gout25;
+            eri_tensor[34*naux + 2] = gout26;
+            break;
+            case 1:
+            eri_tensor[0*naux + 1] = gout0;
+            eri_tensor[1*naux + 2] = gout1;
+            eri_tensor[3*naux + 0] = gout2;
+            eri_tensor[4*naux + 1] = gout3;
+            eri_tensor[5*naux + 2] = gout4;
+            eri_tensor[7*naux + 0] = gout5;
+            eri_tensor[8*naux + 1] = gout6;
+            eri_tensor[9*naux + 2] = gout7;
+            eri_tensor[11*naux + 0] = gout8;
+            eri_tensor[12*naux + 1] = gout9;
+            eri_tensor[13*naux + 2] = gout10;
+            eri_tensor[15*naux + 0] = gout11;
+            eri_tensor[16*naux + 1] = gout12;
+            eri_tensor[17*naux + 2] = gout13;
+            eri_tensor[19*naux + 0] = gout14;
+            eri_tensor[20*naux + 1] = gout15;
+            eri_tensor[21*naux + 2] = gout16;
+            eri_tensor[23*naux + 0] = gout17;
+            eri_tensor[24*naux + 1] = gout18;
+            eri_tensor[25*naux + 2] = gout19;
+            eri_tensor[27*naux + 0] = gout20;
+            eri_tensor[28*naux + 1] = gout21;
+            eri_tensor[29*naux + 2] = gout22;
+            eri_tensor[31*naux + 0] = gout23;
+            eri_tensor[32*naux + 1] = gout24;
+            eri_tensor[33*naux + 2] = gout25;
+            eri_tensor[35*naux + 0] = gout26;
+            break;
+            case 2:
+            eri_tensor[0*naux + 2] = gout0;
+            eri_tensor[2*naux + 0] = gout1;
+            eri_tensor[3*naux + 1] = gout2;
+            eri_tensor[4*naux + 2] = gout3;
+            eri_tensor[6*naux + 0] = gout4;
+            eri_tensor[7*naux + 1] = gout5;
+            eri_tensor[8*naux + 2] = gout6;
+            eri_tensor[10*naux + 0] = gout7;
+            eri_tensor[11*naux + 1] = gout8;
+            eri_tensor[12*naux + 2] = gout9;
+            eri_tensor[14*naux + 0] = gout10;
+            eri_tensor[15*naux + 1] = gout11;
+            eri_tensor[16*naux + 2] = gout12;
+            eri_tensor[18*naux + 0] = gout13;
+            eri_tensor[19*naux + 1] = gout14;
+            eri_tensor[20*naux + 2] = gout15;
+            eri_tensor[22*naux + 0] = gout16;
+            eri_tensor[23*naux + 1] = gout17;
+            eri_tensor[24*naux + 2] = gout18;
+            eri_tensor[26*naux + 0] = gout19;
+            eri_tensor[27*naux + 1] = gout20;
+            eri_tensor[28*naux + 2] = gout21;
+            eri_tensor[30*naux + 0] = gout22;
+            eri_tensor[31*naux + 1] = gout23;
+            eri_tensor[32*naux + 2] = gout24;
+            eri_tensor[34*naux + 0] = gout25;
+            eri_tensor[35*naux + 1] = gout26;
+            break;
+            case 3:
+            eri_tensor[1*naux + 0] = gout0;
+            eri_tensor[2*naux + 1] = gout1;
+            eri_tensor[3*naux + 2] = gout2;
+            eri_tensor[5*naux + 0] = gout3;
+            eri_tensor[6*naux + 1] = gout4;
+            eri_tensor[7*naux + 2] = gout5;
+            eri_tensor[9*naux + 0] = gout6;
+            eri_tensor[10*naux + 1] = gout7;
+            eri_tensor[11*naux + 2] = gout8;
+            eri_tensor[13*naux + 0] = gout9;
+            eri_tensor[14*naux + 1] = gout10;
+            eri_tensor[15*naux + 2] = gout11;
+            eri_tensor[17*naux + 0] = gout12;
+            eri_tensor[18*naux + 1] = gout13;
+            eri_tensor[19*naux + 2] = gout14;
+            eri_tensor[21*naux + 0] = gout15;
+            eri_tensor[22*naux + 1] = gout16;
+            eri_tensor[23*naux + 2] = gout17;
+            eri_tensor[25*naux + 0] = gout18;
+            eri_tensor[26*naux + 1] = gout19;
+            eri_tensor[27*naux + 2] = gout20;
+            eri_tensor[29*naux + 0] = gout21;
+            eri_tensor[30*naux + 1] = gout22;
+            eri_tensor[31*naux + 2] = gout23;
+            eri_tensor[33*naux + 0] = gout24;
+            eri_tensor[34*naux + 1] = gout25;
+            eri_tensor[35*naux + 2] = gout26;
+            break;
+            }
+        }
+    }
+}
+
+__device__  // Kernel body for one class of 3-center 2-electron integrals; "002" presumably encodes (li, lj, lk) = (0, 0, 2) -- confirm against the generator
+void int3c2e_bdiv_002(double *out, Int3c2eEnvVars envs, BDiv3c2eBounds bounds)  // out: result buffer, row stride bounds.naux
+{  // One thread per (shell pair, k shell) task; each task produces 6 values (1 row x 6 columns)
+    // For better load balance, consume blocks in the reversed order
+    int sp_block_id = gridDim.x - blockIdx.x - 1;
+    int ksh_block_id = gridDim.y - blockIdx.y - 1;
+    int shl_pair0 = bounds.shl_pair_offsets[sp_block_id];  // [shl_pair0, shl_pair1): shell pairs handled by this block
+    int shl_pair1 = bounds.shl_pair_offsets[sp_block_id+1];
+    int ksh0 = bounds.ksh_offsets[ksh_block_id];  // [ksh0, ksh1): k shells handled by this block
+    int ksh1 = bounds.ksh_offsets[ksh_block_id+1];
+    int nksh = ksh1 - ksh0;
+    int nshl_pair = shl_pair1 - shl_pair0;
+    int bas_ij0 = bounds.bas_ij_idx[shl_pair0];  // packed pair index, decoded below as ish * nbas + jsh
+    int nbas = envs.nbas;
+    int ish0 = bas_ij0 / nbas;
+    int jsh0 = bas_ij0 % nbas;
+
+    int nst_per_block = blockDim.x;  // tasks processed concurrently by the block; also the stride of the shared arrays
+    int st_id = threadIdx.x;
+    int *bas = envs.bas;
+    int iprim = bas[ish0*BAS_SLOTS+NPRIM_OF];  // primitive counts are read once, from the first pair / first k shell of the block;
+    int jprim = bas[jsh0*BAS_SLOTS+NPRIM_OF];  // presumably all shells of a block share the same contraction length -- verify against caller
+    int kprim = bas[ksh0*BAS_SLOTS+NPRIM_OF];
+    int ijprim = iprim * jprim;
+    int ijkprim = ijprim * kprim;
+    int nroots = 2;
+    double *env = envs.env;
+    double omega = env[PTR_RANGE_OMEGA];  // selects one of three branches below (== 0, > 0, < 0)
+    if (omega < 0) {
+        nroots *= 2;  // two root sets (rw and rw1) are generated and both are summed in the root loop
+    }
+    extern __shared__ double rw_buffer[];  // dynamic shared memory
+    double *rw = rw_buffer + st_id;  // this thread's column: (root, weight) pairs at stride nst_per_block
+    double *rjri = rw + nst_per_block * nroots*2;  // 4 values per thread: rj - ri (x, y, z) and its squared norm
+    int naux = bounds.naux;
+    double *out_local = out + bounds.ao_pair_loc[sp_block_id] * naux;
+
+    int nst = nshl_pair * nksh;  // number of tasks in this block
+    for (int ijk_idx = st_id; ijk_idx < nst; ijk_idx += nst_per_block) {
+        int shl_pair_in_block = ijk_idx / nksh;
+        int ksh_in_block = ijk_idx % nksh;
+        int ksh = ksh_in_block + ksh0;
+        int bas_ij = bounds.bas_ij_idx[shl_pair_in_block + shl_pair0];
+        int ish = bas_ij / nbas;
+        int jsh = bas_ij % nbas;
+        double *expi = env + bas[ish*BAS_SLOTS+PTR_EXP];
+        double *expj = env + bas[jsh*BAS_SLOTS+PTR_EXP];
+        double *expk = env + bas[ksh*BAS_SLOTS+PTR_EXP];
+        double *ci = env + bas[ish*BAS_SLOTS+PTR_COEFF];
+        double *cj = env + bas[jsh*BAS_SLOTS+PTR_COEFF];
+        double *ck = env + bas[ksh*BAS_SLOTS+PTR_COEFF];
+        double *ri = env + bas[ish*BAS_SLOTS+PTR_BAS_COORD];
+        double *rj = env + bas[jsh*BAS_SLOTS+PTR_BAS_COORD];
+        double *rk = env + bas[ksh*BAS_SLOTS+PTR_BAS_COORD];
+        double xjxi = rj[0] - ri[0];
+        double yjyi = rj[1] - ri[1];
+        double zjzi = rj[2] - ri[2];
+        double rr_ij = xjxi*xjxi + yjyi*yjyi + zjzi*zjzi;
+        rjri[0*nst_per_block] = rj[0] - ri[0];  // same values as xjxi, yjyi, zjzi above
+        rjri[1*nst_per_block] = rj[1] - ri[1];
+        rjri[2*nst_per_block] = rj[2] - ri[2];
+        rjri[3*nst_per_block] = rr_ij;
+        double gout0 = 0;  // accumulators, in order: second order in x, x*y, x*z, second order in y, y*z, second order in z
+        double gout1 = 0;
+        double gout2 = 0;
+        double gout3 = 0;
+        double gout4 = 0;
+        double gout5 = 0;
+        for (int ijkp = 0; ijkp < ijkprim; ++ijkp) {  // flattened loop over primitive triples (ip, jp, kp)
+            int ijp = ijkp / kprim;
+            int kp = ijkp % kprim;
+            int ip = ijp / jprim;
+            int jp = ijp % jprim;
+            double ai = expi[ip];
+            double aj = expj[jp];
+            double ak = expk[kp];
+            double aij = ai + aj;
+            double cijk = ci[ip] * cj[jp] * ck[kp];
+            double fac = PI_FAC * cijk / (aij*ak*sqrt(aij+ak));
+            double aj_aij = aj / aij;
+            double theta_ij = ai * aj_aij;
+            double Kab = theta_ij * rjri[3*nst_per_block];
+            double fac1 = fac * exp(-Kab);  // primitive prefactor
+            double xij = rjri[0*nst_per_block] * aj_aij + ri[0];
+            double yij = rjri[1*nst_per_block] * aj_aij + ri[1];
+            double zij = rjri[2*nst_per_block] * aj_aij + ri[2];
+            double xpq = xij - rk[0];
+            double ypq = yij - rk[1];
+            double zpq = zij - rk[2];
+            double rr = xpq * xpq + ypq * ypq + zpq * zpq;
+            double theta = aij * ak / (aij + ak);
+            double theta_rr = theta * rr;
+            if (omega == 0) {
+                rys_roots(2, theta_rr, rw, nst_per_block, 0, 1);  // presumably fills 2 (root, weight) pairs at stride nst_per_block -- see rys_roots
+            } else if (omega > 0) {
+                double omega2 = omega * omega;
+                double theta_fac = omega2 / (omega2 + theta);
+                rys_roots(2, theta_fac*theta_rr, rw, nst_per_block, 0, 1);
+                double sqrt_theta_fac = sqrt(theta_fac);
+                for (int irys = 0; irys < 2; ++irys) {
+                    rw[ irys*2   *nst_per_block] *= theta_fac;  // root
+                    rw[(irys*2+1)*nst_per_block] *= sqrt_theta_fac;  // weight
+                }
+            } else {
+                double omega2 = omega * omega;
+                double theta_fac = omega2 / (omega2 + theta);
+                double *rw1 = rw + 4*nst_per_block;  // second root set, placed right after the first 2 pairs
+                rys_roots(2, theta_rr, rw1, nst_per_block, 0, 1);
+                rys_roots(2, theta_fac*theta_rr, rw, nst_per_block, 0, 1);
+                double sqrt_theta_fac = -sqrt(theta_fac);  // negated weights: the rescaled set is subtracted from the rw1 set in the sum below
+                for (int irys = 0; irys < 2; ++irys) {
+                    rw[ irys*2   *nst_per_block] *= theta_fac;
+                    rw[(irys*2+1)*nst_per_block] *= sqrt_theta_fac;
+                }
+            }
+            for (int irys = 0; irys < nroots; ++irys) {  // 2 pairs, or 4 when omega < 0
+                double wt = rw[(2*irys+1)*nst_per_block];
+                double rt = rw[ 2*irys   *nst_per_block];
+                double rt_aa = rt / (aij + ak);
+                double rt_ak = rt_aa * aij;
+                double b01 = .5/ak * (1 - rt_ak);
+                double cpx = xpq*rt_ak;
+                double trr_01x = cpx * 1;  // x factors start from 1; fac1 is folded into the y factors and wt into the z factors
+                double trr_02x = cpx * trr_01x + 1*b01 * 1;
+                gout0 += trr_02x * fac1 * wt;
+                double cpy = ypq*rt_ak;
+                double trr_01y = cpy * fac1;
+                gout1 += trr_01x * trr_01y * wt;
+                double cpz = zpq*rt_ak;
+                double trr_01z = cpz * wt;
+                gout2 += trr_01x * fac1 * trr_01z;
+                double trr_02y = cpy * trr_01y + 1*b01 * fac1;
+                gout3 += 1 * trr_02y * wt;
+                gout4 += 1 * trr_01y * trr_01z;
+                double trr_02z = cpz * trr_01z + 1*b01 * wt;
+                gout5 += 1 * fac1 * trr_02z;
+            }
+        }
+        int *ao_loc = envs.ao_loc;
+        int k0 = ao_loc[ksh0] - ao_loc[nbas];  // column offset relative to ao_loc[nbas]; presumably the k shells are stored after the nbas orbital shells -- verify
+        double *eri_tensor = out_local + k0 + shl_pair_in_block * 1 * naux + ksh_in_block * 6;  // 1 row per shell pair, 6 columns per k shell
+        eri_tensor[0*naux + 0] = gout0;
+        eri_tensor[0*naux + 1] = gout1;
+        eri_tensor[0*naux + 2] = gout2;
+        eri_tensor[0*naux + 3] = gout3;
+        eri_tensor[0*naux + 4] = gout4;
+        eri_tensor[0*naux + 5] = gout5;
+    }
+}
+
+__device__
+void int3c2e_bdiv_102(double *out, Int3c2eEnvVars envs, BDiv3c2eBounds bounds)
+{
+    // For better load balance, consume blocks in the reversed order
+    int sp_block_id = gridDim.x - blockIdx.x - 1;
+    int ksh_block_id = gridDim.y - blockIdx.y - 1;
+    int shl_pair0 = bounds.shl_pair_offsets[sp_block_id];
+    int shl_pair1 = bounds.shl_pair_offsets[sp_block_id+1];
+    int ksh0 = bounds.ksh_offsets[ksh_block_id];
+    int ksh1 = bounds.ksh_offsets[ksh_block_id+1];
+    int nksh = ksh1 - ksh0;
+    int nshl_pair = shl_pair1 - shl_pair0;
+    int bas_ij0 = bounds.bas_ij_idx[shl_pair0];
+    int nbas = envs.nbas;
+    int ish0 = bas_ij0 / nbas;
+    int jsh0 = bas_ij0 % nbas;
+
+    int nst_per_block = blockDim.x;
+    int st_id = threadIdx.x;
+    int *bas = envs.bas;
+    int iprim = bas[ish0*BAS_SLOTS+NPRIM_OF];
+    int jprim = bas[jsh0*BAS_SLOTS+NPRIM_OF];
+    int kprim = bas[ksh0*BAS_SLOTS+NPRIM_OF];
+    int ijprim = iprim * jprim;
+    int ijkprim = ijprim * kprim;
+    int nroots = 2;
+    double *env = envs.env;
+    double omega = env[PTR_RANGE_OMEGA];
+    if (omega < 0) {
+        nroots *= 2;
+    }
+    extern __shared__ double rw_buffer[];
+    double *rw = rw_buffer + st_id;
+    double *rjri = rw + nst_per_block * nroots*2;
+    int naux = bounds.naux;
+    double *out_local = out + bounds.ao_pair_loc[sp_block_id] * naux;
+
+    int nst = nshl_pair * nksh;
+    for (int ijk_idx = st_id; ijk_idx < nst; ijk_idx += nst_per_block) {
+        int shl_pair_in_block = ijk_idx / nksh;
+        int ksh_in_block = ijk_idx % nksh;
+        int ksh = ksh_in_block + ksh0;
+        int bas_ij = bounds.bas_ij_idx[shl_pair_in_block + shl_pair0];
+        int ish = bas_ij / nbas;
+        int jsh = bas_ij % nbas;
+        double *expi = env + bas[ish*BAS_SLOTS+PTR_EXP];
+        double *expj = env + bas[jsh*BAS_SLOTS+PTR_EXP];
+        double *expk = env + bas[ksh*BAS_SLOTS+PTR_EXP];
+        double *ci = env + bas[ish*BAS_SLOTS+PTR_COEFF];
+        double *cj = env + bas[jsh*BAS_SLOTS+PTR_COEFF];
+        double *ck = env + bas[ksh*BAS_SLOTS+PTR_COEFF];
+        double *ri = env + bas[ish*BAS_SLOTS+PTR_BAS_COORD];
+        double *rj = env + bas[jsh*BAS_SLOTS+PTR_BAS_COORD];
+        double *rk = env + bas[ksh*BAS_SLOTS+PTR_BAS_COORD];
+        double xjxi = rj[0] - ri[0];
+        double yjyi = rj[1] - ri[1];
+        double zjzi = rj[2] - ri[2];
+        double rr_ij = xjxi*xjxi + yjyi*yjyi + zjzi*zjzi;
+        rjri[0*nst_per_block] = rj[0] - ri[0];
+        rjri[1*nst_per_block] = rj[1] - ri[1];
+        rjri[2*nst_per_block] = rj[2] - ri[2];
+        rjri[3*nst_per_block] = rr_ij;
+        double gout0 = 0;
+        double gout1 = 0;
+        double gout2 = 0;
+        double gout3 = 0;
+        double gout4 = 0;
+        double gout5 = 0;
+        double gout6 = 0;
+        double gout7 = 0;
+        double gout8 = 0;
+        double gout9 = 0;
+        double gout10 = 0;
+        double gout11 = 0;
+        double gout12 = 0;
+        double gout13 = 0;
+        double gout14 = 0;
+        double gout15 = 0;
+        double gout16 = 0;
+        double gout17 = 0;
+        for (int ijkp = 0; ijkp < ijkprim; ++ijkp) {
+            int ijp = ijkp / kprim;
+            int kp = ijkp % kprim;
+            int ip = ijp / jprim;
+            int jp = ijp % jprim;
+            double ai = expi[ip];
+            double aj = expj[jp];
+            double ak = expk[kp];
+            double aij = ai + aj;
+            double cijk = ci[ip] * cj[jp] * ck[kp];
+            double fac = PI_FAC * cijk / (aij*ak*sqrt(aij+ak));
+            double aj_aij = aj / aij;
+            double theta_ij = ai * aj_aij;
+            double Kab = theta_ij * rjri[3*nst_per_block];
+            double fac1 = fac * exp(-Kab);
+            double xij = rjri[0*nst_per_block] * aj_aij + ri[0];
+            double yij = rjri[1*nst_per_block] * aj_aij + ri[1];
+            double zij = rjri[2*nst_per_block] * aj_aij + ri[2];
+            double xpq = xij - rk[0];
+            double ypq = yij - rk[1];
+            double zpq = zij - rk[2];
+            double rr = xpq * xpq + ypq * ypq + zpq * zpq;
+            double theta = aij * ak / (aij + ak);
+            double theta_rr = theta * rr;
+            if (omega == 0) {
+                rys_roots(2, theta_rr, rw, nst_per_block, 0, 1);
+            } else if (omega > 0) {
+                double omega2 = omega * omega;
+                double theta_fac = omega2 / (omega2 + theta);
+                rys_roots(2, theta_fac*theta_rr, rw, nst_per_block, 0, 1);
+                double sqrt_theta_fac = sqrt(theta_fac);
+                for (int irys = 0; irys < 2; ++irys) {
+                    rw[ irys*2   *nst_per_block] *= theta_fac;
+                    rw[(irys*2+1)*nst_per_block] *= sqrt_theta_fac;
+                }
+            } else {
+                double omega2 = omega * omega;
+                double theta_fac = omega2 / (omega2 + theta);
+                double *rw1 = rw + 4*nst_per_block;
+                rys_roots(2, theta_rr, rw1, nst_per_block, 0, 1);
+                rys_roots(2, theta_fac*theta_rr, rw, nst_per_block, 0, 1);
+                double sqrt_theta_fac = -sqrt(theta_fac);
+                for (int irys = 0; irys < 2; ++irys) {
+                    rw[ irys*2   *nst_per_block] *= theta_fac;
+                    rw[(irys*2+1)*nst_per_block] *= sqrt_theta_fac;
+                }
+            }
+            for (int irys = 0; irys < nroots; ++irys) {
+                double wt = rw[(2*irys+1)*nst_per_block];
+                double rt = rw[ 2*irys   *nst_per_block];
+                double rt_aa = rt / (aij + ak);
+                double b00 = .5 * rt_aa;
+                double rt_ak = rt_aa * aij;
+                double b01 = .5/ak * (1 - rt_ak);
+                double cpx = xpq*rt_ak;
+                double rt_aij = rt_aa * ak;
+                double c0x = rjri[0*nst_per_block] * aj_aij - xpq*rt_aij;
+                double trr_10x = c0x * 1;
+                double trr_11x = cpx * trr_10x + 1*b00 * 1;
+                double trr_01x = cpx * 1;
+                double trr_12x = cpx * trr_11x + 1*b01 * trr_10x + 1*b00 * trr_01x;
+                gout0 += trr_12x * fac1 * wt;
+                double trr_02x = cpx * trr_01x + 1*b01 * 1;
+                double c0y = rjri[1*nst_per_block] * aj_aij - ypq*rt_aij;
+                double trr_10y = c0y * fac1;
+                gout1 += trr_02x * trr_10y * wt;
+                double c0z = rjri[2*nst_per_block] * aj_aij - zpq*rt_aij;
+                double trr_10z = c0z * wt;
+                gout2 += trr_02x * fac1 * trr_10z;
+                double cpy = ypq*rt_ak;
+                double trr_01y = cpy * fac1;
+                gout3 += trr_11x * trr_01y * wt;
+                double trr_11y = cpy * trr_10y + 1*b00 * fac1;
+                gout4 += trr_01x * trr_11y * wt;
+                gout5 += trr_01x * trr_01y * trr_10z;
+                double cpz = zpq*rt_ak;
+                double trr_01z = cpz * wt;
+                gout6 += trr_11x * fac1 * trr_01z;
+                gout7 += trr_01x * trr_10y * trr_01z;
+                double trr_11z = cpz * trr_10z + 1*b00 * wt;
+                gout8 += trr_01x * fac1 * trr_11z;
+                double trr_02y = cpy * trr_01y + 1*b01 * fac1;
+                gout9 += trr_10x * trr_02y * wt;
+                double trr_12y = cpy * trr_11y + 1*b01 * trr_10y + 1*b00 * trr_01y;
+                gout10 += 1 * trr_12y * wt;
+                gout11 += 1 * trr_02y * trr_10z;
+                gout12 += trr_10x * trr_01y * trr_01z;
+                gout13 += 1 * trr_11y * trr_01z;
+                gout14 += 1 * trr_01y * trr_11z;
+                double trr_02z = cpz * trr_01z + 1*b01 * wt;
+                gout15 += trr_10x * fac1 * trr_02z;
+                gout16 += 1 * trr_10y * trr_02z;
+                double trr_12z = cpz * trr_11z + 1*b01 * trr_10z + 1*b00 * trr_01z;
+                gout17 += 1 * fac1 * trr_12z;
+            }
+        }
+        int *ao_loc = envs.ao_loc;
+        int k0 = ao_loc[ksh0] - ao_loc[nbas];
+        double *eri_tensor = out_local + k0 + shl_pair_in_block * 3 * naux + ksh_in_block * 6;
+        eri_tensor[0*naux + 0] = gout0;
+        eri_tensor[0*naux + 1] = gout3;
+        eri_tensor[0*naux + 2] = gout6;
+        eri_tensor[0*naux + 3] = gout9;
+        eri_tensor[0*naux + 4] = gout12;
+        eri_tensor[0*naux + 5] = gout15;
+        eri_tensor[1*naux + 0] = gout1;
+        eri_tensor[1*naux + 1] = gout4;
+        eri_tensor[1*naux + 2] = gout7;
+        eri_tensor[1*naux + 3] = gout10;
+        eri_tensor[1*naux + 4] = gout13;
+        eri_tensor[1*naux + 5] = gout16;
+        eri_tensor[2*naux + 0] = gout2;
+        eri_tensor[2*naux + 1] = gout5;
+        eri_tensor[2*naux + 2] = gout8;
+        eri_tensor[2*naux + 3] = gout11;
+        eri_tensor[2*naux + 4] = gout14;
+        eri_tensor[2*naux + 5] = gout17;
+    }
+}
+
+__device__  // Accumulates a 9 x 6 tile of values per (shell pair, k shell) task and stores it into out.
+void int3c2e_bdiv_112(double *out, Int3c2eEnvVars envs, BDiv3c2eBounds bounds)  // NOTE(review): "112" presumably encodes angular momenta (li, lj, lk) = (1, 1, 2) - confirm with the generator
+{
+    // For better load balance, consume blocks in the reversed order
+    int sp_block_id = gridDim.x - blockIdx.x - 1;  // grid x indexes shell-pair blocks
+    int ksh_block_id = gridDim.y - blockIdx.y - 1;  // grid y indexes k-shell blocks
+    int shl_pair0 = bounds.shl_pair_offsets[sp_block_id];
+    int shl_pair1 = bounds.shl_pair_offsets[sp_block_id+1];
+    int ksh0 = bounds.ksh_offsets[ksh_block_id];
+    int ksh1 = bounds.ksh_offsets[ksh_block_id+1];
+    int nksh = ksh1 - ksh0;
+    int nshl_pair = shl_pair1 - shl_pair0;
+    int bas_ij0 = bounds.bas_ij_idx[shl_pair0];  // first shell pair of the block, packed as ish*nbas + jsh
+    int nbas = envs.nbas;
+    int ish0 = bas_ij0 / nbas;
+    int jsh0 = bas_ij0 % nbas;
+
+    int nst_per_block = blockDim.x;  // one task per thread; also the stride between a thread's shared-memory slots
+    int st_id = threadIdx.x;
+    int *bas = envs.bas;
+    int iprim = bas[ish0*BAS_SLOTS+NPRIM_OF];  // primitive counts are read once, from the first shell pair / first k shell of the block
+    int jprim = bas[jsh0*BAS_SLOTS+NPRIM_OF];
+    int kprim = bas[ksh0*BAS_SLOTS+NPRIM_OF];  // NOTE(review): assumes all shells in the block share these counts - confirm against the host-side batching
+    int ijprim = iprim * jprim;
+    int ijkprim = ijprim * kprim;
+    int nroots = 3;  // number of root/weight pairs requested from rys_roots below
+    double *env = envs.env;
+    double omega = env[PTR_RANGE_OMEGA];
+    if (omega < 0) {  // negative omega: two root sets are accumulated, so the root loop covers twice as many entries
+        nroots *= 2;
+    }
+    extern __shared__ double rw_buffer[];  // dynamic shared memory; its size is chosen by the launcher (not visible here)
+    double *rw = rw_buffer + st_id;  // this thread's column: slot m lives at rw[m*nst_per_block]
+    double *rjri = rw + nst_per_block * nroots*2;  // 4 per-thread slots placed after the nroots*2 root/weight slots
+    int naux = bounds.naux;
+    double *out_local = out + bounds.ao_pair_loc[sp_block_id] * naux;  // start of this shell-pair block's rows (row stride naux)
+
+    int nst = nshl_pair * nksh;  // number of (shell pair, k shell) tasks in this block
+    for (int ijk_idx = st_id; ijk_idx < nst; ijk_idx += nst_per_block) {  // each thread takes tasks st_id, st_id + blockDim.x, ...
+        int shl_pair_in_block = ijk_idx / nksh;
+        int ksh_in_block = ijk_idx % nksh;
+        int ksh = ksh_in_block + ksh0;
+        int bas_ij = bounds.bas_ij_idx[shl_pair_in_block + shl_pair0];
+        int ish = bas_ij / nbas;
+        int jsh = bas_ij % nbas;
+        double *expi = env + bas[ish*BAS_SLOTS+PTR_EXP];
+        double *expj = env + bas[jsh*BAS_SLOTS+PTR_EXP];
+        double *expk = env + bas[ksh*BAS_SLOTS+PTR_EXP];
+        double *ci = env + bas[ish*BAS_SLOTS+PTR_COEFF];
+        double *cj = env + bas[jsh*BAS_SLOTS+PTR_COEFF];
+        double *ck = env + bas[ksh*BAS_SLOTS+PTR_COEFF];
+        double *ri = env + bas[ish*BAS_SLOTS+PTR_BAS_COORD];
+        double *rj = env + bas[jsh*BAS_SLOTS+PTR_BAS_COORD];
+        double *rk = env + bas[ksh*BAS_SLOTS+PTR_BAS_COORD];
+        double xjxi = rj[0] - ri[0];
+        double yjyi = rj[1] - ri[1];
+        double zjzi = rj[2] - ri[2];
+        double rr_ij = xjxi*xjxi + yjyi*yjyi + zjzi*zjzi;
+        rjri[0*nst_per_block] = rj[0] - ri[0];  // keep rj - ri and |rj - ri|^2 in this thread's shared-memory slots
+        rjri[1*nst_per_block] = rj[1] - ri[1];
+        rjri[2*nst_per_block] = rj[2] - ri[2];
+        rjri[3*nst_per_block] = rr_ij;
+        double gout0 = 0;  // 54 accumulators = 9 x 6 output values of this task
+        double gout1 = 0;
+        double gout2 = 0;
+        double gout3 = 0;
+        double gout4 = 0;
+        double gout5 = 0;
+        double gout6 = 0;
+        double gout7 = 0;
+        double gout8 = 0;
+        double gout9 = 0;
+        double gout10 = 0;
+        double gout11 = 0;
+        double gout12 = 0;
+        double gout13 = 0;
+        double gout14 = 0;
+        double gout15 = 0;
+        double gout16 = 0;
+        double gout17 = 0;
+        double gout18 = 0;
+        double gout19 = 0;
+        double gout20 = 0;
+        double gout21 = 0;
+        double gout22 = 0;
+        double gout23 = 0;
+        double gout24 = 0;
+        double gout25 = 0;
+        double gout26 = 0;
+        double gout27 = 0;
+        double gout28 = 0;
+        double gout29 = 0;
+        double gout30 = 0;
+        double gout31 = 0;
+        double gout32 = 0;
+        double gout33 = 0;
+        double gout34 = 0;
+        double gout35 = 0;
+        double gout36 = 0;
+        double gout37 = 0;
+        double gout38 = 0;
+        double gout39 = 0;
+        double gout40 = 0;
+        double gout41 = 0;
+        double gout42 = 0;
+        double gout43 = 0;
+        double gout44 = 0;
+        double gout45 = 0;
+        double gout46 = 0;
+        double gout47 = 0;
+        double gout48 = 0;
+        double gout49 = 0;
+        double gout50 = 0;
+        double gout51 = 0;
+        double gout52 = 0;
+        double gout53 = 0;
+        for (int ijkp = 0; ijkp < ijkprim; ++ijkp) {  // flattened loop over all (ip, jp, kp) primitive triples
+            int ijp = ijkp / kprim;
+            int kp = ijkp % kprim;
+            int ip = ijp / jprim;
+            int jp = ijp % jprim;
+            double ai = expi[ip];
+            double aj = expj[jp];
+            double ak = expk[kp];
+            double aij = ai + aj;
+            double cijk = ci[ip] * cj[jp] * ck[kp];
+            double fac = PI_FAC * cijk / (aij*ak*sqrt(aij+ak));
+            double aj_aij = aj / aij;
+            double theta_ij = ai * aj_aij;
+            double Kab = theta_ij * rjri[3*nst_per_block];  // ai*aj/aij * |rj - ri|^2
+            double fac1 = fac * exp(-Kab);  // per-primitive prefactor including the exp(-Kab) decay
+            double xij = rjri[0*nst_per_block] * aj_aij + ri[0];  // ri + aj/aij * (rj - ri): exponent-weighted centre of the i/j primitives
+            double yij = rjri[1*nst_per_block] * aj_aij + ri[1];
+            double zij = rjri[2*nst_per_block] * aj_aij + ri[2];
+            double xpq = xij - rk[0];  // displacement from that centre to rk
+            double ypq = yij - rk[1];
+            double zpq = zij - rk[2];
+            double rr = xpq * xpq + ypq * ypq + zpq * zpq;
+            double theta = aij * ak / (aij + ak);
+            double theta_rr = theta * rr;
+            if (omega == 0) {  // omega == 0: roots/weights for the unscaled argument theta*rr
+                rys_roots(3, theta_rr, rw, nst_per_block, 0, 1);
+            } else if (omega > 0) {  // omega > 0: evaluate at a scaled argument, then rescale roots and weights
+                double omega2 = omega * omega;
+                double theta_fac = omega2 / (omega2 + theta);
+                rys_roots(3, theta_fac*theta_rr, rw, nst_per_block, 0, 1);
+                double sqrt_theta_fac = sqrt(theta_fac);
+                for (int irys = 0; irys < 3; ++irys) {
+                    rw[ irys*2   *nst_per_block] *= theta_fac;  // even slots are read back as roots (rt) below
+                    rw[(irys*2+1)*nst_per_block] *= sqrt_theta_fac;  // odd slots are read back as weights (wt) below
+                }
+            } else {  // omega < 0: unscaled set goes to slots 6..11, scaled set to slots 0..5 with negated weights
+                double omega2 = omega * omega;
+                double theta_fac = omega2 / (omega2 + theta);
+                double *rw1 = rw + 6*nst_per_block;
+                rys_roots(3, theta_rr, rw1, nst_per_block, 0, 1);
+                rys_roots(3, theta_fac*theta_rr, rw, nst_per_block, 0, 1);
+                double sqrt_theta_fac = -sqrt(theta_fac);  // the sign makes the scaled set subtract in the accumulation below
+                for (int irys = 0; irys < 3; ++irys) {
+                    rw[ irys*2   *nst_per_block] *= theta_fac;
+                    rw[(irys*2+1)*nst_per_block] *= sqrt_theta_fac;
+                }
+            }
+            for (int irys = 0; irys < nroots; ++irys) {  // nroots is 3, or 6 when omega < 0 (both sets are summed)
+                double wt = rw[(2*irys+1)*nst_per_block];
+                double rt = rw[ 2*irys   *nst_per_block];
+                double rt_aa = rt / (aij + ak);
+                double b00 = .5 * rt_aa;
+                double rt_ak = rt_aa * aij;
+                double b01 = .5/ak * (1 - rt_ak);
+                double cpx = xpq*rt_ak;
+                double rt_aij = rt_aa * ak;
+                double b10 = .5/aij * (1 - rt_aij);
+                double c0x = rjri[0*nst_per_block] * aj_aij - xpq*rt_aij;
+                double trr_10x = c0x * 1;  // x terms start from 1; fac1 is folded into the y terms and wt into the z terms
+                double trr_20x = c0x * trr_10x + 1*b10 * 1;  // trr_ab: a is raised with c0*/b10, b with cp*/b01, b00 couples the two
+                double trr_21x = cpx * trr_20x + 2*b00 * trr_10x;
+                double trr_11x = cpx * trr_10x + 1*b00 * 1;
+                double trr_22x = cpx * trr_21x + 1*b01 * trr_20x + 2*b00 * trr_11x;
+                double trr_01x = cpx * 1;
+                double trr_12x = cpx * trr_11x + 1*b01 * trr_10x + 1*b00 * trr_01x;
+                double hrr_112x = trr_22x - xjxi * trr_12x;  // hrr_i1k = trr_(i+1)k - (rj - ri) * trr_ik
+                gout0 += hrr_112x * fac1 * wt;  // bare fac1 / wt stand in for the order-0 y / z terms
+                double trr_02x = cpx * trr_01x + 1*b01 * 1;
+                double hrr_012x = trr_12x - xjxi * trr_02x;
+                double c0y = rjri[1*nst_per_block] * aj_aij - ypq*rt_aij;
+                double trr_10y = c0y * fac1;  // every y term carries fac1
+                gout1 += hrr_012x * trr_10y * wt;
+                double c0z = rjri[2*nst_per_block] * aj_aij - zpq*rt_aij;
+                double trr_10z = c0z * wt;  // every z term carries wt
+                gout2 += hrr_012x * fac1 * trr_10z;
+                double hrr_010y = trr_10y - yjyi * fac1;
+                gout3 += trr_12x * hrr_010y * wt;
+                double trr_20y = c0y * trr_10y + 1*b10 * fac1;
+                double hrr_110y = trr_20y - yjyi * trr_10y;
+                gout4 += trr_02x * hrr_110y * wt;
+                gout5 += trr_02x * hrr_010y * trr_10z;
+                double hrr_010z = trr_10z - zjzi * wt;
+                gout6 += trr_12x * fac1 * hrr_010z;
+                gout7 += trr_02x * trr_10y * hrr_010z;
+                double trr_20z = c0z * trr_10z + 1*b10 * wt;
+                double hrr_110z = trr_20z - zjzi * trr_10z;
+                gout8 += trr_02x * fac1 * hrr_110z;
+                double hrr_111x = trr_21x - xjxi * trr_11x;
+                double cpy = ypq*rt_ak;
+                double trr_01y = cpy * fac1;
+                gout9 += hrr_111x * trr_01y * wt;
+                double hrr_011x = trr_11x - xjxi * trr_01x;
+                double trr_11y = cpy * trr_10y + 1*b00 * fac1;
+                gout10 += hrr_011x * trr_11y * wt;
+                gout11 += hrr_011x * trr_01y * trr_10z;
+                double hrr_011y = trr_11y - yjyi * trr_01y;
+                gout12 += trr_11x * hrr_011y * wt;
+                double trr_21y = cpy * trr_20y + 2*b00 * trr_10y;
+                double hrr_111y = trr_21y - yjyi * trr_11y;
+                gout13 += trr_01x * hrr_111y * wt;
+                gout14 += trr_01x * hrr_011y * trr_10z;
+                gout15 += trr_11x * trr_01y * hrr_010z;
+                gout16 += trr_01x * trr_11y * hrr_010z;
+                gout17 += trr_01x * trr_01y * hrr_110z;
+                double cpz = zpq*rt_ak;
+                double trr_01z = cpz * wt;
+                gout18 += hrr_111x * fac1 * trr_01z;
+                gout19 += hrr_011x * trr_10y * trr_01z;
+                double trr_11z = cpz * trr_10z + 1*b00 * wt;
+                gout20 += hrr_011x * fac1 * trr_11z;
+                gout21 += trr_11x * hrr_010y * trr_01z;
+                gout22 += trr_01x * hrr_110y * trr_01z;
+                gout23 += trr_01x * hrr_010y * trr_11z;
+                double hrr_011z = trr_11z - zjzi * trr_01z;
+                gout24 += trr_11x * fac1 * hrr_011z;
+                gout25 += trr_01x * trr_10y * hrr_011z;
+                double trr_21z = cpz * trr_20z + 2*b00 * trr_10z;
+                double hrr_111z = trr_21z - zjzi * trr_11z;
+                gout26 += trr_01x * fac1 * hrr_111z;
+                double hrr_110x = trr_20x - xjxi * trr_10x;
+                double trr_02y = cpy * trr_01y + 1*b01 * fac1;
+                gout27 += hrr_110x * trr_02y * wt;
+                double hrr_010x = trr_10x - xjxi * 1;
+                double trr_12y = cpy * trr_11y + 1*b01 * trr_10y + 1*b00 * trr_01y;
+                gout28 += hrr_010x * trr_12y * wt;
+                gout29 += hrr_010x * trr_02y * trr_10z;
+                double hrr_012y = trr_12y - yjyi * trr_02y;
+                gout30 += trr_10x * hrr_012y * wt;
+                double trr_22y = cpy * trr_21y + 1*b01 * trr_20y + 2*b00 * trr_11y;
+                double hrr_112y = trr_22y - yjyi * trr_12y;
+                gout31 += 1 * hrr_112y * wt;  // the literal 1 is the order-0 x term
+                gout32 += 1 * hrr_012y * trr_10z;
+                gout33 += trr_10x * trr_02y * hrr_010z;
+                gout34 += 1 * trr_12y * hrr_010z;
+                gout35 += 1 * trr_02y * hrr_110z;
+                gout36 += hrr_110x * trr_01y * trr_01z;
+                gout37 += hrr_010x * trr_11y * trr_01z;
+                gout38 += hrr_010x * trr_01y * trr_11z;
+                gout39 += trr_10x * hrr_011y * trr_01z;
+                gout40 += 1 * hrr_111y * trr_01z;
+                gout41 += 1 * hrr_011y * trr_11z;
+                gout42 += trr_10x * trr_01y * hrr_011z;
+                gout43 += 1 * trr_11y * hrr_011z;
+                gout44 += 1 * trr_01y * hrr_111z;
+                double trr_02z = cpz * trr_01z + 1*b01 * wt;
+                gout45 += hrr_110x * fac1 * trr_02z;
+                gout46 += hrr_010x * trr_10y * trr_02z;
+                double trr_12z = cpz * trr_11z + 1*b01 * trr_10z + 1*b00 * trr_01z;
+                gout47 += hrr_010x * fac1 * trr_12z;
+                gout48 += trr_10x * hrr_010y * trr_02z;
+                gout49 += 1 * hrr_110y * trr_02z;
+                gout50 += 1 * hrr_010y * trr_12z;
+                double hrr_012z = trr_12z - zjzi * trr_02z;
+                gout51 += trr_10x * fac1 * hrr_012z;
+                gout52 += 1 * trr_10y * hrr_012z;
+                double trr_22z = cpz * trr_21z + 1*b01 * trr_20z + 2*b00 * trr_11z;
+                double hrr_112z = trr_22z - zjzi * trr_12z;
+                gout53 += 1 * fac1 * hrr_112z;
+            }
+        }
+        int *ao_loc = envs.ao_loc;
+        int k0 = ao_loc[ksh0] - ao_loc[nbas];  // column offset of the block's first k shell, measured from ao_loc[nbas]
+        double *eri_tensor = out_local + k0 + shl_pair_in_block * 9 * naux + ksh_in_block * 6;  // 9 rows (stride naux) per shell pair, 6 consecutive columns per k shell
+        eri_tensor[0*naux + 0] = gout0;  // goutN is stored at row N % 9, column N / 9
+        eri_tensor[0*naux + 1] = gout9;
+        eri_tensor[0*naux + 2] = gout18;
+        eri_tensor[0*naux + 3] = gout27;
+        eri_tensor[0*naux + 4] = gout36;
+        eri_tensor[0*naux + 5] = gout45;
+        eri_tensor[1*naux + 0] = gout1;
+        eri_tensor[1*naux + 1] = gout10;
+        eri_tensor[1*naux + 2] = gout19;
+        eri_tensor[1*naux + 3] = gout28;
+        eri_tensor[1*naux + 4] = gout37;
+        eri_tensor[1*naux + 5] = gout46;
+        eri_tensor[2*naux + 0] = gout2;
+        eri_tensor[2*naux + 1] = gout11;
+        eri_tensor[2*naux + 2] = gout20;
+        eri_tensor[2*naux + 3] = gout29;
+        eri_tensor[2*naux + 4] = gout38;
+        eri_tensor[2*naux + 5] = gout47;
+        eri_tensor[3*naux + 0] = gout3;
+        eri_tensor[3*naux + 1] = gout12;
+        eri_tensor[3*naux + 2] = gout21;
+        eri_tensor[3*naux + 3] = gout30;
+        eri_tensor[3*naux + 4] = gout39;
+        eri_tensor[3*naux + 5] = gout48;
+        eri_tensor[4*naux + 0] = gout4;
+        eri_tensor[4*naux + 1] = gout13;
+        eri_tensor[4*naux + 2] = gout22;
+        eri_tensor[4*naux + 3] = gout31;
+        eri_tensor[4*naux + 4] = gout40;
+        eri_tensor[4*naux + 5] = gout49;
+        eri_tensor[5*naux + 0] = gout5;
+        eri_tensor[5*naux + 1] = gout14;
+        eri_tensor[5*naux + 2] = gout23;
+        eri_tensor[5*naux + 3] = gout32;
+        eri_tensor[5*naux + 4] = gout41;
+        eri_tensor[5*naux + 5] = gout50;
+        eri_tensor[6*naux + 0] = gout6;
+        eri_tensor[6*naux + 1] = gout15;
+        eri_tensor[6*naux + 2] = gout24;
+        eri_tensor[6*naux + 3] = gout33;
+        eri_tensor[6*naux + 4] = gout42;
+        eri_tensor[6*naux + 5] = gout51;
+        eri_tensor[7*naux + 0] = gout7;
+        eri_tensor[7*naux + 1] = gout16;
+        eri_tensor[7*naux + 2] = gout25;
+        eri_tensor[7*naux + 3] = gout34;
+        eri_tensor[7*naux + 4] = gout43;
+        eri_tensor[7*naux + 5] = gout52;
+        eri_tensor[8*naux + 0] = gout8;
+        eri_tensor[8*naux + 1] = gout17;
+        eri_tensor[8*naux + 2] = gout26;
+        eri_tensor[8*naux + 3] = gout35;
+        eri_tensor[8*naux + 4] = gout44;
+        eri_tensor[8*naux + 5] = gout53;
+    }
+}
+
+__device__  // Accumulates a 6 x 6 tile of values per (shell pair, k shell) task and stores it into out.
+void int3c2e_bdiv_202(double *out, Int3c2eEnvVars envs, BDiv3c2eBounds bounds)  // NOTE(review): "202" presumably encodes angular momenta (li, lj, lk) = (2, 0, 2) - confirm with the generator
+{
+    // For better load balance, consume blocks in the reversed order
+    int sp_block_id = gridDim.x - blockIdx.x - 1;  // grid x indexes shell-pair blocks
+    int ksh_block_id = gridDim.y - blockIdx.y - 1;  // grid y indexes k-shell blocks
+    int shl_pair0 = bounds.shl_pair_offsets[sp_block_id];
+    int shl_pair1 = bounds.shl_pair_offsets[sp_block_id+1];
+    int ksh0 = bounds.ksh_offsets[ksh_block_id];
+    int ksh1 = bounds.ksh_offsets[ksh_block_id+1];
+    int nksh = ksh1 - ksh0;
+    int nshl_pair = shl_pair1 - shl_pair0;
+    int bas_ij0 = bounds.bas_ij_idx[shl_pair0];  // first shell pair of the block, packed as ish*nbas + jsh
+    int nbas = envs.nbas;
+    int ish0 = bas_ij0 / nbas;
+    int jsh0 = bas_ij0 % nbas;
+
+    int nst_per_block = blockDim.x;  // one task per thread; also the stride between a thread's shared-memory slots
+    int st_id = threadIdx.x;
+    int *bas = envs.bas;
+    int iprim = bas[ish0*BAS_SLOTS+NPRIM_OF];  // primitive counts are read once, from the first shell pair / first k shell of the block
+    int jprim = bas[jsh0*BAS_SLOTS+NPRIM_OF];
+    int kprim = bas[ksh0*BAS_SLOTS+NPRIM_OF];  // NOTE(review): assumes all shells in the block share these counts - confirm against the host-side batching
+    int ijprim = iprim * jprim;
+    int ijkprim = ijprim * kprim;
+    int nroots = 3;  // number of root/weight pairs requested from rys_roots below
+    double *env = envs.env;
+    double omega = env[PTR_RANGE_OMEGA];
+    if (omega < 0) {  // negative omega: two root sets are accumulated, so the root loop covers twice as many entries
+        nroots *= 2;
+    }
+    extern __shared__ double rw_buffer[];  // dynamic shared memory; its size is chosen by the launcher (not visible here)
+    double *rw = rw_buffer + st_id;  // this thread's column: slot m lives at rw[m*nst_per_block]
+    double *rjri = rw + nst_per_block * nroots*2;  // 4 per-thread slots placed after the nroots*2 root/weight slots
+    int naux = bounds.naux;
+    double *out_local = out + bounds.ao_pair_loc[sp_block_id] * naux;  // start of this shell-pair block's rows (row stride naux)
+
+    int nst = nshl_pair * nksh;  // number of (shell pair, k shell) tasks in this block
+    for (int ijk_idx = st_id; ijk_idx < nst; ijk_idx += nst_per_block) {  // each thread takes tasks st_id, st_id + blockDim.x, ...
+        int shl_pair_in_block = ijk_idx / nksh;
+        int ksh_in_block = ijk_idx % nksh;
+        int ksh = ksh_in_block + ksh0;
+        int bas_ij = bounds.bas_ij_idx[shl_pair_in_block + shl_pair0];
+        int ish = bas_ij / nbas;
+        int jsh = bas_ij % nbas;
+        double *expi = env + bas[ish*BAS_SLOTS+PTR_EXP];
+        double *expj = env + bas[jsh*BAS_SLOTS+PTR_EXP];
+        double *expk = env + bas[ksh*BAS_SLOTS+PTR_EXP];
+        double *ci = env + bas[ish*BAS_SLOTS+PTR_COEFF];
+        double *cj = env + bas[jsh*BAS_SLOTS+PTR_COEFF];
+        double *ck = env + bas[ksh*BAS_SLOTS+PTR_COEFF];
+        double *ri = env + bas[ish*BAS_SLOTS+PTR_BAS_COORD];
+        double *rj = env + bas[jsh*BAS_SLOTS+PTR_BAS_COORD];
+        double *rk = env + bas[ksh*BAS_SLOTS+PTR_BAS_COORD];
+        double xjxi = rj[0] - ri[0];  // in this routine the differences are only used to form rr_ij
+        double yjyi = rj[1] - ri[1];
+        double zjzi = rj[2] - ri[2];
+        double rr_ij = xjxi*xjxi + yjyi*yjyi + zjzi*zjzi;
+        rjri[0*nst_per_block] = rj[0] - ri[0];  // keep rj - ri and |rj - ri|^2 in this thread's shared-memory slots
+        rjri[1*nst_per_block] = rj[1] - ri[1];
+        rjri[2*nst_per_block] = rj[2] - ri[2];
+        rjri[3*nst_per_block] = rr_ij;
+        double gout0 = 0;  // 36 accumulators = 6 x 6 output values of this task
+        double gout1 = 0;
+        double gout2 = 0;
+        double gout3 = 0;
+        double gout4 = 0;
+        double gout5 = 0;
+        double gout6 = 0;
+        double gout7 = 0;
+        double gout8 = 0;
+        double gout9 = 0;
+        double gout10 = 0;
+        double gout11 = 0;
+        double gout12 = 0;
+        double gout13 = 0;
+        double gout14 = 0;
+        double gout15 = 0;
+        double gout16 = 0;
+        double gout17 = 0;
+        double gout18 = 0;
+        double gout19 = 0;
+        double gout20 = 0;
+        double gout21 = 0;
+        double gout22 = 0;
+        double gout23 = 0;
+        double gout24 = 0;
+        double gout25 = 0;
+        double gout26 = 0;
+        double gout27 = 0;
+        double gout28 = 0;
+        double gout29 = 0;
+        double gout30 = 0;
+        double gout31 = 0;
+        double gout32 = 0;
+        double gout33 = 0;
+        double gout34 = 0;
+        double gout35 = 0;
+        for (int ijkp = 0; ijkp < ijkprim; ++ijkp) {  // flattened loop over all (ip, jp, kp) primitive triples
+            int ijp = ijkp / kprim;
+            int kp = ijkp % kprim;
+            int ip = ijp / jprim;
+            int jp = ijp % jprim;
+            double ai = expi[ip];
+            double aj = expj[jp];
+            double ak = expk[kp];
+            double aij = ai + aj;
+            double cijk = ci[ip] * cj[jp] * ck[kp];
+            double fac = PI_FAC * cijk / (aij*ak*sqrt(aij+ak));
+            double aj_aij = aj / aij;
+            double theta_ij = ai * aj_aij;
+            double Kab = theta_ij * rjri[3*nst_per_block];  // ai*aj/aij * |rj - ri|^2
+            double fac1 = fac * exp(-Kab);  // per-primitive prefactor including the exp(-Kab) decay
+            double xij = rjri[0*nst_per_block] * aj_aij + ri[0];  // ri + aj/aij * (rj - ri): exponent-weighted centre of the i/j primitives
+            double yij = rjri[1*nst_per_block] * aj_aij + ri[1];
+            double zij = rjri[2*nst_per_block] * aj_aij + ri[2];
+            double xpq = xij - rk[0];  // displacement from that centre to rk
+            double ypq = yij - rk[1];
+            double zpq = zij - rk[2];
+            double rr = xpq * xpq + ypq * ypq + zpq * zpq;
+            double theta = aij * ak / (aij + ak);
+            double theta_rr = theta * rr;
+            if (omega == 0) {  // omega == 0: roots/weights for the unscaled argument theta*rr
+                rys_roots(3, theta_rr, rw, nst_per_block, 0, 1);
+            } else if (omega > 0) {  // omega > 0: evaluate at a scaled argument, then rescale roots and weights
+                double omega2 = omega * omega;
+                double theta_fac = omega2 / (omega2 + theta);
+                rys_roots(3, theta_fac*theta_rr, rw, nst_per_block, 0, 1);
+                double sqrt_theta_fac = sqrt(theta_fac);
+                for (int irys = 0; irys < 3; ++irys) {
+                    rw[ irys*2   *nst_per_block] *= theta_fac;  // even slots are read back as roots (rt) below
+                    rw[(irys*2+1)*nst_per_block] *= sqrt_theta_fac;  // odd slots are read back as weights (wt) below
+                }
+            } else {  // omega < 0: unscaled set goes to slots 6..11, scaled set to slots 0..5 with negated weights
+                double omega2 = omega * omega;
+                double theta_fac = omega2 / (omega2 + theta);
+                double *rw1 = rw + 6*nst_per_block;
+                rys_roots(3, theta_rr, rw1, nst_per_block, 0, 1);
+                rys_roots(3, theta_fac*theta_rr, rw, nst_per_block, 0, 1);
+                double sqrt_theta_fac = -sqrt(theta_fac);  // the sign makes the scaled set subtract in the accumulation below
+                for (int irys = 0; irys < 3; ++irys) {
+                    rw[ irys*2   *nst_per_block] *= theta_fac;
+                    rw[(irys*2+1)*nst_per_block] *= sqrt_theta_fac;
+                }
+            }
+            for (int irys = 0; irys < nroots; ++irys) {  // nroots is 3, or 6 when omega < 0 (both sets are summed)
+                double wt = rw[(2*irys+1)*nst_per_block];
+                double rt = rw[ 2*irys   *nst_per_block];
+                double rt_aa = rt / (aij + ak);
+                double b00 = .5 * rt_aa;
+                double rt_ak = rt_aa * aij;
+                double b01 = .5/ak * (1 - rt_ak);
+                double cpx = xpq*rt_ak;
+                double rt_aij = rt_aa * ak;
+                double b10 = .5/aij * (1 - rt_aij);
+                double c0x = rjri[0*nst_per_block] * aj_aij - xpq*rt_aij;
+                double trr_10x = c0x * 1;  // x terms start from 1; fac1 is folded into the y terms and wt into the z terms
+                double trr_20x = c0x * trr_10x + 1*b10 * 1;  // trr_ab: a is raised with c0*/b10, b with cp*/b01, b00 couples the two
+                double trr_21x = cpx * trr_20x + 2*b00 * trr_10x;
+                double trr_11x = cpx * trr_10x + 1*b00 * 1;
+                double trr_22x = cpx * trr_21x + 1*b01 * trr_20x + 2*b00 * trr_11x;
+                gout0 += trr_22x * fac1 * wt;  // bare fac1 / wt stand in for the order-0 y / z terms
+                double trr_01x = cpx * 1;
+                double trr_12x = cpx * trr_11x + 1*b01 * trr_10x + 1*b00 * trr_01x;
+                double c0y = rjri[1*nst_per_block] * aj_aij - ypq*rt_aij;
+                double trr_10y = c0y * fac1;  // every y term carries fac1
+                gout1 += trr_12x * trr_10y * wt;
+                double c0z = rjri[2*nst_per_block] * aj_aij - zpq*rt_aij;
+                double trr_10z = c0z * wt;  // every z term carries wt
+                gout2 += trr_12x * fac1 * trr_10z;
+                double trr_02x = cpx * trr_01x + 1*b01 * 1;
+                double trr_20y = c0y * trr_10y + 1*b10 * fac1;
+                gout3 += trr_02x * trr_20y * wt;
+                gout4 += trr_02x * trr_10y * trr_10z;
+                double trr_20z = c0z * trr_10z + 1*b10 * wt;
+                gout5 += trr_02x * fac1 * trr_20z;
+                double cpy = ypq*rt_ak;
+                double trr_01y = cpy * fac1;
+                gout6 += trr_21x * trr_01y * wt;
+                double trr_11y = cpy * trr_10y + 1*b00 * fac1;
+                gout7 += trr_11x * trr_11y * wt;
+                gout8 += trr_11x * trr_01y * trr_10z;
+                double trr_21y = cpy * trr_20y + 2*b00 * trr_10y;
+                gout9 += trr_01x * trr_21y * wt;
+                gout10 += trr_01x * trr_11y * trr_10z;
+                gout11 += trr_01x * trr_01y * trr_20z;
+                double cpz = zpq*rt_ak;
+                double trr_01z = cpz * wt;
+                gout12 += trr_21x * fac1 * trr_01z;
+                gout13 += trr_11x * trr_10y * trr_01z;
+                double trr_11z = cpz * trr_10z + 1*b00 * wt;
+                gout14 += trr_11x * fac1 * trr_11z;
+                gout15 += trr_01x * trr_20y * trr_01z;
+                gout16 += trr_01x * trr_10y * trr_11z;
+                double trr_21z = cpz * trr_20z + 2*b00 * trr_10z;
+                gout17 += trr_01x * fac1 * trr_21z;
+                double trr_02y = cpy * trr_01y + 1*b01 * fac1;
+                gout18 += trr_20x * trr_02y * wt;
+                double trr_12y = cpy * trr_11y + 1*b01 * trr_10y + 1*b00 * trr_01y;
+                gout19 += trr_10x * trr_12y * wt;
+                gout20 += trr_10x * trr_02y * trr_10z;
+                double trr_22y = cpy * trr_21y + 1*b01 * trr_20y + 2*b00 * trr_11y;
+                gout21 += 1 * trr_22y * wt;  // the literal 1 is the order-0 x term
+                gout22 += 1 * trr_12y * trr_10z;
+                gout23 += 1 * trr_02y * trr_20z;
+                gout24 += trr_20x * trr_01y * trr_01z;
+                gout25 += trr_10x * trr_11y * trr_01z;
+                gout26 += trr_10x * trr_01y * trr_11z;
+                gout27 += 1 * trr_21y * trr_01z;
+                gout28 += 1 * trr_11y * trr_11z;
+                gout29 += 1 * trr_01y * trr_21z;
+                double trr_02z = cpz * trr_01z + 1*b01 * wt;
+                gout30 += trr_20x * fac1 * trr_02z;
+                gout31 += trr_10x * trr_10y * trr_02z;
+                double trr_12z = cpz * trr_11z + 1*b01 * trr_10z + 1*b00 * trr_01z;
+                gout32 += trr_10x * fac1 * trr_12z;
+                gout33 += 1 * trr_20y * trr_02z;
+                gout34 += 1 * trr_10y * trr_12z;
+                double trr_22z = cpz * trr_21z + 1*b01 * trr_20z + 2*b00 * trr_11z;
+                gout35 += 1 * fac1 * trr_22z;
+            }
+        }
+        int *ao_loc = envs.ao_loc;
+        int k0 = ao_loc[ksh0] - ao_loc[nbas];  // column offset of the block's first k shell, measured from ao_loc[nbas]
+        double *eri_tensor = out_local + k0 + shl_pair_in_block * 6 * naux + ksh_in_block * 6;  // 6 rows (stride naux) per shell pair, 6 consecutive columns per k shell
+        eri_tensor[0*naux + 0] = gout0;  // goutN is stored at row N % 6, column N / 6
+        eri_tensor[0*naux + 1] = gout6;
+        eri_tensor[0*naux + 2] = gout12;
+        eri_tensor[0*naux + 3] = gout18;
+        eri_tensor[0*naux + 4] = gout24;
+        eri_tensor[0*naux + 5] = gout30;
+        eri_tensor[1*naux + 0] = gout1;
+        eri_tensor[1*naux + 1] = gout7;
+        eri_tensor[1*naux + 2] = gout13;
+        eri_tensor[1*naux + 3] = gout19;
+        eri_tensor[1*naux + 4] = gout25;
+        eri_tensor[1*naux + 5] = gout31;
+        eri_tensor[2*naux + 0] = gout2;
+        eri_tensor[2*naux + 1] = gout8;
+        eri_tensor[2*naux + 2] = gout14;
+        eri_tensor[2*naux + 3] = gout20;
+        eri_tensor[2*naux + 4] = gout26;
+        eri_tensor[2*naux + 5] = gout32;
+        eri_tensor[3*naux + 0] = gout3;
+        eri_tensor[3*naux + 1] = gout9;
+        eri_tensor[3*naux + 2] = gout15;
+        eri_tensor[3*naux + 3] = gout21;
+        eri_tensor[3*naux + 4] = gout27;
+        eri_tensor[3*naux + 5] = gout33;
+        eri_tensor[4*naux + 0] = gout4;
+        eri_tensor[4*naux + 1] = gout10;
+        eri_tensor[4*naux + 2] = gout16;
+        eri_tensor[4*naux + 3] = gout22;
+        eri_tensor[4*naux + 4] = gout28;
+        eri_tensor[4*naux + 5] = gout34;
+        eri_tensor[5*naux + 0] = gout5;
+        eri_tensor[5*naux + 1] = gout11;
+        eri_tensor[5*naux + 2] = gout17;
+        eri_tensor[5*naux + 3] = gout23;
+        eri_tensor[5*naux + 4] = gout29;
+        eri_tensor[5*naux + 5] = gout35;
+    }
+}
+
+__device__  // Unrolled 3-center 2-electron integral kernel for li=2, lj=1, lk=2 (dispatcher case 61): 18 x 6 components per shell triple.
+void int3c2e_bdiv_212(double *out, Int3c2eEnvVars envs, BDiv3c2eBounds bounds)
+{
+    int sp_block_id = gridDim.x - blockIdx.x - 1;  // blocks are mapped in reverse blockIdx order
+    int ksh_block_id = gridDim.y - blockIdx.y - 1;
+    int shl_pair0 = bounds.shl_pair_offsets[sp_block_id];  // [shl_pair0, shl_pair1): shell pairs handled by this block
+    int shl_pair1 = bounds.shl_pair_offsets[sp_block_id+1];
+    int ksh0 = bounds.ksh_offsets[ksh_block_id];  // [ksh0, ksh1): k shells handled by this block
+    int ksh1 = bounds.ksh_offsets[ksh_block_id+1];
+    int nksh = ksh1 - ksh0;
+    int nshl_pair = shl_pair1 - shl_pair0;
+    int bas_ij0 = bounds.bas_ij_idx[shl_pair0];  // packed pair index ish*nbas + jsh of the block's first pair
+    int nbas = envs.nbas;
+    int ish0 = bas_ij0 / nbas;
+    int jsh0 = bas_ij0 % nbas;
+
+    int thread_id = threadIdx.x;
+    int st_id = thread_id % 64;  // which of the 64 shell-triple slots this thread works on
+    int gout_id = thread_id / 64;  // which slice of the output components this thread accumulates (cases 0..3 below)
+    int *bas = envs.bas;
+    int iprim = bas[ish0*BAS_SLOTS+NPRIM_OF];  // primitive counts come from the block's first shells only;
+    int jprim = bas[jsh0*BAS_SLOTS+NPRIM_OF];  // presumably every shell in the block has the same counts - TODO confirm
+    int kprim = bas[ksh0*BAS_SLOTS+NPRIM_OF];
+    int ijprim = iprim * jprim;
+    int ijkprim = ijprim * kprim;
+    int nroots = 3;  // number of quadrature roots looped over per primitive triple
+    double *env = envs.env;
+    double omega = env[PTR_RANGE_OMEGA];  // selects one of the three root/weight branches below (== 0, > 0, < 0)
+    extern __shared__ double rw_cache[];  // dynamic shared memory; every array below is strided by 64 (one column per st_id)
+    double *rw = rw_cache + st_id;  // root irys at rw[irys*128], its weight at rw[irys*128+64]
+    double *gx = rw + nroots * 128;  // per-axis table of 18 entries: offset (i + 3*j + 6*k)*64 after the transfer step
+    double *gy = gx + 1152;  // 1152 = 18 entries * 64 columns
+    double *gz = gy + 1152;
+    double *Rpq = gz + 1152;  // 3 entries: components of (weighted ij center) - rk
+    double *rjri = Rpq + 192;  // 4 entries: rj - ri (x, y, z) and its squared norm
+    int naux = bounds.naux;
+    double *out_local = out + bounds.ao_pair_loc[sp_block_id] * naux;  // start of this block's rows; row stride is naux
+
+    if (gout_id == 0) {
+        gx[0] = 1.;  // seed of the x table
+    }
+
+    int nst = nshl_pair * nksh;  // number of (shell pair, k shell) tasks in this block
+    for (int ijk_idx = st_id; ijk_idx < nst+st_id; ijk_idx += 64) {  // bound nst+st_id gives every thread the same trip count, keeping the __syncthreads() calls uniform
+        int shl_pair_in_block = ijk_idx / nksh;
+        int ksh_in_block = ijk_idx % nksh;
+        if (ijk_idx >= nst) {  // padding iteration: run on a valid dummy pair, nothing is stored (see guard after the primitive loop)
+            shl_pair_in_block = 0;
+            if (gout_id == 0) {
+                gx[0] = 0.;  // zero seed makes every gx entry built from it vanish
+            }
+        }
+        int ksh = ksh_in_block + ksh0;
+        int bas_ij = bounds.bas_ij_idx[shl_pair_in_block + shl_pair0];
+        int ish = bas_ij / nbas;
+        int jsh = bas_ij % nbas;
+        double *expi = env + bas[ish*BAS_SLOTS+PTR_EXP];
+        double *expj = env + bas[jsh*BAS_SLOTS+PTR_EXP];
+        double *expk = env + bas[ksh*BAS_SLOTS+PTR_EXP];
+        double *ci = env + bas[ish*BAS_SLOTS+PTR_COEFF];
+        double *cj = env + bas[jsh*BAS_SLOTS+PTR_COEFF];
+        double *ck = env + bas[ksh*BAS_SLOTS+PTR_COEFF];
+        double *ri = env + bas[ish*BAS_SLOTS+PTR_BAS_COORD];
+        double *rj = env + bas[jsh*BAS_SLOTS+PTR_BAS_COORD];
+        double *rk = env + bas[ksh*BAS_SLOTS+PTR_BAS_COORD];
+        if (gout_id == 0) {  // one thread per st_id column fills the shared ij geometry
+            double xjxi = rj[0] - ri[0];
+            double yjyi = rj[1] - ri[1];
+            double zjzi = rj[2] - ri[2];
+            double rr_ij = xjxi*xjxi + yjyi*yjyi + zjzi*zjzi;
+            rjri[0] = xjxi;
+            rjri[64] = yjyi;
+            rjri[128] = zjzi;
+            rjri[192] = rr_ij;
+        }
+        double gout0 = 0;  // gout0..gout26: this thread's 27 of the 108 (18 x 6) output components
+        double gout1 = 0;
+        double gout2 = 0;
+        double gout3 = 0;
+        double gout4 = 0;
+        double gout5 = 0;
+        double gout6 = 0;
+        double gout7 = 0;
+        double gout8 = 0;
+        double gout9 = 0;
+        double gout10 = 0;
+        double gout11 = 0;
+        double gout12 = 0;
+        double gout13 = 0;
+        double gout14 = 0;
+        double gout15 = 0;
+        double gout16 = 0;
+        double gout17 = 0;
+        double gout18 = 0;
+        double gout19 = 0;
+        double gout20 = 0;
+        double gout21 = 0;
+        double gout22 = 0;
+        double gout23 = 0;
+        double gout24 = 0;
+        double gout25 = 0;
+        double gout26 = 0;
+        double s0, s1, s2;  // scratch registers for the recurrences
+        for (int ijkp = 0; ijkp < ijkprim; ++ijkp) {  // flattened loop over primitive triples (ip, jp, kp)
+            int ijp = ijkp / kprim;
+            int kp = ijkp % kprim;
+            int ip = ijp / jprim;
+            int jp = ijp % jprim;
+            double ai = expi[ip];
+            double aj = expj[jp];
+            double ak = expk[kp];
+            double aij = ai + aj;
+            double aj_aij = aj / aij;
+            __syncthreads();  // rjri written by gout_id 0 must be visible before it is read below
+            double xij = rjri[0] * aj_aij + ri[0];  // ri + (rj - ri) * aj/aij: exponent-weighted center of i and j
+            double yij = rjri[64] * aj_aij + ri[1];
+            double zij = rjri[128] * aj_aij + ri[2];
+            double xpq = xij - rk[0];
+            double ypq = yij - rk[1];
+            double zpq = zij - rk[2];
+            if (gout_id == 0) {
+                double cijk = ci[ip] * cj[jp] * ck[kp];
+                double fac = PI_FAC * cijk / (aij*ak*sqrt(aij+ak));
+                double theta_ij = ai * aj_aij;
+                double Kab = theta_ij * rjri[192];
+                gy[0] = fac * exp(-Kab);  // y seed carries the prefactor and the ij Gaussian-overlap factor
+                Rpq[0] = xpq;
+                Rpq[64] = ypq;
+                Rpq[128] = zpq;
+            }
+            double rr = xpq * xpq + ypq * ypq + zpq * zpq;
+            double theta = aij * ak / (aij + ak);
+            double theta_rr = theta * rr;
+            if (omega == 0) {
+                rys_roots(3, theta_rr, rw, 64, gout_id, 4);
+            } else if (omega > 0) {  // roots are computed at the scaled argument, then rescaled in place
+                double omega2 = omega * omega;
+                double theta_fac = omega2 / (omega2 + theta);
+                rys_roots(3, theta_fac*theta_rr, rw, 64, gout_id, 4);
+                __syncthreads();
+                double sqrt_theta_fac = sqrt(theta_fac);
+                for (int irys = gout_id; irys < 3; irys+=4) {
+                    rw[ irys*2   *64] *= theta_fac;  // root slot
+                    rw[(irys*2+1)*64] *= sqrt_theta_fac;  // weight slot
+                }
+            } else {
+                double omega2 = omega * omega;
+                double theta_fac = omega2 / (omega2 + theta);
+                double *rw1 = rw + 384;  // NOTE(review): rw + 384 is the same address as gx (nroots*128 with nroots == 3), and the irys loop below never reads rw1 - confirm omega < 0 is meant to reach this kernel
+                rys_roots(3, theta_rr, rw1, 64, gout_id, 4);
+                rys_roots(3, theta_fac*theta_rr, rw, 64, gout_id, 4);
+                __syncthreads();
+                double sqrt_theta_fac = -sqrt(theta_fac);  // negated, so the rescaled weights change sign
+                for (int irys = gout_id; irys < 3; irys+=4) {
+                    rw[ irys*2   *64] *= theta_fac;
+                    rw[(irys*2+1)*64] *= sqrt_theta_fac;
+                }
+            }
+            for (int irys = 0; irys < nroots; ++irys) {
+                __syncthreads();  // roots/weights ready, and the previous root's tables fully consumed, before rebuilding
+                double rt = rw[irys*128];
+                double rt_aa = rt / (aij + ak);
+                double rt_aij = rt_aa * ak;
+                double b10 = .5/aij * (1 - rt_aij);
+                double rt_ak = rt_aa * aij;
+                double b00 = .5 * rt_aa;
+                double b01 = .5/ak * (1 - rt_ak);
+                for (int n = gout_id; n < 3; n += 4) {  // gout_id 0, 1, 2 build the x, y, z tables; gout_id 3 skips this loop
+                    if (n == 2) {
+                        gz[0] = rw[irys*128+64];  // z seed is the weight of the current root
+                    }
+                    double *_gx = gx + n * 1152;  // table of axis n (gx, gy or gz)
+                    double xjxi = rjri[n * 64];
+                    double Rpa = xjxi * aj_aij;
+                    double c0x = Rpa - rt_aij * Rpq[n * 64];
+                    s0 = _gx[0];  // raise the i index at k = 0: orders 1..3 go to offsets 64, 128, 192
+                    s1 = c0x * s0;
+                    _gx[64] = s1;
+                    s2 = c0x * s1 + 1 * b10 * s0;
+                    _gx[128] = s2;
+                    s0 = s1;
+                    s1 = s2;
+                    s2 = c0x * s1 + 2 * b10 * s0;
+                    _gx[192] = s2;
+                    double cpx = rt_ak * Rpq[n * 64];
+                    s0 = _gx[0];  // raise the k index for each i order: k = 1 at +384, k = 2 at +768
+                    s1 = cpx * s0;
+                    _gx[384] = s1;
+                    s2 = cpx*s1 + 1 * b01 *s0;
+                    _gx[768] = s2;
+                    s0 = _gx[64];
+                    s1 = cpx * s0;
+                    s1 += 1 * b00 * _gx[0];
+                    _gx[448] = s1;
+                    s2 = cpx*s1 + 1 * b01 *s0;
+                    s2 += 1 * b00 * _gx[384];
+                    _gx[832] = s2;
+                    s0 = _gx[128];
+                    s1 = cpx * s0;
+                    s1 += 2 * b00 * _gx[64];
+                    _gx[512] = s1;
+                    s2 = cpx*s1 + 1 * b01 *s0;
+                    s2 += 2 * b00 * _gx[448];
+                    _gx[896] = s2;
+                    s0 = _gx[192];
+                    s1 = cpx * s0;
+                    s1 += 3 * b00 * _gx[128];
+                    _gx[576] = s1;
+                    s2 = cpx*s1 + 1 * b01 *s0;
+                    s2 += 3 * b00 * _gx[512];
+                    _gx[960] = s2;
+                    s1 = _gx[192];  // transfer step: (i, j=1) = (i+1, 0) - xjxi * (i, 0), done from high i down so the i = 3 slot can be reused for (0, 1)
+                    s0 = _gx[128];
+                    _gx[320] = s1 - xjxi * s0;
+                    s1 = s0;
+                    s0 = _gx[64];
+                    _gx[256] = s1 - xjxi * s0;
+                    s1 = s0;
+                    s0 = _gx[0];
+                    _gx[192] = s1 - xjxi * s0;
+                    s1 = _gx[576];  // same transfer for k = 1
+                    s0 = _gx[512];
+                    _gx[704] = s1 - xjxi * s0;
+                    s1 = s0;
+                    s0 = _gx[448];
+                    _gx[640] = s1 - xjxi * s0;
+                    s1 = s0;
+                    s0 = _gx[384];
+                    _gx[576] = s1 - xjxi * s0;
+                    s1 = _gx[960];  // same transfer for k = 2
+                    s0 = _gx[896];
+                    _gx[1088] = s1 - xjxi * s0;
+                    s1 = s0;
+                    s0 = _gx[832];
+                    _gx[1024] = s1 - xjxi * s0;
+                    s1 = s0;
+                    s0 = _gx[768];
+                    _gx[960] = s1 - xjxi * s0;
+                }
+                __syncthreads();  // all three tables complete before the four gout_id groups read them
+                switch (gout_id) {  // each case accumulates x*y*z table products for its own 27 components
+                case 0:
+                gout0 += gx[1088] * gy[0] * gz[0];
+                gout1 += gx[320] * gy[384] * gz[384];
+                gout2 += gx[640] * gy[64] * gz[384];
+                gout3 += gx[1024] * gy[0] * gz[64];
+                gout4 += gx[256] * gy[384] * gz[448];
+                gout5 += gx[576] * gy[128] * gz[384];
+                gout6 += gx[960] * gy[64] * gz[64];
+                gout7 += gx[192] * gy[448] * gz[448];
+                gout8 += gx[576] * gy[0] * gz[512];
+                gout9 += gx[896] * gy[192] * gz[0];
+                gout10 += gx[128] * gy[576] * gz[384];
+                gout11 += gx[448] * gy[256] * gz[384];
+                gout12 += gx[832] * gy[192] * gz[64];
+                gout13 += gx[64] * gy[576] * gz[448];
+                gout14 += gx[384] * gy[320] * gz[384];
+                gout15 += gx[768] * gy[256] * gz[64];
+                gout16 += gx[0] * gy[640] * gz[448];
+                gout17 += gx[384] * gy[192] * gz[512];
+                gout18 += gx[896] * gy[0] * gz[192];
+                gout19 += gx[128] * gy[384] * gz[576];
+                gout20 += gx[448] * gy[64] * gz[576];
+                gout21 += gx[832] * gy[0] * gz[256];
+                gout22 += gx[64] * gy[384] * gz[640];
+                gout23 += gx[384] * gy[128] * gz[576];
+                gout24 += gx[768] * gy[64] * gz[256];
+                gout25 += gx[0] * gy[448] * gz[640];
+                gout26 += gx[384] * gy[0] * gz[704];
+                break;
+                case 1:
+                gout0 += gx[704] * gy[384] * gz[0];
+                gout1 += gx[320] * gy[0] * gz[768];
+                gout2 += gx[256] * gy[832] * gz[0];
+                gout3 += gx[640] * gy[384] * gz[64];
+                gout4 += gx[256] * gy[0] * gz[832];
+                gout5 += gx[192] * gy[896] * gz[0];
+                gout6 += gx[576] * gy[448] * gz[64];
+                gout7 += gx[192] * gy[64] * gz[832];
+                gout8 += gx[192] * gy[768] * gz[128];
+                gout9 += gx[512] * gy[576] * gz[0];
+                gout10 += gx[128] * gy[192] * gz[768];
+                gout11 += gx[64] * gy[1024] * gz[0];
+                gout12 += gx[448] * gy[576] * gz[64];
+                gout13 += gx[64] * gy[192] * gz[832];
+                gout14 += gx[0] * gy[1088] * gz[0];
+                gout15 += gx[384] * gy[640] * gz[64];
+                gout16 += gx[0] * gy[256] * gz[832];
+                gout17 += gx[0] * gy[960] * gz[128];
+                gout18 += gx[512] * gy[384] * gz[192];
+                gout19 += gx[128] * gy[0] * gz[960];
+                gout20 += gx[64] * gy[832] * gz[192];
+                gout21 += gx[448] * gy[384] * gz[256];
+                gout22 += gx[64] * gy[0] * gz[1024];
+                gout23 += gx[0] * gy[896] * gz[192];
+                gout24 += gx[384] * gy[448] * gz[256];
+                gout25 += gx[0] * gy[64] * gz[1024];
+                gout26 += gx[0] * gy[768] * gz[320];
+                break;
+                case 2:
+                gout0 += gx[704] * gy[0] * gz[384];
+                gout1 += gx[1024] * gy[64] * gz[0];
+                gout2 += gx[256] * gy[448] * gz[384];
+                gout3 += gx[640] * gy[0] * gz[448];
+                gout4 += gx[960] * gy[128] * gz[0];
+                gout5 += gx[192] * gy[512] * gz[384];
+                gout6 += gx[576] * gy[64] * gz[448];
+                gout7 += gx[960] * gy[0] * gz[128];
+                gout8 += gx[192] * gy[384] * gz[512];
+                gout9 += gx[512] * gy[192] * gz[384];
+                gout10 += gx[832] * gy[256] * gz[0];
+                gout11 += gx[64] * gy[640] * gz[384];
+                gout12 += gx[448] * gy[192] * gz[448];
+                gout13 += gx[768] * gy[320] * gz[0];
+                gout14 += gx[0] * gy[704] * gz[384];
+                gout15 += gx[384] * gy[256] * gz[448];
+                gout16 += gx[768] * gy[192] * gz[128];
+                gout17 += gx[0] * gy[576] * gz[512];
+                gout18 += gx[512] * gy[0] * gz[576];
+                gout19 += gx[832] * gy[64] * gz[192];
+                gout20 += gx[64] * gy[448] * gz[576];
+                gout21 += gx[448] * gy[0] * gz[640];
+                gout22 += gx[768] * gy[128] * gz[192];
+                gout23 += gx[0] * gy[512] * gz[576];
+                gout24 += gx[384] * gy[64] * gz[640];
+                gout25 += gx[768] * gy[0] * gz[320];
+                gout26 += gx[0] * gy[384] * gz[704];
+                break;
+                case 3:
+                gout0 += gx[320] * gy[768] * gz[0];
+                gout1 += gx[640] * gy[448] * gz[0];
+                gout2 += gx[256] * gy[64] * gz[768];
+                gout3 += gx[256] * gy[768] * gz[64];
+                gout4 += gx[576] * gy[512] * gz[0];
+                gout5 += gx[192] * gy[128] * gz[768];
+                gout6 += gx[192] * gy[832] * gz[64];
+                gout7 += gx[576] * gy[384] * gz[128];
+                gout8 += gx[192] * gy[0] * gz[896];
+                gout9 += gx[128] * gy[960] * gz[0];
+                gout10 += gx[448] * gy[640] * gz[0];
+                gout11 += gx[64] * gy[256] * gz[768];
+                gout12 += gx[64] * gy[960] * gz[64];
+                gout13 += gx[384] * gy[704] * gz[0];
+                gout14 += gx[0] * gy[320] * gz[768];
+                gout15 += gx[0] * gy[1024] * gz[64];
+                gout16 += gx[384] * gy[576] * gz[128];
+                gout17 += gx[0] * gy[192] * gz[896];
+                gout18 += gx[128] * gy[768] * gz[192];
+                gout19 += gx[448] * gy[448] * gz[192];
+                gout20 += gx[64] * gy[64] * gz[960];
+                gout21 += gx[64] * gy[768] * gz[256];
+                gout22 += gx[384] * gy[512] * gz[192];
+                gout23 += gx[0] * gy[128] * gz[960];
+                gout24 += gx[0] * gy[832] * gz[256];
+                gout25 += gx[384] * gy[384] * gz[320];
+                gout26 += gx[0] * gy[0] * gz[1088];
+                break;
+                }
+            }
+        }
+        if (ijk_idx < nst) {  // padding iterations store nothing
+            int *ao_loc = envs.ao_loc;
+            int k0 = ao_loc[ksh0] - ao_loc[nbas];  // column of ksh0 relative to ao_loc[nbas]; presumably k shells are indexed after the nbas orbital shells - verify against caller
+            double *eri_tensor = out_local + shl_pair_in_block * 18 * naux + k0 + ksh_in_block * 6;  // 18 rows per shell pair, 6 columns per k shell
+            switch (gout_id) {  // flat component m = row*6 + col is stored by gout_id m % 4 from its gout(m / 4)
+            case 0:
+            eri_tensor[0*naux + 0] = gout0;
+            eri_tensor[0*naux + 4] = gout1;
+            eri_tensor[1*naux + 2] = gout2;
+            eri_tensor[2*naux + 0] = gout3;
+            eri_tensor[2*naux + 4] = gout4;
+            eri_tensor[3*naux + 2] = gout5;
+            eri_tensor[4*naux + 0] = gout6;
+            eri_tensor[4*naux + 4] = gout7;
+            eri_tensor[5*naux + 2] = gout8;
+            eri_tensor[6*naux + 0] = gout9;
+            eri_tensor[6*naux + 4] = gout10;
+            eri_tensor[7*naux + 2] = gout11;
+            eri_tensor[8*naux + 0] = gout12;
+            eri_tensor[8*naux + 4] = gout13;
+            eri_tensor[9*naux + 2] = gout14;
+            eri_tensor[10*naux + 0] = gout15;
+            eri_tensor[10*naux + 4] = gout16;
+            eri_tensor[11*naux + 2] = gout17;
+            eri_tensor[12*naux + 0] = gout18;
+            eri_tensor[12*naux + 4] = gout19;
+            eri_tensor[13*naux + 2] = gout20;
+            eri_tensor[14*naux + 0] = gout21;
+            eri_tensor[14*naux + 4] = gout22;
+            eri_tensor[15*naux + 2] = gout23;
+            eri_tensor[16*naux + 0] = gout24;
+            eri_tensor[16*naux + 4] = gout25;
+            eri_tensor[17*naux + 2] = gout26;
+            break;
+            case 1:
+            eri_tensor[0*naux + 1] = gout0;
+            eri_tensor[0*naux + 5] = gout1;
+            eri_tensor[1*naux + 3] = gout2;
+            eri_tensor[2*naux + 1] = gout3;
+            eri_tensor[2*naux + 5] = gout4;
+            eri_tensor[3*naux + 3] = gout5;
+            eri_tensor[4*naux + 1] = gout6;
+            eri_tensor[4*naux + 5] = gout7;
+            eri_tensor[5*naux + 3] = gout8;
+            eri_tensor[6*naux + 1] = gout9;
+            eri_tensor[6*naux + 5] = gout10;
+            eri_tensor[7*naux + 3] = gout11;
+            eri_tensor[8*naux + 1] = gout12;
+            eri_tensor[8*naux + 5] = gout13;
+            eri_tensor[9*naux + 3] = gout14;
+            eri_tensor[10*naux + 1] = gout15;
+            eri_tensor[10*naux + 5] = gout16;
+            eri_tensor[11*naux + 3] = gout17;
+            eri_tensor[12*naux + 1] = gout18;
+            eri_tensor[12*naux + 5] = gout19;
+            eri_tensor[13*naux + 3] = gout20;
+            eri_tensor[14*naux + 1] = gout21;
+            eri_tensor[14*naux + 5] = gout22;
+            eri_tensor[15*naux + 3] = gout23;
+            eri_tensor[16*naux + 1] = gout24;
+            eri_tensor[16*naux + 5] = gout25;
+            eri_tensor[17*naux + 3] = gout26;
+            break;
+            case 2:
+            eri_tensor[0*naux + 2] = gout0;
+            eri_tensor[1*naux + 0] = gout1;
+            eri_tensor[1*naux + 4] = gout2;
+            eri_tensor[2*naux + 2] = gout3;
+            eri_tensor[3*naux + 0] = gout4;
+            eri_tensor[3*naux + 4] = gout5;
+            eri_tensor[4*naux + 2] = gout6;
+            eri_tensor[5*naux + 0] = gout7;
+            eri_tensor[5*naux + 4] = gout8;
+            eri_tensor[6*naux + 2] = gout9;
+            eri_tensor[7*naux + 0] = gout10;
+            eri_tensor[7*naux + 4] = gout11;
+            eri_tensor[8*naux + 2] = gout12;
+            eri_tensor[9*naux + 0] = gout13;
+            eri_tensor[9*naux + 4] = gout14;
+            eri_tensor[10*naux + 2] = gout15;
+            eri_tensor[11*naux + 0] = gout16;
+            eri_tensor[11*naux + 4] = gout17;
+            eri_tensor[12*naux + 2] = gout18;
+            eri_tensor[13*naux + 0] = gout19;
+            eri_tensor[13*naux + 4] = gout20;
+            eri_tensor[14*naux + 2] = gout21;
+            eri_tensor[15*naux + 0] = gout22;
+            eri_tensor[15*naux + 4] = gout23;
+            eri_tensor[16*naux + 2] = gout24;
+            eri_tensor[17*naux + 0] = gout25;
+            eri_tensor[17*naux + 4] = gout26;
+            break;
+            case 3:
+            eri_tensor[0*naux + 3] = gout0;
+            eri_tensor[1*naux + 1] = gout1;
+            eri_tensor[1*naux + 5] = gout2;
+            eri_tensor[2*naux + 3] = gout3;
+            eri_tensor[3*naux + 1] = gout4;
+            eri_tensor[3*naux + 5] = gout5;
+            eri_tensor[4*naux + 3] = gout6;
+            eri_tensor[5*naux + 1] = gout7;
+            eri_tensor[5*naux + 5] = gout8;
+            eri_tensor[6*naux + 3] = gout9;
+            eri_tensor[7*naux + 1] = gout10;
+            eri_tensor[7*naux + 5] = gout11;
+            eri_tensor[8*naux + 3] = gout12;
+            eri_tensor[9*naux + 1] = gout13;
+            eri_tensor[9*naux + 5] = gout14;
+            eri_tensor[10*naux + 3] = gout15;
+            eri_tensor[11*naux + 1] = gout16;
+            eri_tensor[11*naux + 5] = gout17;
+            eri_tensor[12*naux + 3] = gout18;
+            eri_tensor[13*naux + 1] = gout19;
+            eri_tensor[13*naux + 5] = gout20;
+            eri_tensor[14*naux + 3] = gout21;
+            eri_tensor[15*naux + 1] = gout22;
+            eri_tensor[15*naux + 5] = gout23;
+            eri_tensor[16*naux + 3] = gout24;
+            eri_tensor[17*naux + 1] = gout25;
+            eri_tensor[17*naux + 5] = gout26;
+            break;
+            }
+        }
+    }
+}
+
+__device__  // Dispatch this block to the unrolled int3c2e_bdiv_<li><lj><lk> kernel; returns 1 if one was run, 0 if none exists.
+int int3c2e_bdiv_unrolled(double *out, Int3c2eEnvVars envs, BDiv3c2eBounds bounds)
+{
+    int sp_block_id = gridDim.x - blockIdx.x - 1;  // blocks are mapped in reverse blockIdx order
+    int ksh_block_id = gridDim.y - blockIdx.y - 1;
+    int shl_pair0 = bounds.shl_pair_offsets[sp_block_id];  // first shell pair of this block
+    int ksh0 = bounds.ksh_offsets[ksh_block_id];  // first k shell of this block
+    int bas_ij0 = bounds.bas_ij_idx[shl_pair0];  // packed pair index: ish*nbas + jsh
+    int nbas = envs.nbas;
+    int ish0 = bas_ij0 / nbas;
+    int jsh0 = bas_ij0 % nbas;
+    int *bas = envs.bas;
+    int li = bas[ish0*BAS_SLOTS+ANG_OF];  // angular momenta are read from the block's first shells only;
+    int lj = bas[jsh0*BAS_SLOTS+ANG_OF];  // presumably every shell in the block shares them - TODO confirm
+    int lk = bas[ksh0*BAS_SLOTS+ANG_OF];
+    int kij_type = lk*25 + li*5 + lj;  // base-5 key; e.g. 61 = 2*25 + 2*5 + 1 selects int3c2e_bdiv_212
+    switch (kij_type) {  // kernel name suffix is the digits li, lj, lk
+    case 0:  int3c2e_bdiv_000(out, envs, bounds); break;
+    case 5:  int3c2e_bdiv_100(out, envs, bounds); break;
+    case 6:  int3c2e_bdiv_110(out, envs, bounds); break;
+    case 10: int3c2e_bdiv_200(out, envs, bounds); break;
+    case 11: int3c2e_bdiv_210(out, envs, bounds); break;
+    case 12: int3c2e_bdiv_220(out, envs, bounds); break;
+    case 25: int3c2e_bdiv_001(out, envs, bounds); break;
+    case 30: int3c2e_bdiv_101(out, envs, bounds); break;
+    case 31: int3c2e_bdiv_111(out, envs, bounds); break;
+    case 35: int3c2e_bdiv_201(out, envs, bounds); break;
+    case 36: int3c2e_bdiv_211(out, envs, bounds); break;
+    case 37: int3c2e_bdiv_221(out, envs, bounds); break;
+    case 50: int3c2e_bdiv_002(out, envs, bounds); break;
+    case 55: int3c2e_bdiv_102(out, envs, bounds); break;
+    case 56: int3c2e_bdiv_112(out, envs, bounds); break;
+    case 60: int3c2e_bdiv_202(out, envs, bounds); break;
+    case 61: int3c2e_bdiv_212(out, envs, bounds); break;
+    default: return 0;  // no unrolled kernel for this key (e.g. 62 = li 2, lj 2, lk 2, or any li < lj); nothing was computed
+    }
+    return 1;
+}
diff --git a/gpu4pyscf/pbc/df/int3c2e.py b/gpu4pyscf/pbc/df/int3c2e.py
index f92b6ef6..8e53289f 100644
--- a/gpu4pyscf/pbc/df/int3c2e.py
+++ b/gpu4pyscf/pbc/df/int3c2e.py
@@ -287,7 +287,8 @@ def int3c2e_kernel(self, cutoff=None, verbose=None):
         gen_img_idx = create_img_idx(cell, bvkcell, auxcell, Ls, int3c2e_envs)
 
         uniq_l = uniq_l_ctr[:,0]
-        n_groups = np.count_nonzero(uniq_l <= LMAX)
+        assert uniq_l.max() <= LMAX
+        n_groups = len(uniq_l)
         init_constant(cell)
         kern = libpbc.fill_int3c2e
         cp.cuda.Stream.null.synchronize()
diff --git a/gpu4pyscf/scf/int4c2e.py b/gpu4pyscf/scf/int4c2e.py
index c0874ca1..b40377cc 100644
--- a/gpu4pyscf/scf/int4c2e.py
+++ b/gpu4pyscf/scf/int4c2e.py
@@ -20,7 +20,7 @@
 import cupy
 from pyscf import gto
 from pyscf.scf import _vhf
-from gpu4pyscf.lib.cupy_helper import block_c2s_diag, cart2sph, block_diag, contract, load_library, c2s_l
+from gpu4pyscf.lib.cupy_helper import block_c2s_diag, cart2sph, block_diag, contract, load_library
 from gpu4pyscf.lib import logger
 from gpu4pyscf.gto.mole import basis_seg_contraction
 

From c65406b865a4a1f4469ad51697e0347c437ef0c9 Mon Sep 17 00:00:00 2001
From: "xiaojie.wu" <xiaojie.wu@bytedance.com>
Date: Thu, 27 Feb 2025 03:14:07 +0800
Subject: [PATCH 5/6] disable unused build

---
 builder/build_libxc.sh       |  2 +-
 gpu4pyscf/lib/CMakeLists.txt | 17 ++++++++++++++---
 2 files changed, 15 insertions(+), 4 deletions(-)

diff --git a/builder/build_libxc.sh b/builder/build_libxc.sh
index 486b935f..dece8b43 100644
--- a/builder/build_libxc.sh
+++ b/builder/build_libxc.sh
@@ -23,7 +23,7 @@ rm -rf /gpu4pyscf/put4pyscf/lib/*.so
 
 setup_dir=$(dirname $0)
 
-cmake -S /gpu4pyscf/gpu4pyscf/lib -B build/temp.gpu4pyscf-libxc -DBUILD_DFTD3=OFF -DBUILD_DFTD4=OFF -DBUILD_GINT=OFF -DBUILD_GVHF=OFF -DBUILD_GDFT=OFF -DBUILD_CUPY_HELPER=OFF -DBUILD_SOLVENT=OFF
+cmake -S /gpu4pyscf/gpu4pyscf/lib -B build/temp.gpu4pyscf-libxc -DBUILD_GINT=OFF -DBUILD_GVHF=OFF -DBUILD_GDFT=OFF -DBUILD_CUPY_HELPER=OFF -DBUILD_SOLVENT=OFF -DBUILD_GVHF_RYS=OFF -DBUILD_GVHF_MD=OFF -DBUILD_PBC=OFF -DCUDA_ARCHITECTURES="70-real"
 cmake --build build/temp.gpu4pyscf-libxc -j 4
 
 mkdir -p build/lib.gpu4pyscf-libxc/gpu4pyscf/lib/deps/lib
diff --git a/gpu4pyscf/lib/CMakeLists.txt b/gpu4pyscf/lib/CMakeLists.txt
index 521c2361..bb275d18 100644
--- a/gpu4pyscf/lib/CMakeLists.txt
+++ b/gpu4pyscf/lib/CMakeLists.txt
@@ -144,9 +144,20 @@ if(BUILD_SOLVENT)
   add_subdirectory(solvent)
 endif()
 
-add_subdirectory(gvhf-rys)
-add_subdirectory(gvhf-md)
-add_subdirectory(pbc)
+option(BUILD_GVHF_RYS "Using gvhf-rys" ON)
+if(BUILD_GVHF_RYS)
+  add_subdirectory(gvhf-rys)
+endif()
+
+option(BUILD_GVHF_MD "Using gvhf-md" ON)
+if(BUILD_GVHF_MD)
+  add_subdirectory(gvhf-md)
+endif()
+
+option(BUILD_PBC "Using PBC" ON)
+if(BUILD_PBC)
+  add_subdirectory(pbc)
+endif()
 
 option(BUILD_LIBXC "Using libxc for DFT" ON)
 if(BUILD_LIBXC)

From 1d805bd9cf7d2af99e707d4c5d364c94dc0fc226 Mon Sep 17 00:00:00 2001
From: "xiaojie.wu" <xiaojie.wu@bytedance.com>
Date: Thu, 27 Feb 2025 03:28:13 +0800
Subject: [PATCH 6/6] add gint-rys back

---
 gpu4pyscf/lib/CMakeLists.txt | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/gpu4pyscf/lib/CMakeLists.txt b/gpu4pyscf/lib/CMakeLists.txt
index bb275d18..286060be 100644
--- a/gpu4pyscf/lib/CMakeLists.txt
+++ b/gpu4pyscf/lib/CMakeLists.txt
@@ -144,6 +144,11 @@ if(BUILD_SOLVENT)
   add_subdirectory(solvent)
 endif()
 
+option(BUILD_GINT_RYS "Using gint-rys" ON)
+if(BUILD_GINT_RYS)
+  add_subdirectory(gint-rys)
+endif()
+
 option(BUILD_GVHF_RYS "Using gvhf-rys" ON)
 if(BUILD_GVHF_RYS)
   add_subdirectory(gvhf-rys)
@@ -154,7 +159,7 @@ if(BUILD_GVHF_MD)
   add_subdirectory(gvhf-md)
 endif()
 
-option(BUILD_PBC "Using PBC" ON)
+option(BUILD_PBC "Using pbc" ON)
 if(BUILD_PBC)
   add_subdirectory(pbc)
 endif()