From b5050f9fff77b1a32f347186ca6962e5734e475c Mon Sep 17 00:00:00 2001 From: Xiaojie Wu Date: Tue, 25 Feb 2025 20:23:58 -0800 Subject: [PATCH 1/6] Enable building KXC --- gpu4pyscf/lib/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gpu4pyscf/lib/CMakeLists.txt b/gpu4pyscf/lib/CMakeLists.txt index 91e1d87a..521c2361 100644 --- a/gpu4pyscf/lib/CMakeLists.txt +++ b/gpu4pyscf/lib/CMakeLists.txt @@ -157,7 +157,7 @@ if(BUILD_LIBXC) PREFIX ${PROJECT_BINARY_DIR}/deps INSTALL_DIR ${PROJECT_SOURCE_DIR}/deps CMAKE_ARGS -DBUILD_SHARED_LIBS=ON -DENABLE_CUDA=ON - -DENABLE_FORTRAN=OFF -DDISABLE_KXC=ON -DDISABLE_LXC=ON -DDISABLE_FHC=ON + -DENABLE_FORTRAN=OFF -DDISABLE_KXC=OFF -DDISABLE_LXC=ON -DDISABLE_FHC=ON -DCMAKE_INSTALL_PREFIX:PATH= -DCMAKE_INSTALL_LIBDIR:PATH=lib -DCMAKE_C_CREATE_SHARED_LIBRARY=${C_LINK_TEMPLATE} From c716d7ab30ea52c392549f31936f6f2b1c091fa5 Mon Sep 17 00:00:00 2001 From: Xiaojie Wu Date: Tue, 25 Feb 2025 20:27:42 -0800 Subject: [PATCH 2/6] Bump version --- builder/setup_libxc.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/builder/setup_libxc.py b/builder/setup_libxc.py index e4e94af6..e19bf94e 100644 --- a/builder/setup_libxc.py +++ b/builder/setup_libxc.py @@ -27,15 +27,15 @@ from distutils.util import get_platform NAME = 'gpu4pyscf-libxc' -AUTHOR = 'Qiming Sun' -AUTHOR_EMAIL = 'osirpt.sun@gmail.com' -DESCRIPTION = 'GPU extensions for PySCF' -LICENSE = 'GPLv3' +AUTHOR = 'PySCF developers' +AUTHOR_EMAIL = None +DESCRIPTION = 'Customized LibXC for GPU4PySCF' +LICENSE = 'Apache-2.0' URL = None DOWNLOAD_URL = None CLASSIFIERS = None PLATFORMS = None -VERSION = '0.5' +VERSION = '0.6' def get_cuda_version(): nvcc_out = subprocess.check_output(["nvcc", "--version"]).decode('utf-8') From 13f7083b3fd5cb7fd5cdcb2314047fc65381aa40 Mon Sep 17 00:00:00 2001 From: puzhichen <147788878+puzhichen@users.noreply.github.com> Date: Wed, 26 Feb 2025 23:36:03 +0800 Subject: [PATCH 3/6] add tddft in nightly build 
(#338) : --- .github/workflows/nightly_build.yml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.github/workflows/nightly_build.yml b/.github/workflows/nightly_build.yml index 29ec300f..b1e26090 100644 --- a/.github/workflows/nightly_build.yml +++ b/.github/workflows/nightly_build.yml @@ -46,3 +46,8 @@ jobs: echo $GITHUB_WORKSPACE export PYTHONPATH="${PYTHONPATH}:$(pwd)" pytest gpu4pyscf/tests/test_benchmark_uks.py -s -v -m "not slow and not high_memory" --benchmark-compare-fail=min:10% --benchmark-compare=v1.3.0_uks_1v100 --benchmark-storage=gpu4pyscf/tests/benchmark_results/ + - name: Test TDDFT + run: | + echo $GITHUB_WORKSPACE + export PYTHONPATH="${PYTHONPATH}:$(pwd)" + pytest gpu4pyscf/tests/test_benchmark_tddft.py -s -v -m "not slow and not high_memory" --benchmark-compare-fail=min:10% --benchmark-compare=v1.3.0_tddft_1v100 --benchmark-storage=gpu4pyscf/tests/benchmark_results/ From 3f3fad7786cbc76b8bd20f5717a8373edc7ca2f2 Mon Sep 17 00:00:00 2001 From: Qiming Sun Date: Wed, 26 Feb 2025 09:28:38 -0800 Subject: [PATCH 4/6] Block divergent optimization for the molecular int3c2e integral tensor (#337) * Block-divergent int3c2e * int3c2e correct * int3c2e block-divergent version correct * unroll int3c2e * add unrolled_int3c2e_bdiv * Add sort_orbitals and unsort_orbitals functions for int3c2e_bdiv * Compatibility between new int3c2e and existing implmentations * fixes * Fixes for int3c2e_bdiv version * Remove unused code * Removing debug code * Add missing file * Import circular dependency * Fix merging --------- Co-authored-by: Qiming Sun --- gpu4pyscf/df/df.py | 53 + gpu4pyscf/df/int3c2e_bdiv.py | 485 ++ gpu4pyscf/df/tests/test_df_int3c2e.py | 130 + gpu4pyscf/gto/mole.py | 79 +- gpu4pyscf/lib/CMakeLists.txt | 1 + gpu4pyscf/lib/cupy_helper.py | 24 +- gpu4pyscf/lib/gint-rys/CMakeLists.txt | 13 + gpu4pyscf/lib/gint-rys/fill_int3c2e.cu | 302 ++ gpu4pyscf/lib/gint-rys/fill_int3c2e_bdiv.cu | 330 ++ gpu4pyscf/lib/gint-rys/gint_driver.cu | 145 + 
gpu4pyscf/lib/gint-rys/int3c2e.cuh | 75 + gpu4pyscf/lib/gint-rys/rys_roots_dat.cu | 1 + gpu4pyscf/lib/gint-rys/unrolled_int3c2e.cu | 3947 ++++++++++++++++ .../lib/gint-rys/unrolled_int3c2e_bdiv.cu | 4093 +++++++++++++++++ gpu4pyscf/pbc/df/int3c2e.py | 3 +- gpu4pyscf/scf/int4c2e.py | 2 +- 16 files changed, 9651 insertions(+), 32 deletions(-) create mode 100644 gpu4pyscf/df/int3c2e_bdiv.py create mode 100644 gpu4pyscf/df/tests/test_df_int3c2e.py create mode 100644 gpu4pyscf/lib/gint-rys/CMakeLists.txt create mode 100644 gpu4pyscf/lib/gint-rys/fill_int3c2e.cu create mode 100644 gpu4pyscf/lib/gint-rys/fill_int3c2e_bdiv.cu create mode 100644 gpu4pyscf/lib/gint-rys/gint_driver.cu create mode 100644 gpu4pyscf/lib/gint-rys/int3c2e.cuh create mode 100644 gpu4pyscf/lib/gint-rys/rys_roots_dat.cu create mode 100644 gpu4pyscf/lib/gint-rys/unrolled_int3c2e.cu create mode 100644 gpu4pyscf/lib/gint-rys/unrolled_int3c2e_bdiv.cu diff --git a/gpu4pyscf/df/df.py b/gpu4pyscf/df/df.py index c58c1428..9bcda36d 100644 --- a/gpu4pyscf/df/df.py +++ b/gpu4pyscf/df/df.py @@ -23,6 +23,7 @@ from gpu4pyscf.lib.cupy_helper import (cholesky, tag_array, get_avail_mem, cart2sph, p2p_transfer, copy_array) from gpu4pyscf.df import int3c2e, df_jk +from gpu4pyscf.df import int3c2e_bdiv from gpu4pyscf.lib import logger from gpu4pyscf import __config__ from gpu4pyscf.__config__ import _streams, num_devices @@ -30,6 +31,7 @@ MIN_BLK_SIZE = getattr(__config__, 'min_ao_blksize', 128) ALIGNED = getattr(__config__, 'ao_aligned', 32) GB = 1024*1024*1024 +INT3C2E_V2 = False LINEAR_DEP_THR = incore.LINEAR_DEP_THR GROUP_SIZE = 256 @@ -82,6 +84,42 @@ def build(self, direct_scf_tol=1e-14, omega=None): if auxmol is None: self.auxmol = auxmol = addons.make_auxmol(mol, self.auxbasis) + if INT3C2E_V2: + self.intopt = intopt = int3c2e_bdiv.Int3c2eOpt(mol, auxmol) + self._cderi = {} + self._cderi[0] = _cholesky_eri_bdiv(intopt, omega=omega) + ao_pair_mapping = intopt.create_ao_pair_mapping(cart=mol.cart) + rows, cols = 
divmod(cupy.asarray(ao_pair_mapping), mol.nao) + intopt.cderi_row = rows + intopt.cderi_col = cols + + # intopt.cderi_diag stores the indices for cderi_row that + # correspond to the diagonal shell pairs (ish == jsh). Note this index + # array also contains off-diagonal AO pairs, namely those that lie + # within a diagonal shell pair (all nfi*nfi elements are included). + uniq_l = intopt.uniq_l_ctr[:,0] + if mol.cart: + nf = (uniq_l + 1) * (uniq_l + 2) // 2 + else: + nf = uniq_l * 2 + 1 + n_groups = len(uniq_l) + ij_tasks = ((i, j) for i in range(n_groups) for j in range(i+1)) + nbas = intopt.sorted_mol.nbas + offset = 0 + cderi_diag = [] + for (i, j), bas_ij_idx in zip(ij_tasks, intopt.shl_pair_idx): + nfi = nf[i] + nfj = nf[j] + if i == j: # the diagonal blocks + ish, jsh = divmod(bas_ij_idx, nbas) + idx = np.where(ish == jsh)[0] + addr = offset + idx[:,None] * (nfi*nfi) + np.arange(nfi*nfi) + cderi_diag.append(addr.ravel()) + offset += bas_ij_idx.size * nfi * nfj + intopt.cderi_diag = cupy.asarray(np.hstack(cderi_diag)) + log.timer_debug1('cholesky_eri', *t0) + return self + if omega and omega > 1e-10: with auxmol.with_range_coulomb(omega): j2c_cpu = auxmol.intor('int2c2e', hermi=1) @@ -364,3 +402,18 @@ def _cderi_task(intopt, cd_low, task_list, _cderi, aux_blksize, _cderi[0][:,ij0:ij1] = cderi_block t1 = log.timer_debug1(f'transfer data for {cp_ij_id} / {nq} on Device {device_id}', *t1) return + +# Generate CDERI using the new int3c2e_bdiv algorithm +def _cholesky_eri_bdiv(intopt, omega=None): + assert isinstance(intopt, int3c2e_bdiv.Int3c2eOpt) + assert omega is None + eri3c = intopt.int3c2e_bdiv_kernel() + if not intopt.mol.cart: # kernel output is Cartesian; spherical mol needs cart2sph + eri3c = intopt.orbital_pair_cart2sph(eri3c) + auxmol = intopt.auxmol + j2c = cupy.asarray(auxmol.intor('int2c2e', hermi=1), order='C') + cd_low = cholesky(j2c) + aux_coeff = cupy.array(intopt.aux_coeff, copy=True) + cd_low = solve_triangular(cd_low, aux_coeff.T, lower=True, overwrite_b=True) + cderi = cd_low.dot(eri3c.T) + return cderi diff --git 
a/gpu4pyscf/df/int3c2e_bdiv.py b/gpu4pyscf/df/int3c2e_bdiv.py new file mode 100644 index 00000000..d915d78f --- /dev/null +++ b/gpu4pyscf/df/int3c2e_bdiv.py @@ -0,0 +1,485 @@ +# Copyright 2025 The PySCF Developers. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +''' +3-center 2-electron Coulomb integral helper functions +''' + +import ctypes +import math +import numpy as np +import cupy as cp +from pyscf import lib +from pyscf.lib.parameters import ANGULAR +from pyscf.gto.mole import ANG_OF, ATOM_OF, PTR_COORD, PTR_EXP, conc_env +from gpu4pyscf.lib import logger +from gpu4pyscf.lib.cupy_helper import load_library, contract +from gpu4pyscf.gto.mole import group_basis, PTR_BAS_COORD +from gpu4pyscf.scf.jk import g_pair_idx, _nearest_power2, _scale_sp_ctr_coeff, SHM_SIZE +from gpu4pyscf.gto.mole import basis_seg_contraction, extract_pgto_params, cart2sph_by_l + +__all__ = [ + 'aux_e2', +] +libgint_rys = load_library('libgint_rys') +libgint_rys.fill_int3c2e.restype = ctypes.c_int +libgint_rys.fill_int3c2e_bdiv.restype = ctypes.c_int +libgint_rys.init_constant.restype = ctypes.c_int + +LMAX = 4 +L_AUX_MAX = 6 +GOUT_WIDTH = 45 +THREADS = 256 + +def aux_e2(mol, auxmol): + r''' + Short-range 3-center integrals (ij|k). The auxiliary basis functions are + placed at the second electron. 
+ ''' + int3c2e_opt = Int3c2eOpt(mol, auxmol).build() + ao_pair_mapping = cp.asarray(int3c2e_opt.create_ao_pair_mapping()) + nao, nao_orig = int3c2e_opt.coeff.shape + naux = int3c2e_opt.aux_coeff.shape[0] + out = cp.zeros((nao*nao, naux)) + p0 = p1 = 0 + for ij_shls, eri3c in int3c2e_opt.int3c2e_kernel(): + p0, p1 = p1, p1 + eri3c.shape[0] + addr = ao_pair_mapping[p0:p1] + out[addr] = eri3c + i, j = divmod(addr, nao) + out[j*nao+i] = eri3c + log = logger.new_logger(mol) + t1 = log.init_timer() + out = out.reshape(nao, nao, naux) + aux_coeff = cp.asarray(int3c2e_opt.aux_coeff) + coeff = cp.asarray(int3c2e_opt.coeff) + out = contract('pqr,rk->pqk', out, aux_coeff) + out = contract('pqk,qj->pjk', out, coeff) + out = contract('pjk,pi->ijk', out, coeff) + t1 = log.timer_debug1('aux_e2: transform basis ordering', *t1) + return out + +class Int3c2eOpt: + def __init__(self, mol, auxmol): + self.mol = mol + self.auxmol = auxmol + self.sorted_mol = None + + def build(self, cutoff=1e-14): + log = logger.new_logger(self.mol) + t0 = log.init_timer() + # allow_replica=True to transform the general contracted basis sets into + # segment contracted sets + mol, c2s = basis_seg_contraction(self.mol, allow_replica=True) + mol, coeff, uniq_l_ctr, l_ctr_counts, bas_mapping = group_basis( + mol, tile=1, return_bas_mapping=True) + self.sorted_mol = mol + self.uniq_l_ctr = uniq_l_ctr + l_ctr_offsets = self.l_ctr_offsets = np.append(0, np.cumsum(l_ctr_counts)) + self.coeff = coeff.dot(c2s).get() + # Sorted AO indices, allow using the fancyindices to transform tensors + # between sorted_mol and mol (see function sort_orbitals) + ao_loc = mol.ao_loc_nr(cart=self.mol.cart) + ao_idx = np.array_split(np.arange(self.mol.nao), ao_loc[1:-1]) + self.ao_idx = np.hstack([ao_idx[i] for i in bas_mapping]).argsort() + + auxmol, coeff, uniq_l_ctr_aux, l_ctr_aux_counts = group_basis(self.auxmol, tile=1) + self.sorted_auxmol = auxmol + self.uniq_l_ctr_aux = uniq_l_ctr_aux + l_ctr_aux_offsets = 
self.l_ctr_aux_offsets = np.append(0, np.cumsum(l_ctr_aux_counts)) + self.aux_coeff = coeff.get() + + _atm_cpu, _bas_cpu, _env_cpu = conc_env( + mol._atm, mol._bas, _scale_sp_ctr_coeff(mol), + auxmol._atm, auxmol._bas, _scale_sp_ctr_coeff(auxmol)) + #NOTE: PTR_BAS_COORD is not updated in conc_env() + off = _bas_cpu[mol.nbas,PTR_EXP] - auxmol._bas[0,PTR_EXP] + _bas_cpu[mol.nbas:,PTR_BAS_COORD] += off + self._atm = _atm_cpu + self._bas = _bas_cpu + self._env = _env_cpu + + ao_loc_cpu = mol.ao_loc + aux_loc = auxmol.ao_loc + + _atm = cp.array(_atm_cpu, dtype=np.int32) + _bas = cp.array(_bas_cpu, dtype=np.int32) + _env = cp.array(_env_cpu, dtype=np.float64) + ao_loc = cp.asarray(_conc_locs(ao_loc_cpu, aux_loc), dtype=np.int32) + self.int3c2e_envs = Int3c2eEnvVars( + mol.natm, mol.nbas, _atm.data.ptr, _bas.data.ptr, _env.data.ptr, + ao_loc.data.ptr, math.log(cutoff), + ) + # Keep a reference to these arrays, prevent releasing them upon returning the closure + self.int3c2e_envs._env_ref_holder = (_atm, _bas, _env, ao_loc) + + nksh_per_block = 16 + # the auxiliary function offset (address) in the output tensor for each blockIdx.y + ksh_offsets = [] + for ksh0, ksh1 in zip(l_ctr_aux_offsets[:-1], l_ctr_aux_offsets[1:]): + ksh_offsets.append(np.arange(ksh0, ksh1, nksh_per_block, dtype=np.int32)) + ksh_offsets.append(l_ctr_aux_offsets[-1]) + ksh_offsets = np.hstack(ksh_offsets) + ksh_offsets += mol.nbas + self.ksh_offsets = ksh_offsets + + uniq_l = uniq_l_ctr[:,0] + assert uniq_l.max() <= LMAX + n_groups = len(uniq_l) + ij_tasks = ((i, j) for i in range(n_groups) for j in range(i+1)) + + ovlp = estimate_shl_ovlp(mol) + mask = np.tril(ovlp > cutoff) + # The effective shell pair = ish*nbas+jsh + shl_pair_idx = [] + # the bas_ij_idx offset for each blockIdx.x + shl_pair_offsets = [] + # the AO-pair offset (address) in the output tensor for each blockIdx.x + ao_pair_loc = [] + nao_pair0 = nao_pair = 0 + sp0 = sp1 = 0 + nbas = mol.nbas + for i, j in ij_tasks: + li = uniq_l[i] + 
lj = uniq_l[j] + ish0, ish1 = l_ctr_offsets[i], l_ctr_offsets[i+1] + jsh0, jsh1 = l_ctr_offsets[j], l_ctr_offsets[j+1] + ish, jsh = np.where(mask[ish0:ish1,jsh0:jsh1]) + ish += ish0 + jsh += jsh0 + idx = ish * nbas + jsh + nshl_pair = idx.size + shl_pair_idx.append(idx) + nfi = (li + 1) * (li + 2) // 2 + nfj = (lj + 1) * (lj + 2) // 2 + nfij = nfi * nfj + nao_pair0, nao_pair = nao_pair, nao_pair + nfij * nshl_pair + + sp0, sp1 = sp1, sp1 + nshl_pair + nsp_per_block = _estimate_shl_pairs_per_block(li, lj, nshl_pair) + shl_pair_offsets.append(np.arange(sp0, sp1, nsp_per_block, dtype=np.int32)) + ao_pair_loc.append( + np.arange(nao_pair0, nao_pair, nsp_per_block*nfij, dtype=np.int32)) + if log.verbose >= logger.DEBUG2: + log.debug2('group=(%d,%d), li,lj=(%d,%d), sp range(%d,%d,%d), ' + 'nao_pair offset=%d', + i, j, li, lj, sp0, sp1, nsp_per_block, nao_pair0) + + self.shl_pair_idx = shl_pair_idx + shl_pair_offsets.append([sp1]) + self.shl_pair_offsets = np.hstack(shl_pair_offsets) + ao_pair_loc.append(nao_pair) + self.ao_pair_loc = np.hstack(ao_pair_loc) + if log.verbose >= logger.DEBUG1: + log.timer_debug1('initialize int3c2e_kernel', *t0) + return self + + def int3c2e_kernel(self, cutoff=1e-14, verbose=None): + if self.sorted_mol is None: + self.build(cutoff) + log = logger.new_logger(self.mol, verbose) + t0 = t1 = log.init_timer() + l_ctr_offsets = self.l_ctr_offsets + l_ctr_aux_offsets = self.l_ctr_aux_offsets + int3c2e_envs = self.int3c2e_envs + _atm_cpu = self._atm + _bas_cpu = self._bas + _env_cpu = self._env + mol = self.sorted_mol + aux_loc = self.sorted_auxmol.ao_loc + naux = aux_loc[-1] + + uniq_l = self.uniq_l_ctr[:,0] + nfcart = (uniq_l + 1) * (uniq_l + 2) // 2 + n_groups = len(uniq_l) + ij_tasks = [(i, j) for i in range(n_groups) for j in range(i+1)] + npair_ij = 0 + for (i, j), bas_ij_idx in zip(ij_tasks, self.shl_pair_idx): + nfij = nfcart[i] * nfcart[j] + npair_ij = max(npair_ij, len(bas_ij_idx) * nfij) + buf = cp.empty((npair_ij, naux)) + + 
init_constant(mol) + kern = libgint_rys.fill_int3c2e + timing_collection = {} + kern_counts = 0 + + for (i, j), bas_ij_idx in zip(ij_tasks, self.shl_pair_idx): + ish0, ish1 = l_ctr_offsets[i], l_ctr_offsets[i+1] + jsh0, jsh1 = l_ctr_offsets[j], l_ctr_offsets[j+1] + npair_ij = len(bas_ij_idx) + bas_ij_idx = cp.asarray(bas_ij_idx, dtype=np.int32) + li = uniq_l[i] + lj = uniq_l[j] + nfij = nfcart[i] * nfcart[j] + eri3c = cp.ndarray((npair_ij*nfij, naux), dtype=np.float64, memptr=buf.data) + + for k, lk in enumerate(self.uniq_l_ctr_aux[:,0]): + ksh0, ksh1 = l_ctr_aux_offsets[k:k+2] + shls_slice = ish0, ish1, jsh0, jsh1, ksh0, ksh1 + lll = f'({ANGULAR[li]}{ANGULAR[lj]}|{ANGULAR[lk]})' + scheme = int3c2e_scheme(li, lj, lk) + log.debug2('int3c2e_scheme for %s: %s', lll, scheme) + err = kern( + ctypes.cast(eri3c.data.ptr, ctypes.c_void_p), + ctypes.byref(int3c2e_envs), (ctypes.c_int*3)(*scheme), + (ctypes.c_int*6)(*shls_slice), aux_loc.ctypes, + ctypes.c_int(naux), ctypes.c_int(npair_ij), + ctypes.cast(bas_ij_idx.data.ptr, ctypes.c_void_p), + _atm_cpu.ctypes, ctypes.c_int(mol.natm), + _bas_cpu.ctypes, ctypes.c_int(mol.nbas), _env_cpu.ctypes) + if err != 0: + raise RuntimeError(f'fill_int3c2e kernel for {lll} failed') + if log.verbose >= logger.DEBUG1: + t1, t1p = log.timer_debug1(f'processing {lll}', *t1), t1 + if lll not in timing_collection: + timing_collection[lll] = 0 + timing_collection[lll] += t1[1] - t1p[1] + kern_counts += 1 + + ij_shls = ish0, ish1, jsh0, jsh1 + yield ij_shls, eri3c + + if log.verbose >= logger.DEBUG1: + cp.cuda.Stream.null.synchronize() + log.timer('int3c2e', *t0) + log.debug1('kernel launches %d', kern_counts) + for lll, t in timing_collection.items(): + log.debug1('%s wall time %.2f', lll, t) + + def int3c2e_bdiv_kernel(self, cutoff=1e-14, verbose=None): + '''Construct the entire block using the block-divergent parallelism''' + if self.sorted_mol is None: + self.build(cutoff) + log = logger.new_logger(self.mol, verbose) + t0 = log.init_timer() 
+ int3c2e_envs = self.int3c2e_envs + _atm_cpu = self._atm + _bas_cpu = self._bas + _env_cpu = self._env + mol = self.sorted_mol + aux_loc = self.sorted_auxmol.ao_loc + naux = aux_loc[-1] + nao_pair = self.ao_pair_loc[-1] + + # nst_lookup stores the nst_per_block for each (li,lj,lk) pattern + nst_lookup = cp.asarray(create_nst_lookup_table(), dtype=np.int32) + + shl_pair_idx = cp.asarray(np.hstack(self.shl_pair_idx), dtype=np.int32) + shl_pair_offsets = cp.asarray(self.shl_pair_offsets, dtype=np.int32) + ksh_offsets = cp.asarray(self.ksh_offsets, dtype=np.int32) + nbatches_shl_pair = len(shl_pair_offsets) - 1 + nbatches_ksh = len(ksh_offsets) - 1 + ao_pair_loc = cp.asarray(self.ao_pair_loc, dtype=np.int32) + log.debug1('sp_blocks = %d, ksh_blocks = %d', nbatches_shl_pair, nbatches_ksh) + + init_constant(mol) + kern = libgint_rys.fill_int3c2e_bdiv + eri3c = cp.empty((nao_pair, naux)) + err = kern( + ctypes.cast(eri3c.data.ptr, ctypes.c_void_p), + ctypes.byref(int3c2e_envs), + ctypes.c_int(SHM_SIZE), ctypes.c_int(naux), + ctypes.c_int(nbatches_shl_pair), ctypes.c_int(nbatches_ksh), + ctypes.cast(shl_pair_offsets.data.ptr, ctypes.c_void_p), + ctypes.cast(ao_pair_loc.data.ptr, ctypes.c_void_p), + ctypes.cast(ksh_offsets.data.ptr, ctypes.c_void_p), + ctypes.cast(shl_pair_idx.data.ptr, ctypes.c_void_p), + ctypes.cast(nst_lookup.data.ptr, ctypes.c_void_p), + _atm_cpu.ctypes, ctypes.c_int(mol.natm), + _bas_cpu.ctypes, ctypes.c_int(mol.nbas), _env_cpu.ctypes) + if err != 0: + raise RuntimeError('fill_int3c2e_bdiv kernel failed') + if log.verbose >= logger.DEBUG1: + cp.cuda.Stream.null.synchronize() + log.timer_debug1('processing int3c2e_bdiv_kernel', *t0) + return eri3c + + def create_ao_pair_mapping(self, cart=True): + '''ao_pair_mapping stores AO-pair addresses in the nao x nao matrix, + which allows the decompression for the CUDA kernel generated compressed_eri3c: + sparse_eri3c[ao_pair_mapping] = compressed_eri3c + + int3c2e CUDA kernel stores intgrals as 
[ij_shl,j,i,k,ksh]. + ao_pair_mapping indicates the ij addresses in eri3c[k,i,j]; + ''' + mol = self.sorted_mol + ao_loc = mol.ao_loc_nr(cart) + nao = ao_loc[-1] + uniq_l = self.uniq_l_ctr[:,0] + if cart: + nf = (uniq_l + 1) * (uniq_l + 2) // 2 + else: + nf = uniq_l * 2 + 1 + n_groups = len(uniq_l) + ij_tasks = ((i, j) for i in range(n_groups) for j in range(i+1)) + nbas = mol.nbas + ao_pair_mapping = [] + for (i, j), bas_ij_idx in zip(ij_tasks, self.shl_pair_idx): + ish, jsh = divmod(bas_ij_idx, nbas) + nfi = nf[i] + nfj = nf[j] + iaddr = ao_loc[ish,None] + np.arange(nfi) + jaddr = ao_loc[jsh,None] + np.arange(nfj) + ao_pair_mapping.append((iaddr[:,None,:] * nao + jaddr[:,:,None]).ravel()) + return np.hstack(ao_pair_mapping) + + def orbital_pair_cart2sph(self, compressed_eri3c, inplace=True): + '''Transforms the AO of the compressed eri3c from Cartesian to spherical basis''' + if inplace: + out = compressed_eri3c + else: + out = compressed_eri3c.copy() + uniq_l = self.uniq_l_ctr[:,0] + n_groups = len(uniq_l) + ij_tasks = ((i, j) for i in range(n_groups) for j in range(i+1)) + c2s = [cart2sph_by_l(l) for l in uniq_l] + naux = compressed_eri3c.shape[1] + npair0 = npair = 0 + p0 = p1 = 0 + for (i, j), bas_ij_idx in zip(ij_tasks, self.shl_pair_idx): + nshl_pair = bas_ij_idx.size + ci = c2s[i] + cj = c2s[j] + nfi, di = ci.shape + nfj, dj = cj.shape + npair0, npair = npair, npair + nfi*nfj * nshl_pair + p0, p1 = p1, p1 + di*dj * nshl_pair + if npair > len(compressed_eri3c): # the whole Cartesian block must fit in the input + raise RuntimeError('Size mismatch. 
The eri3c may have been transformed') + t = compressed_eri3c[npair0:npair].reshape(nshl_pair,nfj,nfi,naux) + t = contract('mpqr,pj->mjqr', t, cj) + t = contract('mjqr,qi->mjir', t, ci) + out[p0:p1] = t.reshape(p1-p0,naux) + return out[:p1] + + def sort_orbitals(self, mat, axis=[]): + ''' Transform given axis of a matrix into sorted AO''' + ndim_to_transform = len(axis) + assert ndim_to_transform <= 2 + if ndim_to_transform == 0: + return mat + + idx = self.ao_idx + fancy_index = [slice(None)] * mat.ndim + if ndim_to_transform == 1: + fancy_index[axis[0]] = idx + elif ndim_to_transform == 2: + assert abs(axis[0] - axis[1]) == 1, 'Must be adjacent axes' + fancy_index[axis[0]] = idx[:,None] + fancy_index[axis[1]] = idx + return mat[tuple(fancy_index)] + + def unsort_orbitals(self, sorted_mat, axis=[]): + '''sort_orbitals reversed, transform the matrix in sorted AOs back to + the original matrix. + ''' + ndim_to_transform = len(axis) + assert ndim_to_transform <= 2 + if ndim_to_transform == 0: + return sorted_mat + + idx = self.ao_idx + fancy_index = [slice(None)] * sorted_mat.ndim + if ndim_to_transform == 1: + fancy_index[axis[0]] = idx + elif ndim_to_transform == 2: + assert abs(axis[0] - axis[1]) == 1, 'Must be adjacent axes' + fancy_index[axis[0]] = idx[:,None] + fancy_index[axis[1]] = idx + mat = cp.empty_like(sorted_mat) + mat[tuple(fancy_index)] = sorted_mat + return mat + +def _conc_locs(ao_loc1, ao_loc2): + return np.append(ao_loc1[:-1], ao_loc1[-1] + ao_loc2) + +class Int3c2eEnvVars(ctypes.Structure): + _fields_ = [ + ('natm', ctypes.c_uint16), + ('nbas', ctypes.c_uint16), + ('atm', ctypes.c_void_p), + ('bas', ctypes.c_void_p), + ('env', ctypes.c_void_p), + ('ao_loc', ctypes.c_void_p), + ('log_cutoff', ctypes.c_float), + ] + +def init_constant(mol): + g_idx, offsets = g_pair_idx() + err = libgint_rys.init_constant( + g_idx.ctypes, offsets.ctypes, mol._env.ctypes, ctypes.c_int(mol._env.size), + ctypes.c_int(SHM_SIZE)) + if err != 0: + raise RuntimeError('CUDA 
kernel initialization') + +def int3c2e_scheme(li, lj, lk, shm_size=SHM_SIZE): + order = li + lj + lk + nroots = (order//2 + 1) * 2 + + g_size = (li+1)*(lj+1)*(lk+1) + unit = g_size*3 + nroots*2 + 7 + nst_max = shm_size//(unit*8) + nst_max = _nearest_power2(nst_max) + + nfi = (li + 1) * (li + 2) // 2 + nfj = (lj + 1) * (lj + 2) // 2 + nfk = (lk + 1) * (lk + 2) // 2 + gout_size = nfi * nfj * nfk + gout_stride = (gout_size + GOUT_WIDTH-1) // GOUT_WIDTH + # Round up to the next 2^n + gout_stride = _nearest_power2(gout_stride, return_leq=False) + gout_stride = min(gout_stride, 64) + + nst_per_block = min(nst_max, THREADS // gout_stride) + gout_stride = THREADS // nst_per_block + return nst_per_block, gout_stride + +def _estimate_shl_pairs_per_block(li, lj, nshl_pair): + return _nearest_power2(THREADS*2 // ((li+1)*(lj+1)), return_leq=False) + +def create_nst_lookup_table(): + nst_lookup = np.empty([L_AUX_MAX+1]*3, dtype=np.int32) + for lk in range(L_AUX_MAX+1): + for li in range(lk+1): + for lj in range(li+1): + nst_lookup[lk,li,lj] = int3c2e_scheme(li, lj, lk)[0] + idx = np.arange(L_AUX_MAX+1) + z, y, x = np.sort(np.meshgrid(idx, idx, idx), axis=0) + nst_lookup = nst_lookup[x, y, z] + return nst_lookup[:,:LMAX+1,:LMAX+1] + +def estimate_shl_ovlp(mol): + # consider only the most diffused component of a basis + exps, cs = extract_pgto_params(mol, 'diffused') + ls = mol._bas[:,ANG_OF] + bas_coords = mol.atom_coords()[mol._bas[:,ATOM_OF]] + + norm = cs * ((2*ls+1)/(4*np.pi))**.5 + aij = exps[:,None] + exps + fi = exps[:,None] / aij + fj = exps[None,:] / aij + theta = exps[:,None] * fj + + rirj = bas_coords[:,None,:] - bas_coords + dr = np.linalg.norm(rirj, axis=2) + dri = fj * dr + drj = fi * dr + li = ls[:,None] + lj = ls[None,:] + fac_dri = (li * .5/aij + dri**2) ** (li*.5) + fac_drj = (lj * .5/aij + drj**2) ** (lj*.5) + fac_norm = norm[:,None]*norm * (np.pi/aij)**1.5 + ovlp = fac_norm * np.exp(-theta*dr**2) * fac_dri * fac_drj + return ovlp diff --git 
a/gpu4pyscf/df/tests/test_df_int3c2e.py b/gpu4pyscf/df/tests/test_df_int3c2e.py new file mode 100644 index 00000000..79fd7c91 --- /dev/null +++ b/gpu4pyscf/df/tests/test_df_int3c2e.py @@ -0,0 +1,130 @@ +import cupy as cp +import pyscf +from pyscf.df import incore +from gpu4pyscf.df import int3c2e_bdiv +from gpu4pyscf.lib.cupy_helper import contract + +def test_int3c2e(): + mol = pyscf.M( + atom='''C1 1.3 .2 .3 + C2 .19 .1 1.1 + ''', + basis={'C1': [[3, [1.5, 1.], [.9, 1.]], + [4, [2., 1.]]], + 'C2': 'ccpvdz'}) + auxmol = mol.copy() + auxmol.basis = { + 'C1': ''' +C S + 2.9917624900 1.0000000000 +C P + 28.1325940100 1.0000000000 +C P + 9.8364318200 1.0000000000 +C P + 3.3490545000 1.0000000000 +C P + 1.4947618600 1.0000000000 +C P + 0.5769010900 1.0000000000 +C D + 0.1995412500 1.0000000000 ''', + 'C2': [[0, [.5, 1.]], [1, [.8, 1.]], [3, [.9, 1]]], + } + auxmol.build() + dat = int3c2e_bdiv.aux_e2(mol, auxmol) + ref = incore.aux_e2(mol, auxmol) + assert abs(dat.get()-ref).max() < 1e-10 + +def test_int3c2e_bdiv(): + mol = pyscf.M( + atom='''C1 1.3 .2 .3 + C2 .19 .1 1.1 + ''', + basis={'C1': [[3, [1.5, 1.], [.9, 1.]], + [4, [2., 1.]]], + 'C2': 'ccpvdz'}) + + auxmol = mol.copy() + auxmol.basis = { + 'C1':''' +C S + 2.9917624900 1.0000000000 +C P + 28.1325940100 1.0000000000 +C P + 9.8364318200 1.0000000000 +C P + 3.3490545000 1.0000000000 +C P + 1.4947618600 1.0000000000 +C P + 0.5769010900 1.0000000000 +C D + 0.1995412500 1.0000000000 ''', + 'C2':[[0, [.5, 1.]], [1, [.8, 1.]], [3, [.9, 1]]], + } + auxmol.build() + int3c2e_opt = int3c2e_bdiv.Int3c2eOpt(mol, auxmol).build() + nao, nao_orig = int3c2e_opt.coeff.shape + naux = int3c2e_opt.aux_coeff.shape[0] + out = cp.zeros((nao*nao, naux)) + eri3c = int3c2e_opt.int3c2e_bdiv_kernel() + ao_pair_mapping = int3c2e_opt.create_ao_pair_mapping() + out[ao_pair_mapping] = eri3c + i, j = divmod(ao_pair_mapping, nao) + out[j*nao+i] = eri3c + out = out.reshape(nao, nao, naux) + aux_coeff = cp.asarray(int3c2e_opt.aux_coeff) + coeff = 
cp.asarray(int3c2e_opt.coeff) + out = contract('pqr,rk->pqk', out, aux_coeff) + out = contract('pqk,qj->pjk', out, coeff) + out = contract('pjk,pi->ijk', out, coeff) + ref = incore.aux_e2(mol, auxmol) + assert abs(out.get()-ref).max() < 1e-10 + + eri3c = int3c2e_opt.orbital_pair_cart2sph(eri3c) + ao_pair_mapping = int3c2e_opt.create_ao_pair_mapping(cart=mol.cart) + out = cp.zeros((nao_orig*nao_orig, naux)) + out[ao_pair_mapping] = eri3c + i, j = divmod(ao_pair_mapping, nao_orig) + out[j*nao_orig+i] = eri3c + out = out.reshape(nao_orig, nao_orig, naux) + out = contract('pqr,rk->pqk', out, aux_coeff) + out = int3c2e_opt.unsort_orbitals(out, axis=(0,1)) + assert abs(out.get()-ref).max() < 1e-10 + +def test_int3c2e_sparse(): + mol = pyscf.M( + atom=''' +O 0.873 5.017 1.816 +H 1.128 5.038 2.848 +H 0.173 4.317 1.960 +O 3.665 1.316 1.319 +H 3.904 2.233 1.002 +H 4.224 0.640 0.837 +''', + basis='def2-tzvp' + ) + auxmol = mol.copy() + auxmol.basis = 'ccpvdz-jkfit' + auxmol.build() + int3c2e_opt = int3c2e_bdiv.Int3c2eOpt(mol, auxmol).build() + dat = int3c2e_bdiv.aux_e2(mol, auxmol) + ref = incore.aux_e2(mol, auxmol) + assert abs(dat.get()-ref).max() < 1e-10 + + eri3c = int3c2e_opt.int3c2e_bdiv_kernel() + eri3c = int3c2e_opt.orbital_pair_cart2sph(eri3c) + ao_pair_mapping = int3c2e_opt.create_ao_pair_mapping(cart=mol.cart) + nao, nao_orig = int3c2e_opt.coeff.shape + naux = int3c2e_opt.aux_coeff.shape[0] + out = cp.zeros((nao_orig*nao_orig, naux)) + out[ao_pair_mapping] = eri3c + i, j = divmod(ao_pair_mapping, nao_orig) + out[j*nao_orig+i] = eri3c + out = out.reshape(nao_orig, nao_orig, naux) + aux_coeff = cp.asarray(int3c2e_opt.aux_coeff) + out = contract('pqr,rk->pqk', out, aux_coeff) + out = int3c2e_opt.unsort_orbitals(out, axis=(0,1)) + assert abs(out.get()-ref).max() < 1e-10 diff --git a/gpu4pyscf/gto/mole.py b/gpu4pyscf/gto/mole.py index f3237e96..2f384d93 100644 --- a/gpu4pyscf/gto/mole.py +++ b/gpu4pyscf/gto/mole.py @@ -15,21 +15,17 @@ import functools import numpy as np 
-import scipy.linalg +import cupy as cp from pyscf import gto from pyscf.gto import (ANG_OF, ATOM_OF, NPRIM_OF, NCTR_OF, PTR_COORD, PTR_COEFF, PTR_EXP) -from gpu4pyscf.lib import logger PTR_BAS_COORD = 7 @functools.lru_cache(20) -def get_cart2sph(lmax=12): - cart2sph = [] - for l in range(lmax): - c2s = gto.mole.cart2sph(l, normalized='sp') - cart2sph.append(np.asarray(c2s, order='C')) - return cart2sph +def cart2sph_by_l(l, normalized='sp'): + c2s = gto.mole.cart2sph(l, normalized=normalized) # forward the requested normalization + return cp.asarray(c2s, order='C') def basis_seg_contraction(mol, allow_replica=1): '''transform generally contracted basis to segment contracted basis @@ -40,6 +36,7 @@ def basis_seg_contraction(mol, allow_replica=1): By default, high angular momentum functions (d, f shells) are fully uncontracted. ''' + from gpu4pyscf.lib.cupy_helper import block_diag # Ensure backward compatibility. When allow_replica is True, decontraction # to primitive functions is disabled. When allow_replica is False, all # general contraction are decontracted. 
@@ -69,13 +66,13 @@ def basis_seg_contraction(mol, allow_replica=1): nctr = shell[NCTR_OF] if nctr == 1: bas_of_ia.append(shell) - coeff.append(np.eye(nf)) + coeff.append(cp.eye(nf)) continue # Only basis with nctr > 1 needs to be decontracted nprim = shell[NPRIM_OF] pcoeff = shell[PTR_COEFF] if l <= allow_replica: - coeff.extend([np.eye(nf)] * nctr) + coeff.extend([cp.eye(nf)] * nctr) bs = np.repeat(shell[np.newaxis], nctr, axis=0) bs[:,NCTR_OF] = 1 bs[:,PTR_COEFF] = np.arange(pcoeff, pcoeff+nprim*nctr, nprim) @@ -87,7 +84,7 @@ def basis_seg_contraction(mol, allow_replica=1): # remove normalization from contraction coefficients c = _env[pcoeff:pcoeff+nprim*nctr].reshape(nctr,nprim) c = np.einsum('ip,p,ef->iepf', c, 1/norm, np.eye(nf)) - coeff.append(c.reshape(nf*nctr, nf*nprim).T) + coeff.append(cp.asarray(c.reshape(nf*nctr, nf*nprim).T)) _env[pcoeff:pcoeff+nprim] = norm bs = np.repeat(shell[np.newaxis], nprim, axis=0) @@ -110,10 +107,11 @@ def basis_seg_contraction(mol, allow_replica=1): pmol.cart = True pmol._bas = np.asarray(np.vstack(_bas), dtype=np.int32) pmol._env = _env - contr_coeff = scipy.linalg.block_diag(*contr_coeff) + contr_coeff = block_diag(contr_coeff) if not mol.cart: - contr_coeff = contr_coeff.dot(mol.cart2sph_coeff()) + c2s = block_diag([cart2sph_by_l(l) for l in pmol._bas[:,ANG_OF]]) + contr_coeff = contr_coeff.dot(c2s) return pmol, contr_coeff def sort_atoms(mol): @@ -160,8 +158,13 @@ def sort_atoms(mol): return [x for heavy_list in full_path for x in heavy_list] -def group_basis(mol, tile=1, group_size=None): - '''Group basis functions according to their [l, nprim] patterns''' +def group_basis(mol, tile=1, group_size=None, return_bas_mapping=False): + '''Group basis functions according to their [l, nprim] patterns. 
+ + bas_mapping is the index that transforms _bas from sorted_mol to mol: + mol._bas = sorted_mol._bas[bas_mapping] + ''' + from gpu4pyscf.lib import logger mol, coeff = basis_seg_contraction(mol) # Sort basis according to angular momentum and contraction patterns so # as to group the basis functions to blocks in GPU kernel. @@ -175,10 +178,11 @@ def group_basis(mol, tile=1, group_size=None): nao_orig = coeff.shape[1] ao_loc = mol.ao_loc - coeff = np.split(coeff, ao_loc[1:-1], axis=0) + coeff = cp.split(coeff, ao_loc[1:-1], axis=0) pad_bas = [] if tile > 1: + assert not return_bas_mapping, 'bas_mapping requires tile=1' l_ctr_counts_orig = l_ctr_counts.copy() pad_inv_idx = [] env_ptr = mol._env.size @@ -196,12 +200,12 @@ def group_basis(mol, tile=1, group_size=None): l = l_ctr[0] nf = (l + 1) * (l + 2) // 2 - coeff.extend([np.zeros((nf, nao_orig))] * padding) + coeff.extend([cp.zeros((nf, nao_orig))] * padding) inv_idx = np.hstack([inv_idx.ravel(), pad_inv_idx]) sorted_idx = np.argsort(inv_idx.ravel(), kind='stable').astype(np.int32) - coeff = np.vstack([coeff[i] for i in sorted_idx]) + coeff = cp.vstack([coeff[i] for i in sorted_idx]) assert coeff.shape[0] < 32768 max_nprims = uniq_l_ctr[:,1].max() @@ -228,7 +232,10 @@ def group_basis(mol, tile=1, group_size=None): # PTR_BAS_COORD is required by various CUDA kernels mol._bas[:,PTR_BAS_COORD] = mol._atm[mol._bas[:,ATOM_OF],PTR_COORD] - return mol, coeff, uniq_l_ctr, l_ctr_counts + if return_bas_mapping: + return mol, coeff, uniq_l_ctr, l_ctr_counts, sorted_idx.argsort() + else: + return mol, coeff, uniq_l_ctr, l_ctr_counts def _split_l_ctr_groups(uniq_l_ctr, l_ctr_counts, group_size, align=1): '''Splits l_ctr patterns into small groups with group_size the maximum @@ -257,3 +264,37 @@ def _split_l_ctr_groups(uniq_l_ctr, l_ctr_counts, group_size, align=1): uniq_l_ctr = np.vstack(_l_ctrs) l_ctr_counts = np.hstack(_l_ctr_counts) return uniq_l_ctr, l_ctr_counts + +# This function is only available in pyscf-2.8 or later 
+def extract_pgto_params(mol, op='diffused'): + '''A helper function to extract exponents and contraction coefficients for + estimate_xxx function + ''' + es = [] + cs = [] + if op == 'diffused': + precision = 1e-8 + for i in range(mol.nbas): + e = mol.bas_exp(i) + c = abs(mol._libcint_ctr_coeff(i)).max(axis=1) + l = mol.bas_angular(i) + # A quick estimation for the radius that each primitive GTO vanishes + r2 = np.log(c**2 / precision * 10**l) / e + idx = r2.argmax() + es.append(e[idx]) + cs.append(c[idx].max()) + elif op == 'compact': + precision = 1e-8 + for i in range(mol.nbas): + e = mol.bas_exp(i) + c = abs(mol._libcint_ctr_coeff(i)).max(axis=1) + l = mol.bas_angular(i) + # A quick estimation for the resolution of planewaves that each + # primitive GTO requires + ke = np.log(c**2 / precision * 50**l) * e + idx = ke.argmax() + es.append(e[idx]) + cs.append(c[idx].max()) + else: + raise RuntimeError(f'Unsupported operation {op}') + return np.array(es), np.array(cs) diff --git a/gpu4pyscf/lib/CMakeLists.txt b/gpu4pyscf/lib/CMakeLists.txt index 91e1d87a..4dbd8f2c 100644 --- a/gpu4pyscf/lib/CMakeLists.txt +++ b/gpu4pyscf/lib/CMakeLists.txt @@ -144,6 +144,7 @@ if(BUILD_SOLVENT) add_subdirectory(solvent) endif() +add_subdirectory(gint-rys) add_subdirectory(gvhf-rys) add_subdirectory(gvhf-md) add_subdirectory(pbc) diff --git a/gpu4pyscf/lib/cupy_helper.py b/gpu4pyscf/lib/cupy_helper.py index 20b91a8c..490455b5 100644 --- a/gpu4pyscf/lib/cupy_helper.py +++ b/gpu4pyscf/lib/cupy_helper.py @@ -20,7 +20,6 @@ import cupy from pyscf import lib from gpu4pyscf.lib import logger -from gpu4pyscf.gto import mole from gpu4pyscf.lib.cutensor import contract from gpu4pyscf.lib.cusolver import eigh, cholesky #NOQA from gpu4pyscf.lib.memcpy import copy_array, p2p_transfer #NOQA @@ -29,10 +28,6 @@ LMAX_ON_GPU = 7 DSOLVE_LINDEP = 1e-13 -c2s_l = mole.get_cart2sph(lmax=LMAX_ON_GPU) -c2s_offset = np.cumsum([0] + [x.shape[0]*x.shape[1] for x in c2s_l]) -_data = {'c2s': None} - 
_kernel_registery = {} def load_library(libname): @@ -306,6 +301,14 @@ def dist_matrix(x, y, out=None): raise RuntimeError('failed in calculating distance matrix') return out +@functools.lru_cache(1) +def _initialize_c2s_data(): + from gpu4pyscf.gto import mole + c2s_l = [mole.cart2sph_by_l(l) for l in range(LMAX_ON_GPU)] + c2s_data = cupy.concatenate([x.ravel() for x in c2s_l]) + c2s_offset = np.cumsum([0] + [x.shape[0]*x.shape[1] for x in c2s_l]) + return c2s_l, c2s_data, c2s_offset + def block_c2s_diag(angular, counts): ''' Diagonal blocked cartesian to spherical transformation @@ -313,10 +316,7 @@ def block_c2s_diag(angular, counts): angular (list): angular momentum type, e.g. [0,1,2,3] counts (list): count of each angular momentum ''' - if _data['c2s'] is None: - c2s_data = cupy.concatenate([cupy.asarray(x.ravel()) for x in c2s_l]) - _data['c2s'] = c2s_data - c2s_data = _data['c2s'] + c2s_l, c2s_data, c2s_offset = _initialize_c2s_data() nshells = np.sum(counts) rows = [np.array([0], dtype='int32')] @@ -489,11 +489,12 @@ def cart2sph_cutensor(t, axis=0, ang=1, out=None): ''' transform 'axis' of a tensor from cartesian basis into spherical basis with cutensor ''' + from gpu4pyscf.gto import mole if(ang <= 1): if(out is not None): out[:] = t return t size = list(t.shape) - c2s = cupy.asarray(c2s_l[ang]) + c2s = mole.cart2sph_by_l(ang) if(not t.flags['C_CONTIGUOUS']): t = cupy.asarray(t, order='C') li_size = c2s.shape nli = size[axis] // li_size[0] @@ -511,11 +512,12 @@ def cart2sph(t, axis=0, ang=1, out=None, stream=None): ''' transform 'axis' of a tensor from cartesian basis into spherical basis ''' + from gpu4pyscf.gto import mole if(ang <= 1): if(out is not None): out[:] = t return t size = list(t.shape) - c2s = c2s_l[ang] + c2s = mole.cart2sph_by_l(ang) if(not t.flags['C_CONTIGUOUS']): t = cupy.asarray(t, order='C') li_size = c2s.shape nli = size[axis] // li_size[0] diff --git a/gpu4pyscf/lib/gint-rys/CMakeLists.txt b/gpu4pyscf/lib/gint-rys/CMakeLists.txt new 
file mode 100644 index 00000000..f0583873 --- /dev/null +++ b/gpu4pyscf/lib/gint-rys/CMakeLists.txt @@ -0,0 +1,13 @@ +set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --ptxas-options=-v")# -maxrregcount=128") + +add_library(gint_rys SHARED + gint_driver.cu fill_int3c2e.cu unrolled_int3c2e.cu + fill_int3c2e_bdiv.cu unrolled_int3c2e_bdiv.cu + rys_roots_dat.cu +) + +set_target_properties(gint_rys PROPERTIES + LIBRARY_OUTPUT_DIRECTORY ${PROJECT_SOURCE_DIR} + CUDA_SEPARABLE_COMPILATION ON) + +#target_link_libraries(ft_ao OpenMP::OpenMP_C) diff --git a/gpu4pyscf/lib/gint-rys/fill_int3c2e.cu b/gpu4pyscf/lib/gint-rys/fill_int3c2e.cu new file mode 100644 index 00000000..295b8239 --- /dev/null +++ b/gpu4pyscf/lib/gint-rys/fill_int3c2e.cu @@ -0,0 +1,302 @@ +/* + * Copyright 2025 The PySCF Developers. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include + +#include "gvhf-rys/vhf.cuh" +#include "gvhf-rys/rys_roots.cu" +#include "int3c2e.cuh" + +// TODO: benchmark performance for 32, 38, 40, 45, 54 +#define GOUT_WIDTH 45 + +__global__ +void int3c2e_kernel(double *out, Int3c2eEnvVars envs, Int3c2eBounds bounds) +{ + int nst_per_block = blockDim.x; + int gout_stride = blockDim.y; + int st_id = threadIdx.x; + int gout_id = threadIdx.y; + int batch_id = blockIdx.x; + int li = bounds.li; + int lj = bounds.lj; + int lk = bounds.lk; + int lij = li + lj; + int nroots = bounds.nroots; + int nfij = bounds.nfij; + int nfk = bounds.nfk; + int iprim = bounds.iprim; + int jprim = bounds.jprim; + int kprim = bounds.kprim; + int ijprim = iprim * jprim; + int ijkprim = ijprim * kprim; + int stride_j = bounds.stride_j; + int stride_k = bounds.stride_k; + int g_size = bounds.g_size; + int *idx_ij = c_g_pair_idx + c_g_pair_offsets[li*LMAX1+lj]; + int *idy_ij = idx_ij + nfij; + int *idz_ij = idy_ij + nfij; + int lk_offset = lk * (lk + 1) * (lk + 2) / 2; + int *idx_k = c_g_cart_idx + lk_offset; + int *idy_k = idx_k + nfk; + int *idz_k = idy_k + nfk; + int *bas = envs.bas; + int nbas = envs.nbas; + double *env = envs.env; + double omega = env[PTR_RANGE_OMEGA]; + + int gx_len = g_size * nst_per_block; + extern __shared__ double rw_buffer[]; + double *rw = rw_buffer + st_id; + double *g = rw + nst_per_block * nroots*2; + double *gx = g; + double *gy = gx + gx_len; + double *gz = gy + gx_len; + double *Rpq = gz + gx_len; + double *rjri = Rpq + nst_per_block * 3; + double gout[GOUT_WIDTH]; + if (gout_id == 0) { + gx[0] = 1.; + } + + int nksh = bounds.nksh; + int nshl_pair = bounds.nshl_pair; + int nst = nksh * nshl_pair; + int st0 = batch_id * nst_per_block * BATCHES_PER_BLOCK; + int st1 = MIN(nst, st0 + nst_per_block * BATCHES_PER_BLOCK); + for (int ijk_idx = st0+st_id; ijk_idx < st1+st_id; ijk_idx += nst_per_block) { + int ksh_in_auxmol = ijk_idx % nksh; + int ksh = ksh_in_auxmol + bounds.ksh0; 
+ int shl_pair_idx = ijk_idx / nksh; + if (ijk_idx >= nst) { + shl_pair_idx = st0 / nksh; + if (gout_id == 0) { + gx[0] = 0.; + } + } + int bas_ij = bounds.bas_ij_idx[shl_pair_idx]; + int ish = bas_ij / nbas; + int jsh = bas_ij % nbas; + double *expi = env + bas[ish*BAS_SLOTS+PTR_EXP]; + double *expj = env + bas[jsh*BAS_SLOTS+PTR_EXP]; + double *expk = env + bas[ksh*BAS_SLOTS+PTR_EXP]; + double *ci = env + bas[ish*BAS_SLOTS+PTR_COEFF]; + double *cj = env + bas[jsh*BAS_SLOTS+PTR_COEFF]; + double *ck = env + bas[ksh*BAS_SLOTS+PTR_COEFF]; + double *ri = env + bas[ish*BAS_SLOTS+PTR_BAS_COORD]; + double *rj = env + bas[jsh*BAS_SLOTS+PTR_BAS_COORD]; + double *rk = env + bas[ksh*BAS_SLOTS+PTR_BAS_COORD]; + if (gout_id == 0) { + double xjxi = rj[0] - ri[0]; + double yjyi = rj[1] - ri[1]; + double zjzi = rj[2] - ri[2]; + double rr_ij = xjxi*xjxi + yjyi*yjyi + zjzi*zjzi; + rjri[0*nst_per_block] = xjxi; + rjri[1*nst_per_block] = yjyi; + rjri[2*nst_per_block] = zjzi; + rjri[3*nst_per_block] = rr_ij; + } + + for (int gout_start = 0; gout_start < nfij*nfk; + gout_start+=gout_stride*GOUT_WIDTH) { +#pragma unroll + for (int n = 0; n < GOUT_WIDTH; ++n) { gout[n] = 0; } + + for (int ijkp = 0; ijkp < ijkprim; ++ijkp) { + int ijp = ijkp / kprim; + int kp = ijkp % kprim; + int ip = ijp / jprim; + int jp = ijp % jprim; + double ai = expi[ip]; + double aj = expj[jp]; + double ak = expk[kp]; + double aij = ai + aj; + double aj_aij = aj / aij; + __syncthreads(); + double xij = rjri[0*nst_per_block] * aj_aij + ri[0]; + double yij = rjri[1*nst_per_block] * aj_aij + ri[1]; + double zij = rjri[2*nst_per_block] * aj_aij + ri[2]; + double xpq = xij - rk[0]; + double ypq = yij - rk[1]; + double zpq = zij - rk[2]; + if (gout_id == 0) { + double cijk = ci[ip] * cj[jp] * ck[kp]; + double fac = PI_FAC * cijk / (aij*ak*sqrt(aij+ak)); + double theta_ij = ai * aj_aij; + double Kab = theta_ij * rjri[3*nst_per_block]; + gy[0] = fac * exp(-Kab); + Rpq[0*nst_per_block] = xpq; + Rpq[1*nst_per_block] = ypq; + 
Rpq[2*nst_per_block] = zpq; + } + double rr = xpq*xpq + ypq*ypq + zpq*zpq; + double theta = aij * ak / (aij + ak); + double theta_rr = theta * rr; + if (omega == 0) { + rys_roots(nroots, theta_rr, rw, nst_per_block, gout_id, gout_stride); + } else if (omega > 0) { + double omega2 = omega * omega; + double theta_fac = omega2 / (omega2 + theta); + rys_roots(nroots, theta_fac*theta_rr, rw, nst_per_block, gout_id, gout_stride); + __syncthreads(); + double sqrt_theta_fac = sqrt(theta_fac); + for (int irys = gout_id; irys < nroots; irys+=gout_stride) { + rw[ irys*2 *nst_per_block] *= theta_fac; + rw[(irys*2+1)*nst_per_block] *= sqrt_theta_fac; + } + } else { + double omega2 = omega * omega; + double theta_fac = omega2 / (omega2 + theta); + int _nroots = nroots/2; + rys_roots(_nroots, theta_rr, rw+nroots*nst_per_block, + nst_per_block, gout_id, gout_stride); + rys_roots(_nroots, theta_fac*theta_rr, rw, + nst_per_block, gout_id, gout_stride); + __syncthreads(); + double sqrt_theta_fac = -sqrt(theta_fac); + for (int irys = gout_id; irys < _nroots; irys+=gout_stride) { + rw[ irys*2 *nst_per_block] *= theta_fac; + rw[(irys*2+1)*nst_per_block] *= sqrt_theta_fac; + } + } + double s0x, s1x, s2x; + for (int irys = 0; irys < nroots; ++irys) { + __syncthreads(); + if (gout_id == 0) { + gz[0] = rw[(irys*2+1)*nst_per_block]; + } + double rt = rw[ irys*2 *nst_per_block]; + double rt_aa = rt / (aij + ak); + + if (lij > 0) { + __syncthreads(); + double rt_aij = rt_aa * ak; + double b10 = .5/aij * (1 - rt_aij); + // gx(0,n+1) = c0*gx(0,n) + n*b10*gx(0,n-1) + for (int n = gout_id; n < 3; n += gout_stride) { + double *_gx = gx + n * gx_len; + double xjxi = rjri[n*nst_per_block]; + double xpa = xjxi * aj_aij; + //double c0x = Rpa[ir] - rt_aij * Rpq[n*nst_per_block]; + double c0x = xpa - rt_aij * Rpq[n*nst_per_block]; + s0x = _gx[0]; + s1x = c0x * s0x; + _gx[nst_per_block] = s1x; + for (int i = 1; i < lij; ++i) { + s2x = c0x * s1x + i * b10 * s0x; + _gx[(i+1)*nst_per_block] = s2x; + s0x = 
s1x; + s1x = s2x; + } + } + } + + if (lk > 0) { + int lij3 = (lij+1)*3; + double rt_ak = rt_aa * aij; + double b00 = .5 * rt_aa; + double b01 = .5/ak * (1 - rt_ak ); + for (int n = gout_id; n < lij3+gout_id; n += gout_stride) { + __syncthreads(); + int i = n / 3; //for i in range(lij+1): + int _ix = n % 3; // TODO: remove _ix for nroots > 2 + double *_gx = gx + (i + _ix * g_size) * nst_per_block; + double cpx = rt_ak * Rpq[_ix*nst_per_block]; + //for i in range(lij+1): + // trr(i,1) = c0p * trr(i,0) + i*b00 * trr(i-1,0) + if (n < lij3) { + s0x = _gx[0]; + s1x = cpx * s0x; + if (i > 0) { + s1x += i * b00 * _gx[-nst_per_block]; + } + _gx[stride_k*nst_per_block] = s1x; + } + //for k in range(1, lk): + // for i in range(lij+1): + // trr(i,k+1) = cp * trr(i,k) + k*b01 * trr(i,k-1) + i*b00 * trr(i-1,k) + for (int k = 1; k < lk; ++k) { + __syncthreads(); + if (n < lij3) { + s2x = cpx*s1x + k*b01*s0x; + if (i > 0) { + s2x += i * b00 * _gx[(k*stride_k-1)*nst_per_block]; + } + _gx[(k*stride_k+stride_k)*nst_per_block] = s2x; + s0x = s1x; + s1x = s2x; + } + } + } + } + + // hrr + // g(i,j+1) = rirj * g(i,j) + g(i+1,j) + // g(...,k,l+1) = rkrl * g(...,k,l) + g(...,k+1,l) + if (lj > 0) { + __syncthreads(); + int lk3 = (lk+1)*3; + for (int m = gout_id; m < lk3; m += gout_stride) { + int k = m / 3; + int _ix = m % 3; + double xjxi = rjri[_ix*nst_per_block]; + double *_gx = g + (_ix*g_size + k*stride_k) * + nst_per_block; + for (int j = 0; j < lj; ++j) { + int ij = (lij-j) + j*stride_j; + s1x = _gx[ij*nst_per_block]; + for (--ij; ij >= j*stride_j; --ij) { + s0x = _gx[ij*nst_per_block]; + _gx[(ij+stride_j)*nst_per_block] = s1x - xjxi * s0x; + s1x = s0x; + } + } + } + } + + __syncthreads(); +#pragma unroll + for (int n = 0; n < GOUT_WIDTH; ++n) { + int ijk = gout_start + n*gout_stride+gout_id; + int k = ijk % nfk; + int ij = ijk / nfk; + if (ij >= nfij) break; + int addrx = (idx_ij[ij] + idx_k[k] * stride_k) * nst_per_block; + int addry = (idy_ij[ij] + idy_k[k] * stride_k) * 
nst_per_block; + int addrz = (idz_ij[ij] + idz_k[k] * stride_k) * nst_per_block; + gout[n] += gx[addrx] * gy[addry] * gz[addrz]; + } + } + } + + if (ijk_idx < nst) { + int naux = bounds.naux; + double *eri_tensor = out + shl_pair_idx * nfij * naux + + ksh_in_auxmol * nfk; + for (int n = 0; n < GOUT_WIDTH; ++n) { + int ijk = gout_start + n*gout_stride+gout_id; + int k = ijk % nfk; + int ij = ijk / nfk; + if (ij >= nfij) break; + eri_tensor[ij * naux + k] = gout[n]; + } + } + } + } +} diff --git a/gpu4pyscf/lib/gint-rys/fill_int3c2e_bdiv.cu b/gpu4pyscf/lib/gint-rys/fill_int3c2e_bdiv.cu new file mode 100644 index 00000000..4111e657 --- /dev/null +++ b/gpu4pyscf/lib/gint-rys/fill_int3c2e_bdiv.cu @@ -0,0 +1,330 @@ +/* + * Copyright 2025 The PySCF Developers. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include + +#include "gvhf-rys/vhf.cuh" +#include "gvhf-rys/rys_roots.cu" +#include "int3c2e.cuh" + +// TODO: benchmark performance for 32, 38, 40, 45, 54 +#define GOUT_WIDTH 45 + +__device__ int int3c2e_bdiv_unrolled(double *out, Int3c2eEnvVars envs, BDiv3c2eBounds bounds); + +__global__ +void int3c2e_bdiv_kernel(double *out, Int3c2eEnvVars envs, BDiv3c2eBounds bounds) +{ + if (int3c2e_bdiv_unrolled(out, envs, bounds)) { + return; + } + // For better load balance, consume blocks in the reversed order + int sp_block_id = gridDim.x - blockIdx.x - 1; + int ksh_block_id = gridDim.y - blockIdx.y - 1; + int shl_pair0 = bounds.shl_pair_offsets[sp_block_id]; + int shl_pair1 = bounds.shl_pair_offsets[sp_block_id+1]; + int ksh0 = bounds.ksh_offsets[ksh_block_id]; + int ksh1 = bounds.ksh_offsets[ksh_block_id+1]; + int nksh = ksh1 - ksh0; + int nshl_pair = shl_pair1 - shl_pair0; + int bas_ij0 = bounds.bas_ij_idx[shl_pair0]; + int nbas = envs.nbas; + int ish0 = bas_ij0 / nbas; + int jsh0 = bas_ij0 % nbas; + + int *bas = envs.bas; + int li = bas[ish0*BAS_SLOTS+ANG_OF]; + int lj = bas[jsh0*BAS_SLOTS+ANG_OF]; + int lk = bas[ksh0*BAS_SLOTS+ANG_OF]; + int lij = li + lj; + int nroots = (lij + lk) / 2 + 1; + int nfi = (li + 1) * (li + 2) / 2; + int nfj = (lj + 1) * (lj + 2) / 2; + int nfk = (lk + 1) * (lk + 2) / 2; + int nfij = nfi * nfj; + int *idx_ij = c_g_pair_idx + c_g_pair_offsets[li*LMAX1+lj]; + int *idy_ij = idx_ij + nfij; + int *idz_ij = idy_ij + nfij; + int lk_offset = lk * (lk + 1) * (lk + 2) / 2; + int *idx_k = c_g_cart_idx + lk_offset; + int *idy_k = idx_k + nfk; + int *idz_k = idy_k + nfk; + int stride_j = li + 1; + int stride_k = stride_j * (lj + 1); + int g_size = stride_k * (lk + 1); + int iprim = bas[ish0*BAS_SLOTS+NPRIM_OF]; + int jprim = bas[jsh0*BAS_SLOTS+NPRIM_OF]; + int kprim = bas[ksh0*BAS_SLOTS+NPRIM_OF]; + int ijprim = iprim * jprim; + int ijkprim = ijprim * kprim; + + int nst_per_block = blockDim.x; + if (lij + lk > 2) { 
+ nst_per_block = bounds.nst_lookup[(lk*LMAX1+lj)*LMAX1+li]; + } + int gout_stride = blockDim.x / nst_per_block; + int thread_id = threadIdx.x; + int st_id = thread_id % nst_per_block; + int gout_id = thread_id / nst_per_block; + double *env = envs.env; + double omega = env[PTR_RANGE_OMEGA]; + if (omega < 0) { + nroots *= 2; + } + int gx_len = g_size * nst_per_block; + extern __shared__ double rw_buffer[]; + double *rw = rw_buffer + st_id; + double *g = rw + nst_per_block * nroots*2; + double *gx = g; + double *gy = gx + gx_len; + double *gz = gy + gx_len; + double *Rpq = gz + gx_len; + double *rjri = Rpq + nst_per_block * 3; + double gout[GOUT_WIDTH]; + int naux = bounds.naux; + double *out_local = out + bounds.ao_pair_loc[sp_block_id] * naux; + + if (gout_id == 0) { + gx[0] = 1.; + } + + int nst = nshl_pair * nksh; + for (int ijk_idx = st_id; ijk_idx < nst+st_id; ijk_idx += nst_per_block) { + // convert task_id to ish, jsh, ksh + int shl_pair_in_block = ijk_idx / nksh; + int ksh_in_block = ijk_idx % nksh; + __syncthreads(); + if (ijk_idx >= nst) { + shl_pair_in_block = 0; + if (gout_id == 0) { + gx[0] = 0.; + } + } + int ksh = ksh_in_block + ksh0; + int bas_ij = bounds.bas_ij_idx[shl_pair_in_block + shl_pair0]; + int ish = bas_ij / nbas; + int jsh = bas_ij % nbas; + double *expi = env + bas[ish*BAS_SLOTS+PTR_EXP]; + double *expj = env + bas[jsh*BAS_SLOTS+PTR_EXP]; + double *expk = env + bas[ksh*BAS_SLOTS+PTR_EXP]; + double *ci = env + bas[ish*BAS_SLOTS+PTR_COEFF]; + double *cj = env + bas[jsh*BAS_SLOTS+PTR_COEFF]; + double *ck = env + bas[ksh*BAS_SLOTS+PTR_COEFF]; + double *ri = env + bas[ish*BAS_SLOTS+PTR_BAS_COORD]; + double *rj = env + bas[jsh*BAS_SLOTS+PTR_BAS_COORD]; + double *rk = env + bas[ksh*BAS_SLOTS+PTR_BAS_COORD]; + if (gout_id == 0) { + double xjxi = rj[0] - ri[0]; + double yjyi = rj[1] - ri[1]; + double zjzi = rj[2] - ri[2]; + double rr_ij = xjxi*xjxi + yjyi*yjyi + zjzi*zjzi; + rjri[0*nst_per_block] = xjxi; + rjri[1*nst_per_block] = yjyi; + 
rjri[2*nst_per_block] = zjzi; + rjri[3*nst_per_block] = rr_ij; + } + + for (int gout_start = 0; gout_start < nfij*nfk; + gout_start+=gout_stride*GOUT_WIDTH) { +#pragma unroll + for (int n = 0; n < GOUT_WIDTH; ++n) { gout[n] = 0; } + + for (int ijkp = 0; ijkp < ijkprim; ++ijkp) { + int ijp = ijkp / kprim; + int kp = ijkp % kprim; + int ip = ijp / jprim; + int jp = ijp % jprim; + double ai = expi[ip]; + double aj = expj[jp]; + double ak = expk[kp]; + double aij = ai + aj; + double aj_aij = aj / aij; + __syncthreads(); + double xij = rjri[0*nst_per_block] * aj_aij + ri[0]; + double yij = rjri[1*nst_per_block] * aj_aij + ri[1]; + double zij = rjri[2*nst_per_block] * aj_aij + ri[2]; + double xpq = xij - rk[0]; + double ypq = yij - rk[1]; + double zpq = zij - rk[2]; + if (gout_id == 0) { + double cijk = ci[ip] * cj[jp] * ck[kp]; + double fac = PI_FAC * cijk / (aij*ak*sqrt(aij+ak)); + double theta_ij = ai * aj_aij; + double Kab = theta_ij * rjri[3*nst_per_block]; + gy[0] = fac * exp(-Kab); + Rpq[0*nst_per_block] = xpq; + Rpq[1*nst_per_block] = ypq; + Rpq[2*nst_per_block] = zpq; + } + double rr = xpq*xpq + ypq*ypq + zpq*zpq; + double theta = aij * ak / (aij + ak); + double theta_rr = theta * rr; + if (omega == 0) { + rys_roots(nroots, theta_rr, rw, nst_per_block, gout_id, gout_stride); + } else if (omega > 0) { + double omega2 = omega * omega; + double theta_fac = omega2 / (omega2 + theta); + rys_roots(nroots, theta_fac*theta_rr, rw, nst_per_block, gout_id, gout_stride); + __syncthreads(); + double sqrt_theta_fac = sqrt(theta_fac); + for (int irys = gout_id; irys < nroots; irys+=gout_stride) { + rw[ irys*2 *nst_per_block] *= theta_fac; + rw[(irys*2+1)*nst_per_block] *= sqrt_theta_fac; + } + } else { + double omega2 = omega * omega; + double theta_fac = omega2 / (omega2 + theta); + int _nroots = nroots/2; + rys_roots(_nroots, theta_rr, rw+nroots*nst_per_block, + nst_per_block, gout_id, gout_stride); + rys_roots(_nroots, theta_fac*theta_rr, rw, + nst_per_block, gout_id, 
gout_stride); + __syncthreads(); + double sqrt_theta_fac = -sqrt(theta_fac); + for (int irys = gout_id; irys < _nroots; irys+=gout_stride) { + rw[ irys*2 *nst_per_block] *= theta_fac; + rw[(irys*2+1)*nst_per_block] *= sqrt_theta_fac; + } + } + double s0x, s1x, s2x; + for (int irys = 0; irys < nroots; ++irys) { + __syncthreads(); + if (gout_id == 0) { + gz[0] = rw[(irys*2+1)*nst_per_block]; + } + double rt = rw[ irys*2 *nst_per_block]; + double rt_aa = rt / (aij + ak); + + if (lij > 0) { + __syncthreads(); + double rt_aij = rt_aa * ak; + double b10 = .5/aij * (1 - rt_aij); + // gx(0,n+1) = c0*gx(0,n) + n*b10*gx(0,n-1) + for (int n = gout_id; n < 3; n += gout_stride) { + double *_gx = gx + n * gx_len; + double xjxi = rjri[n*nst_per_block]; + double xpa = xjxi * aj_aij; + //double c0x = Rpa[ir] - rt_aij * Rpq[n*nst_per_block]; + double c0x = xpa - rt_aij * Rpq[n*nst_per_block]; + s0x = _gx[0]; + s1x = c0x * s0x; + _gx[nst_per_block] = s1x; + for (int i = 1; i < lij; ++i) { + s2x = c0x * s1x + i * b10 * s0x; + _gx[(i+1)*nst_per_block] = s2x; + s0x = s1x; + s1x = s2x; + } + } + } + + if (lk > 0) { + int lij3 = (lij+1)*3; + double rt_ak = rt_aa * aij; + double b00 = .5 * rt_aa; + double b01 = .5/ak * (1 - rt_ak ); + for (int n = gout_id; n < lij3+gout_id; n += gout_stride) { + __syncthreads(); + int i = n / 3; //for i in range(lij+1): + int _ix = n % 3; // TODO: remove _ix for nroots > 2 + double *_gx = gx + (i + _ix * g_size) * nst_per_block; + double cpx = rt_ak * Rpq[_ix*nst_per_block]; + //for i in range(lij+1): + // trr(i,1) = c0p * trr(i,0) + i*b00 * trr(i-1,0) + if (n < lij3) { + s0x = _gx[0]; + s1x = cpx * s0x; + if (i > 0) { + s1x += i * b00 * _gx[-nst_per_block]; + } + _gx[stride_k*nst_per_block] = s1x; + } + //for k in range(1, lk): + // for i in range(lij+1): + // trr(i,k+1) = cp * trr(i,k) + k*b01 * trr(i,k-1) + i*b00 * trr(i-1,k) + for (int k = 1; k < lk; ++k) { + __syncthreads(); + if (n < lij3) { + s2x = cpx*s1x + k*b01*s0x; + if (i > 0) { + s2x += i * 
b00 * _gx[(k*stride_k-1)*nst_per_block]; + } + _gx[(k*stride_k+stride_k)*nst_per_block] = s2x; + s0x = s1x; + s1x = s2x; + } + } + } + } + + // hrr + // g(i,j+1) = rirj * g(i,j) + g(i+1,j) + // g(...,k,l+1) = rkrl * g(...,k,l) + g(...,k+1,l) + if (lj > 0) { + __syncthreads(); + int lk3 = (lk+1)*3; + for (int m = gout_id; m < lk3; m += gout_stride) { + int k = m / 3; + int _ix = m % 3; + double xjxi = rjri[_ix*nst_per_block]; + double *_gx = g + (_ix*g_size + k*stride_k) * + nst_per_block; + for (int j = 0; j < lj; ++j) { + int ij = (lij-j) + j*stride_j; + s1x = _gx[ij*nst_per_block]; + for (--ij; ij >= j*stride_j; --ij) { + s0x = _gx[ij*nst_per_block]; + _gx[(ij+stride_j)*nst_per_block] = s1x - xjxi * s0x; + s1x = s0x; + } + } + } + } + + __syncthreads(); +#pragma unroll + for (int n = 0; n < GOUT_WIDTH; ++n) { + int ijk = gout_start + n*gout_stride+gout_id; + int k = ijk % nfk; + int ij = ijk / nfk; + if (ij >= nfij) break; + int addrx = (idx_ij[ij] + idx_k[k] * stride_k) * nst_per_block; + int addry = (idy_ij[ij] + idy_k[k] * stride_k) * nst_per_block; + int addrz = (idz_ij[ij] + idz_k[k] * stride_k) * nst_per_block; + gout[n] += gx[addrx] * gy[addry] * gz[addrz]; + } + } + } + + if (ijk_idx < nst) { + int *ao_loc = envs.ao_loc; + int k0 = ao_loc[ksh0] - ao_loc[nbas]; + double *eri_tensor = out_local + shl_pair_in_block * nfij * naux + + k0 + ksh_in_block * nfk; + for (int n = 0; n < GOUT_WIDTH; ++n) { + int ijk = gout_start + n*gout_stride+gout_id; + int k = ijk % nfk; + int ij = ijk / nfk; + if (ij >= nfij) break; + eri_tensor[ij * naux + k] = gout[n]; + } + } + } + } +} diff --git a/gpu4pyscf/lib/gint-rys/gint_driver.cu b/gpu4pyscf/lib/gint-rys/gint_driver.cu new file mode 100644 index 00000000..b7844fd7 --- /dev/null +++ b/gpu4pyscf/lib/gint-rys/gint_driver.cu @@ -0,0 +1,145 @@ +/* + * Copyright 2024 The PySCF Developers. All Rights Reserved. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include + +#include "gvhf-rys/vhf.cuh" +#include "int3c2e.cuh" + +__constant__ int c_g_pair_idx[3675]; // corresponding to LMAX=4 +__constant__ int c_g_pair_offsets[LMAX1*LMAX1]; +__constant__ int c_g_cart_idx[252]; // corresponding to LMAX=6 + +extern __global__ +void int3c2e_kernel(double *out, Int3c2eEnvVars envs, Int3c2eBounds bounds); +int int3c2e_unrolled(double *out, Int3c2eEnvVars *envs, Int3c2eBounds *bounds); + +extern __global__ +void int3c2e_bdiv_kernel(double *out, Int3c2eEnvVars envs, BDiv3c2eBounds bounds); + +extern "C" { +int fill_int3c2e(double *out, Int3c2eEnvVars *envs, int *scheme, int *shls_slice, + int *aux_loc, int naux, int nshl_pair, int *bas_ij_idx, + int *atm, int natm, int *bas, int nbas, double *env) +{ + uint16_t ish0 = shls_slice[0]; + uint16_t jsh0 = shls_slice[2]; + uint16_t ksh0 = shls_slice[4] + nbas; + uint16_t ksh1 = shls_slice[5] + nbas; + uint16_t nksh = ksh1 - ksh0; + uint8_t li = bas[ANG_OF + ish0*BAS_SLOTS]; + uint8_t lj = bas[ANG_OF + jsh0*BAS_SLOTS]; + uint8_t lk = bas[ANG_OF + ksh0*BAS_SLOTS]; + uint8_t iprim = bas[NPRIM_OF + ish0*BAS_SLOTS]; + uint8_t jprim = bas[NPRIM_OF + jsh0*BAS_SLOTS]; + uint8_t kprim = bas[NPRIM_OF + ksh0*BAS_SLOTS]; + uint8_t nfi = (li+1)*(li+2)/2; + uint8_t nfj = (lj+1)*(lj+2)/2; + uint8_t nfk = (lk+1)*(lk+2)/2; + uint8_t nfij = nfi * nfj; + uint8_t order = li + lj + lk; + uint8_t nroots = 
order / 2 + 1; + double omega = env[PTR_RANGE_OMEGA]; + if (omega < 0) { // SR ERIs + nroots *= 2; + } + uint8_t stride_i = 1; + uint8_t stride_j = li + 1; + uint8_t stride_k = stride_j * (lj + 1); + // up to (gg|i) + uint8_t g_size = stride_k * (lk + 1); + Int3c2eBounds bounds = {li, lj, lk, nroots, nfi, nfij, nfk, + iprim, jprim, kprim, stride_i, stride_j, stride_k, g_size, + (uint16_t)naux, nksh, ksh0, nshl_pair, bas_ij_idx}; + + int k0 = aux_loc[ksh0 - nbas]; + out += k0; // offset when writing output + if (!int3c2e_unrolled(out, envs, &bounds)) { + int nst_per_block = scheme[0]; + int gout_stride = scheme[1]; + dim3 threads(nst_per_block, gout_stride); + int tasks_per_block = BATCHES_PER_BLOCK * nst_per_block; + int st_blocks = (nksh*nshl_pair + tasks_per_block - 1) / tasks_per_block; + int buflen = (nroots*2+g_size*3+7) * nst_per_block * sizeof(double); + int3c2e_kernel<<>>(out, *envs, bounds); /* NOTE(review): the execution configuration is empty in this copy; presumably <<<st_blocks, threads, buflen>>>, the three values computed just above - the head of this function is outside the reviewed range, confirm before changing */ + } + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) { + fprintf(stderr, "CUDA Error in int3c2e_kernel: %s\n", cudaGetErrorString(err)); + return 1; + } + return 0; +} + +int fill_int3c2e_bdiv(double *out, Int3c2eEnvVars *envs, int shm_size, int naux, + int nbatches_shl_pair, int nbatches_ksh, + int *shl_pair_offsets, int *ao_pair_loc, int *ksh_offsets, + int *bas_ij_idx, int *nst_lookup, + int *atm, int natm, int *bas, int nbas, double *env) +{ /* Host launcher for int3c2e_bdiv_kernel. Packs naux and the five index tables into a BDiv3c2eBounds (same order as the struct fields), launches a (nbatches_shl_pair, nbatches_ksh) grid of 256-thread blocks, and returns 0 on success or 1 after printing the CUDA error to stderr. atm, natm, bas, nbas and env are accepted but not read here. */ + BDiv3c2eBounds bounds = {naux, bas_ij_idx, shl_pair_offsets, ao_pair_loc, + ksh_offsets, nst_lookup}; + int threads = 256; + dim3 blocks(nbatches_shl_pair, nbatches_ksh); + int3c2e_bdiv_kernel<<<blocks, threads, shm_size>>>(out, *envs, bounds); /* grid = blocks, block = threads, dynamic shared memory = shm_size bytes; an empty configuration does not compile and left all three values unused */ + cudaError_t err = cudaGetLastError(); /* reports launch failures only; faults raised while the kernel runs surface at a later synchronising call */ + if (err != cudaSuccess) { + fprintf(stderr, "CUDA Error in int3c2e_bdiv_kernel: %s\n", cudaGetErrorString(err)); + return 1; + } + return 0; +} + +int init_constant(int *g_pair_idx, int *offsets, + double *env, int env_size, int shm_size) +{ /* Uploads the lookup tables c_g_pair_idx, c_g_pair_offsets and c_g_cart_idx to their __constant__ symbols and raises the dynamic shared-memory limit of int3c2e_kernel and int3c2e_bdiv_kernel to shm_size. Returns 0 on success, 1 on failure. env and env_size are not read here. */ + cudaMemcpyToSymbol(c_g_pair_idx, g_pair_idx, 3675*sizeof(int)); + cudaMemcpyToSymbol(c_g_pair_offsets, offsets,
sizeof(int) * LMAX1*LMAX1); + + int *g_cart_idx = (int *)malloc(252*sizeof(int)); if (g_cart_idx == NULL) { fprintf(stderr, "Failed to allocate host buffer for c_g_cart_idx\n"); return 1; } /* 252 = 3 * sum of (l+1)(l+2)/2 for l = 0..6, matching L_AUX_MAX = 6; the NULL check shares this patch record so the hunk line count is unchanged */ + int *idx, *idy, *idz; + idx = g_cart_idx; + for (int l = 0; l <= L_AUX_MAX; ++l) { /* per l the table holds three consecutive runs of nf ints: the x, y and z exponents of every (ix,iy,iz) whose sum is l, ix descending and, for equal ix, iy descending */ + int nf = (l + 1) * (l + 2) / 2; + idy = idx + nf; + idz = idy + nf; + for (int i = 0, ix = l; ix >= 0; --ix) { + for (int iy = l - ix; iy >= 0; --iy, ++i) { + int iz = l - ix - iy; + idx[i] = ix; + idy[i] = iy; + idz[i] = iz; + } } + idx += nf * 3; + } + cudaMemcpyToSymbol(c_g_cart_idx, g_cart_idx, 252*sizeof(int)); + free(g_cart_idx); + + cudaFuncSetAttribute(int3c2e_kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, shm_size); + cudaFuncSetAttribute(int3c2e_bdiv_kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, shm_size); + cudaError_t err = cudaGetLastError(); /* also returns an error left behind by the cudaMemcpyToSymbol calls above, so the message below can be printed for a failed copy as well */ + if (err != cudaSuccess) { + fprintf(stderr, "Failed to set CUDA shm size %d: %s\n", shm_size, + cudaGetErrorString(err)); + return 1; + } + return 0; +} +} diff --git a/gpu4pyscf/lib/gint-rys/int3c2e.cuh b/gpu4pyscf/lib/gint-rys/int3c2e.cuh new file mode 100644 index 00000000..b6452471 --- /dev/null +++ b/gpu4pyscf/lib/gint-rys/int3c2e.cuh @@ -0,0 +1,75 @@ +/* + * Copyright 2024 The PySCF Developers. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ + +#include /* NOTE(review): the header name is missing in this copy; presumably <stdint.h>, since uint8_t and uint16_t are used below - confirm */ +#define BATCHES_PER_BLOCK 16 /* a block of the unrolled int3c2e kernels covers nst_per_block * BATCHES_PER_BLOCK tasks */ +#define L_AUX_MAX 6 /* upper bound of the l loop that fills c_g_cart_idx in init_constant */ + +#ifndef HAVE_DEFINED_INT3CENVVAS_H +#define HAVE_DEFINED_INT3CENVVAS_H +typedef struct { /* Environment passed by value to the int3c2e kernels: atom and shell counts plus the atm, bas, env and ao_loc arrays and log_cutoff. NOTE(review): the pointers are presumably device addresses because the kernels dereference bas and env - confirm against the caller. */ + uint16_t natm; + uint16_t nbas; + int *atm; + int *bas; + double *env; + int *ao_loc; + float log_cutoff; +} Int3c2eEnvVars; + +typedef struct { /* Description of one kernel launch. The host fills it with a positional brace initialiser in exactly this field order, so fields must not be reordered or inserted without updating that initialiser. All counts except nshl_pair are 8- or 16-bit. */ + uint8_t li; + uint8_t lj; + uint8_t lk; + uint8_t nroots; + uint8_t nfi; + uint8_t nfij; + uint8_t nfk; + uint8_t iprim; + uint8_t jprim; + uint8_t kprim; + uint8_t stride_i; + uint8_t stride_j; + uint8_t stride_k; + uint8_t g_size; + uint16_t naux; + uint16_t nksh; + uint16_t ksh0; + int nshl_pair; + // The effective basis pair Id = ish*nbas+jsh + int *bas_ij_idx; +} Int3c2eBounds; + +typedef struct { /* Arguments of int3c2e_bdiv_kernel; fill_int3c2e_bdiv initialises it positionally in this field order. */ + int naux; + // The effective basis pair Id = ish*nbas+jsh + int *bas_ij_idx; + // the bas_ij_idx offset for each blockIdx.x + int *shl_pair_offsets; + // the AO-pair offset (address) in the output tensor for each blockIdx.x + int *ao_pair_loc; + // the auxiliary function offset (address) in the output tensor for each blockIdx.y + int *ksh_offsets; + // nst_per_block for each (li,lj,lk) pattern + int *nst_lookup; +} BDiv3c2eBounds; + +#ifdef __CUDACC__ +extern __constant__ int c_g_pair_idx[]; +extern __constant__ int c_g_pair_offsets[]; +extern __constant__ int c_g_cart_idx[]; +#endif +#endif diff --git a/gpu4pyscf/lib/gint-rys/rys_roots_dat.cu b/gpu4pyscf/lib/gint-rys/rys_roots_dat.cu new file mode 100644 index 00000000..1644fc8c --- /dev/null +++ b/gpu4pyscf/lib/gint-rys/rys_roots_dat.cu @@ -0,0 +1 @@ +#include "gvhf-rys/rys_roots_dat.cu" diff --git a/gpu4pyscf/lib/gint-rys/unrolled_int3c2e.cu b/gpu4pyscf/lib/gint-rys/unrolled_int3c2e.cu new file mode 100644 index 00000000..b1714ee2 --- /dev/null +++ b/gpu4pyscf/lib/gint-rys/unrolled_int3c2e.cu @@ -0,0 +1,3947 @@ +#include +#include +#include +#include +#include /* NOTE(review): the five header names above are missing in this copy - restore them from the upstream file */ +#include "gvhf-rys/vhf.cuh" +#include "gvhf-rys/rys_roots.cu" +#include "int3c2e.cuh" + + +#if CUDA_VERSION >= 12040 +__global__ __maxnreg__(128)
+#else +__global__ +#endif +void int3c2e_000(double *out, Int3c2eEnvVars envs, Int3c2eBounds bounds) +{ /* int3c2e_000: accumulates one value (gout0) per task and stores it in out. A task is one (shell pair, auxiliary shell) combination, decoded from ijk_idx as ijk_idx / nksh and ijk_idx % nksh. A block covers nst_per_block * BATCHES_PER_BLOCK consecutive tasks and thread st_id takes every nst_per_block-th one. NOTE(review): the 000 suffix presumably encodes (li,lj,lk) = (0,0,0) - confirm against the dispatcher. */ + int nst_per_block = blockDim.x; + int st_id = threadIdx.x; + int batch_id = blockIdx.x; + int iprim = bounds.iprim; + int jprim = bounds.jprim; + int kprim = bounds.kprim; + int ijprim = iprim * jprim; + int ijkprim = ijprim * kprim; + int *bas = envs.bas; + int nbas = envs.nbas; + double *env = envs.env; + double omega = env[PTR_RANGE_OMEGA]; + extern __shared__ double rw_cache[]; + double *rw = rw_cache + st_id; /* this thread's root/weight slots in dynamic shared memory; consecutive slots are nst_per_block doubles apart */ + + int nksh = bounds.nksh; + int nshl_pair = bounds.nshl_pair; + int nst = nksh * nshl_pair; + int st0 = batch_id * nst_per_block * BATCHES_PER_BLOCK; + int st1 = MIN(nst, st0 + nst_per_block * BATCHES_PER_BLOCK); + for (int ijk_idx = st0 + st_id; ijk_idx < st1; ijk_idx += nst_per_block) { + int ksh_in_auxmol = ijk_idx % nksh; + int ksh = ksh_in_auxmol + bounds.ksh0; + int shl_pair_idx = ijk_idx / nksh; + int bas_ij = bounds.bas_ij_idx[shl_pair_idx]; + int ish = bas_ij / nbas; + int jsh = bas_ij % nbas; + double *expi = env + bas[ish*BAS_SLOTS+PTR_EXP]; + double *expj = env + bas[jsh*BAS_SLOTS+PTR_EXP]; + double *expk = env + bas[ksh*BAS_SLOTS+PTR_EXP]; + double *ci = env + bas[ish*BAS_SLOTS+PTR_COEFF]; + double *cj = env + bas[jsh*BAS_SLOTS+PTR_COEFF]; + double *ck = env + bas[ksh*BAS_SLOTS+PTR_COEFF]; + double *ri = env + bas[ish*BAS_SLOTS+PTR_BAS_COORD]; + double *rj = env + bas[jsh*BAS_SLOTS+PTR_BAS_COORD]; + double *rk = env + bas[ksh*BAS_SLOTS+PTR_BAS_COORD]; + double xi = ri[0]; + double yi = ri[1]; + double zi = ri[2]; + double xk = rk[0]; + double yk = rk[1]; + double zk = rk[2]; + double xjxi = rj[0] - xi; + double yjyi = rj[1] - yi; + double zjzi = rj[2] - zi; + double rr_ij = xjxi*xjxi + yjyi*yjyi + zjzi*zjzi; + double gout0 = 0; + for (int ijkp = 0; ijkp < ijkprim; ++ijkp) { /* single flattened loop over all (ip, jp, kp) primitive combinations */ + int ijp = ijkp / kprim; + int kp = ijkp % kprim; + int ip = ijp / jprim; + int jp = ijp % jprim; + double ai = expi[ip]; + double aj = expj[jp]; + double ak =
expk[kp]; + double aij = ai + aj; + double cijk = ci[ip] * cj[jp] * ck[kp]; + double fac = PI_FAC * cijk / (aij*ak*sqrt(aij+ak)); + double aj_aij = aj / aij; + double theta_ij = ai * aj_aij; + double Kab = theta_ij * rr_ij; + double fac1 = fac * exp(-Kab); + double xij = xjxi * aj_aij + xi; + double yij = yjyi * aj_aij + yi; + double zij = zjzi * aj_aij + zi; + double xpq = xij - xk; + double ypq = yij - yk; + double zpq = zij - zk; + double rr = xpq * xpq + ypq * ypq + zpq * zpq; + double theta = aij * ak / (aij + ak); + double theta_rr = theta * rr; /* omega == 0: roots and weights are used as rys_roots returns them. omega > 0: they are evaluated at theta_fac*theta_rr, then root slots are scaled by theta_fac and weight slots by sqrt(theta_fac). omega < 0: an unscaled set is stored at rw1 and a scaled set with negated weights at rw */ + if (omega == 0) { + rys_roots(1, theta_rr, rw, nst_per_block, 0, 1); + } else if (omega > 0) { + double omega2 = omega * omega; + double theta_fac = omega2 / (omega2 + theta); + rys_roots(1, theta_fac*theta_rr, rw, nst_per_block, 0, 1); + double sqrt_theta_fac = sqrt(theta_fac); + for (int irys = 0; irys < 1; ++irys) { + rw[ irys*2 *nst_per_block] *= theta_fac; + rw[(irys*2+1)*nst_per_block] *= sqrt_theta_fac; + } + } else { + double omega2 = omega * omega; + double theta_fac = omega2 / (omega2 + theta); + double *rw1 = rw + 2*nst_per_block; + rys_roots(1, theta_rr, rw1, nst_per_block, 0, 1); + rys_roots(1, theta_fac*theta_rr, rw, nst_per_block, 0, 1); + double sqrt_theta_fac = -sqrt(theta_fac); + for (int irys = 0; irys < 1; ++irys) { + rw[ irys*2 *nst_per_block] *= theta_fac; + rw[(irys*2+1)*nst_per_block] *= sqrt_theta_fac; + } + } + for (int irys = 0; irys < bounds.nroots; ++irys) { /* only the weight (odd slot) is needed here; NOTE(review): presumably the host doubles nroots when omega < 0 so that the set at rw1 is included - confirm */ + double wt = rw[(2*irys+1)*nst_per_block]; + gout0 += 1 * fac1 * wt; + } + } + int naux = bounds.naux; + double *eri_tensor = out + shl_pair_idx * 1 * naux + ksh_in_auxmol * 1; /* one row of length naux per shell pair, column ksh_in_auxmol of the out pointer passed in */ + eri_tensor[0*naux + 0] = gout0; + } +} + +#if CUDA_VERSION >= 12040 +__global__ __maxnreg__(128) +#else +__global__ +#endif +void int3c2e_100(double *out, Int3c2eEnvVars envs, Int3c2eBounds bounds) +{ /* int3c2e_100: accumulates three values gout0..gout2 per (shell pair, auxiliary shell) task and writes them to three consecutive rows of out. NOTE(review): presumably the (li,lj,lk) = (1,0,0) case - confirm against the dispatcher. */ + int nst_per_block = blockDim.x; + int st_id = threadIdx.x; + int batch_id = blockIdx.x; + int iprim = bounds.iprim; + int jprim = bounds.jprim; + int kprim
= bounds.kprim; + int ijprim = iprim * jprim; + int ijkprim = ijprim * kprim; + int *bas = envs.bas; + int nbas = envs.nbas; + double *env = envs.env; + double omega = env[PTR_RANGE_OMEGA]; + extern __shared__ double rw_cache[]; + double *rw = rw_cache + st_id; /* this thread's root/weight slots in dynamic shared memory; consecutive slots are nst_per_block doubles apart */ + + int nksh = bounds.nksh; + int nshl_pair = bounds.nshl_pair; + int nst = nksh * nshl_pair; + int st0 = batch_id * nst_per_block * BATCHES_PER_BLOCK; + int st1 = MIN(nst, st0 + nst_per_block * BATCHES_PER_BLOCK); + for (int ijk_idx = st0 + st_id; ijk_idx < st1; ijk_idx += nst_per_block) { /* one (shell pair, auxiliary shell) task per iteration; the block owns tasks st0 up to st1 */ + int ksh_in_auxmol = ijk_idx % nksh; + int ksh = ksh_in_auxmol + bounds.ksh0; + int shl_pair_idx = ijk_idx / nksh; + int bas_ij = bounds.bas_ij_idx[shl_pair_idx]; + int ish = bas_ij / nbas; + int jsh = bas_ij % nbas; + double *expi = env + bas[ish*BAS_SLOTS+PTR_EXP]; + double *expj = env + bas[jsh*BAS_SLOTS+PTR_EXP]; + double *expk = env + bas[ksh*BAS_SLOTS+PTR_EXP]; + double *ci = env + bas[ish*BAS_SLOTS+PTR_COEFF]; + double *cj = env + bas[jsh*BAS_SLOTS+PTR_COEFF]; + double *ck = env + bas[ksh*BAS_SLOTS+PTR_COEFF]; + double *ri = env + bas[ish*BAS_SLOTS+PTR_BAS_COORD]; + double *rj = env + bas[jsh*BAS_SLOTS+PTR_BAS_COORD]; + double *rk = env + bas[ksh*BAS_SLOTS+PTR_BAS_COORD]; + double xi = ri[0]; + double yi = ri[1]; + double zi = ri[2]; + double xk = rk[0]; + double yk = rk[1]; + double zk = rk[2]; + double xjxi = rj[0] - xi; + double yjyi = rj[1] - yi; + double zjzi = rj[2] - zi; + double rr_ij = xjxi*xjxi + yjyi*yjyi + zjzi*zjzi; + double gout0 = 0; + double gout1 = 0; + double gout2 = 0; /* three accumulators, one per output row written after the primitive loop */ + for (int ijkp = 0; ijkp < ijkprim; ++ijkp) { + int ijp = ijkp / kprim; + int kp = ijkp % kprim; + int ip = ijp / jprim; + int jp = ijp % jprim; + double ai = expi[ip]; + double aj = expj[jp]; + double ak = expk[kp]; + double aij = ai + aj; + double cijk = ci[ip] * cj[jp] * ck[kp]; + double fac = PI_FAC * cijk / (aij*ak*sqrt(aij+ak)); + double aj_aij = aj / aij; + double theta_ij = ai * aj_aij; + double Kab = theta_ij * rr_ij; +
double fac1 = fac * exp(-Kab); + double xij = xjxi * aj_aij + xi; + double yij = yjyi * aj_aij + yi; + double zij = zjzi * aj_aij + zi; + double xpq = xij - xk; + double ypq = yij - yk; + double zpq = zij - zk; + double rr = xpq * xpq + ypq * ypq + zpq * zpq; + double theta = aij * ak / (aij + ak); + double theta_rr = theta * rr; /* omega == 0: roots and weights are used as rys_roots returns them. omega > 0: they are evaluated at theta_fac*theta_rr, then root slots are scaled by theta_fac and weight slots by sqrt(theta_fac). omega < 0: an unscaled set is stored at rw1 and a scaled set with negated weights at rw */ + if (omega == 0) { + rys_roots(1, theta_rr, rw, nst_per_block, 0, 1); + } else if (omega > 0) { + double omega2 = omega * omega; + double theta_fac = omega2 / (omega2 + theta); + rys_roots(1, theta_fac*theta_rr, rw, nst_per_block, 0, 1); + double sqrt_theta_fac = sqrt(theta_fac); + for (int irys = 0; irys < 1; ++irys) { + rw[ irys*2 *nst_per_block] *= theta_fac; + rw[(irys*2+1)*nst_per_block] *= sqrt_theta_fac; + } + } else { + double omega2 = omega * omega; + double theta_fac = omega2 / (omega2 + theta); + double *rw1 = rw + 2*nst_per_block; + rys_roots(1, theta_rr, rw1, nst_per_block, 0, 1); + rys_roots(1, theta_fac*theta_rr, rw, nst_per_block, 0, 1); + double sqrt_theta_fac = -sqrt(theta_fac); + for (int irys = 0; irys < 1; ++irys) { + rw[ irys*2 *nst_per_block] *= theta_fac; + rw[(irys*2+1)*nst_per_block] *= sqrt_theta_fac; + } + } + for (int irys = 0; irys < bounds.nroots; ++irys) { /* even slot holds the root, odd slot the weight; fac1 is folded into the y factors and wt into the z factors, so every product below contains each exactly once */ + double wt = rw[(2*irys+1)*nst_per_block]; + double rt = rw[ 2*irys *nst_per_block]; + double rt_aa = rt / (aij + ak); + double rt_aij = rt_aa * ak; + double c0x = xjxi * aj_aij - xpq*rt_aij; + double trr_10x = c0x * 1; + gout0 += trr_10x * fac1 * wt; + double c0y = yjyi * aj_aij - ypq*rt_aij; + double trr_10y = c0y * fac1; + gout1 += 1 * trr_10y * wt; + double c0z = zjzi * aj_aij - zpq*rt_aij; + double trr_10z = c0z * wt; + gout2 += 1 * fac1 * trr_10z; + } + } + int naux = bounds.naux; + double *eri_tensor = out + shl_pair_idx * 3 * naux + ksh_in_auxmol * 1; /* three consecutive rows of length naux per shell pair, column ksh_in_auxmol of the out pointer passed in */ + eri_tensor[0*naux + 0] = gout0; + eri_tensor[1*naux + 0] = gout1; + eri_tensor[2*naux + 0] = gout2; + } +} + +#if CUDA_VERSION >= 12040 +__global__ __maxnreg__(128) +#else +__global__ +#endif
+void int3c2e_110(double *out, Int3c2eEnvVars envs, Int3c2eBounds bounds) +{ /* int3c2e_110: accumulates nine values gout0..gout8 per (shell pair, auxiliary shell) task, calling rys_roots with 2 as its first argument, and writes them to nine consecutive rows of out. A block covers nst_per_block * BATCHES_PER_BLOCK consecutive tasks and thread st_id takes every nst_per_block-th one. NOTE(review): presumably the (li,lj,lk) = (1,1,0) case - confirm against the dispatcher. */ + int nst_per_block = blockDim.x; + int st_id = threadIdx.x; + int batch_id = blockIdx.x; + int iprim = bounds.iprim; + int jprim = bounds.jprim; + int kprim = bounds.kprim; + int ijprim = iprim * jprim; + int ijkprim = ijprim * kprim; + int *bas = envs.bas; + int nbas = envs.nbas; + double *env = envs.env; + double omega = env[PTR_RANGE_OMEGA]; + extern __shared__ double rw_cache[]; + double *rw = rw_cache + st_id; /* this thread's root/weight slots in dynamic shared memory; consecutive slots are nst_per_block doubles apart */ + + int nksh = bounds.nksh; + int nshl_pair = bounds.nshl_pair; + int nst = nksh * nshl_pair; + int st0 = batch_id * nst_per_block * BATCHES_PER_BLOCK; + int st1 = MIN(nst, st0 + nst_per_block * BATCHES_PER_BLOCK); + for (int ijk_idx = st0 + st_id; ijk_idx < st1; ijk_idx += nst_per_block) { + int ksh_in_auxmol = ijk_idx % nksh; + int ksh = ksh_in_auxmol + bounds.ksh0; + int shl_pair_idx = ijk_idx / nksh; + int bas_ij = bounds.bas_ij_idx[shl_pair_idx]; + int ish = bas_ij / nbas; + int jsh = bas_ij % nbas; + double *expi = env + bas[ish*BAS_SLOTS+PTR_EXP]; + double *expj = env + bas[jsh*BAS_SLOTS+PTR_EXP]; + double *expk = env + bas[ksh*BAS_SLOTS+PTR_EXP]; + double *ci = env + bas[ish*BAS_SLOTS+PTR_COEFF]; + double *cj = env + bas[jsh*BAS_SLOTS+PTR_COEFF]; + double *ck = env + bas[ksh*BAS_SLOTS+PTR_COEFF]; + double *ri = env + bas[ish*BAS_SLOTS+PTR_BAS_COORD]; + double *rj = env + bas[jsh*BAS_SLOTS+PTR_BAS_COORD]; + double *rk = env + bas[ksh*BAS_SLOTS+PTR_BAS_COORD]; + double xi = ri[0]; + double yi = ri[1]; + double zi = ri[2]; + double xk = rk[0]; + double yk = rk[1]; + double zk = rk[2]; + double xjxi = rj[0] - xi; + double yjyi = rj[1] - yi; + double zjzi = rj[2] - zi; + double rr_ij = xjxi*xjxi + yjyi*yjyi + zjzi*zjzi; + double gout0 = 0; + double gout1 = 0; + double gout2 = 0; + double gout3 = 0; + double gout4 = 0; + double gout5 = 0; + double gout6 = 0; + double gout7 = 0; + double gout8 = 0; + for (int ijkp = 0; ijkp < ijkprim; ++ijkp) { /* single flattened loop over all (ip, jp, kp) primitive combinations */ + int ijp = ijkp / kprim; + int
kp = ijkp % kprim; + int ip = ijp / jprim; + int jp = ijp % jprim; + double ai = expi[ip]; + double aj = expj[jp]; + double ak = expk[kp]; + double aij = ai + aj; + double cijk = ci[ip] * cj[jp] * ck[kp]; + double fac = PI_FAC * cijk / (aij*ak*sqrt(aij+ak)); + double aj_aij = aj / aij; + double theta_ij = ai * aj_aij; + double Kab = theta_ij * rr_ij; + double fac1 = fac * exp(-Kab); + double xij = xjxi * aj_aij + xi; + double yij = yjyi * aj_aij + yi; + double zij = zjzi * aj_aij + zi; + double xpq = xij - xk; + double ypq = yij - yk; + double zpq = zij - zk; + double rr = xpq * xpq + ypq * ypq + zpq * zpq; + double theta = aij * ak / (aij + ak); + double theta_rr = theta * rr; /* omega == 0: roots and weights are used as rys_roots returns them. omega > 0: they are evaluated at theta_fac*theta_rr, then root slots are scaled by theta_fac and weight slots by sqrt(theta_fac). omega < 0: an unscaled set is stored at rw1 and a scaled set with negated weights at rw */ + if (omega == 0) { + rys_roots(2, theta_rr, rw, nst_per_block, 0, 1); + } else if (omega > 0) { + double omega2 = omega * omega; + double theta_fac = omega2 / (omega2 + theta); + rys_roots(2, theta_fac*theta_rr, rw, nst_per_block, 0, 1); + double sqrt_theta_fac = sqrt(theta_fac); + for (int irys = 0; irys < 2; ++irys) { + rw[ irys*2 *nst_per_block] *= theta_fac; + rw[(irys*2+1)*nst_per_block] *= sqrt_theta_fac; + } + } else { + double omega2 = omega * omega; + double theta_fac = omega2 / (omega2 + theta); + double *rw1 = rw + 4*nst_per_block; + rys_roots(2, theta_rr, rw1, nst_per_block, 0, 1); + rys_roots(2, theta_fac*theta_rr, rw, nst_per_block, 0, 1); + double sqrt_theta_fac = -sqrt(theta_fac); + for (int irys = 0; irys < 2; ++irys) { + rw[ irys*2 *nst_per_block] *= theta_fac; + rw[(irys*2+1)*nst_per_block] *= sqrt_theta_fac; + } + } + for (int irys = 0; irys < bounds.nroots; ++irys) { /* even slot holds the root, odd slot the weight; fac1 is folded into the y factors and wt into the z factors, so every product below contains each exactly once */ + double wt = rw[(2*irys+1)*nst_per_block]; + double rt = rw[ 2*irys *nst_per_block]; + double rt_aa = rt / (aij + ak); + double rt_aij = rt_aa * ak; + double b10 = .5/aij * (1 - rt_aij); /* trr values are built from c0 and b10; each hrr value is the difference trr(n+1) - xjxi*trr(n), likewise with yjyi and zjzi */ + double c0x = xjxi * aj_aij - xpq*rt_aij; + double trr_10x = c0x * 1; + double trr_20x = c0x * trr_10x + 1*b10 * 1; + double hrr_110x = trr_20x - xjxi * trr_10x; + gout0 += hrr_110x * fac1 * wt; + double hrr_010x =
trr_10x - xjxi * 1; + double c0y = yjyi * aj_aij - ypq*rt_aij; + double trr_10y = c0y * fac1; + gout1 += hrr_010x * trr_10y * wt; + double c0z = zjzi * aj_aij - zpq*rt_aij; + double trr_10z = c0z * wt; + gout2 += hrr_010x * fac1 * trr_10z; + double hrr_010y = trr_10y - yjyi * fac1; + gout3 += trr_10x * hrr_010y * wt; + double trr_20y = c0y * trr_10y + 1*b10 * fac1; + double hrr_110y = trr_20y - yjyi * trr_10y; + gout4 += 1 * hrr_110y * wt; + gout5 += 1 * hrr_010y * trr_10z; + double hrr_010z = trr_10z - zjzi * wt; + gout6 += trr_10x * fac1 * hrr_010z; + gout7 += 1 * trr_10y * hrr_010z; + double trr_20z = c0z * trr_10z + 1*b10 * wt; + double hrr_110z = trr_20z - zjzi * trr_10z; + gout8 += 1 * fac1 * hrr_110z; + } + } + int naux = bounds.naux; + double *eri_tensor = out + shl_pair_idx * 9 * naux + ksh_in_auxmol * 1; /* nine consecutive rows of length naux per shell pair, column ksh_in_auxmol of the out pointer passed in */ + eri_tensor[0*naux + 0] = gout0; + eri_tensor[1*naux + 0] = gout1; + eri_tensor[2*naux + 0] = gout2; + eri_tensor[3*naux + 0] = gout3; + eri_tensor[4*naux + 0] = gout4; + eri_tensor[5*naux + 0] = gout5; + eri_tensor[6*naux + 0] = gout6; + eri_tensor[7*naux + 0] = gout7; + eri_tensor[8*naux + 0] = gout8; + } +} + +#if CUDA_VERSION >= 12040 +__global__ __maxnreg__(128) +#else +__global__ +#endif +void int3c2e_200(double *out, Int3c2eEnvVars envs, Int3c2eBounds bounds) +{ /* int3c2e_200: accumulates six values gout0..gout5 per (shell pair, auxiliary shell) task and writes them to six consecutive rows of out. NOTE(review): presumably the (li,lj,lk) = (2,0,0) case - confirm against the dispatcher. */ + int nst_per_block = blockDim.x; + int st_id = threadIdx.x; + int batch_id = blockIdx.x; + int iprim = bounds.iprim; + int jprim = bounds.jprim; + int kprim = bounds.kprim; + int ijprim = iprim * jprim; + int ijkprim = ijprim * kprim; + int *bas = envs.bas; + int nbas = envs.nbas; + double *env = envs.env; + double omega = env[PTR_RANGE_OMEGA]; + extern __shared__ double rw_cache[]; + double *rw = rw_cache + st_id; + + int nksh = bounds.nksh; + int nshl_pair = bounds.nshl_pair; + int nst = nksh * nshl_pair; + int st0 = batch_id * nst_per_block * BATCHES_PER_BLOCK; + int st1 = MIN(nst, st0 + nst_per_block * BATCHES_PER_BLOCK); + for (int ijk_idx = st0 + st_id; ijk_idx < st1; ijk_idx +=
nst_per_block) { /* one (shell pair, auxiliary shell) task per iteration: ijk_idx % nksh selects the auxiliary shell, ijk_idx / nksh the shell pair */ + int ksh_in_auxmol = ijk_idx % nksh; + int ksh = ksh_in_auxmol + bounds.ksh0; + int shl_pair_idx = ijk_idx / nksh; + int bas_ij = bounds.bas_ij_idx[shl_pair_idx]; + int ish = bas_ij / nbas; + int jsh = bas_ij % nbas; + double *expi = env + bas[ish*BAS_SLOTS+PTR_EXP]; + double *expj = env + bas[jsh*BAS_SLOTS+PTR_EXP]; + double *expk = env + bas[ksh*BAS_SLOTS+PTR_EXP]; + double *ci = env + bas[ish*BAS_SLOTS+PTR_COEFF]; + double *cj = env + bas[jsh*BAS_SLOTS+PTR_COEFF]; + double *ck = env + bas[ksh*BAS_SLOTS+PTR_COEFF]; + double *ri = env + bas[ish*BAS_SLOTS+PTR_BAS_COORD]; + double *rj = env + bas[jsh*BAS_SLOTS+PTR_BAS_COORD]; + double *rk = env + bas[ksh*BAS_SLOTS+PTR_BAS_COORD]; + double xi = ri[0]; + double yi = ri[1]; + double zi = ri[2]; + double xk = rk[0]; + double yk = rk[1]; + double zk = rk[2]; + double xjxi = rj[0] - xi; + double yjyi = rj[1] - yi; + double zjzi = rj[2] - zi; + double rr_ij = xjxi*xjxi + yjyi*yjyi + zjzi*zjzi; + double gout0 = 0; + double gout1 = 0; + double gout2 = 0; + double gout3 = 0; + double gout4 = 0; + double gout5 = 0; /* six accumulators, one per output row written after the primitive loop */ + for (int ijkp = 0; ijkp < ijkprim; ++ijkp) { + int ijp = ijkp / kprim; + int kp = ijkp % kprim; + int ip = ijp / jprim; + int jp = ijp % jprim; + double ai = expi[ip]; + double aj = expj[jp]; + double ak = expk[kp]; + double aij = ai + aj; + double cijk = ci[ip] * cj[jp] * ck[kp]; + double fac = PI_FAC * cijk / (aij*ak*sqrt(aij+ak)); + double aj_aij = aj / aij; + double theta_ij = ai * aj_aij; + double Kab = theta_ij * rr_ij; + double fac1 = fac * exp(-Kab); + double xij = xjxi * aj_aij + xi; + double yij = yjyi * aj_aij + yi; + double zij = zjzi * aj_aij + zi; + double xpq = xij - xk; + double ypq = yij - yk; + double zpq = zij - zk; + double rr = xpq * xpq + ypq * ypq + zpq * zpq; + double theta = aij * ak / (aij + ak); + double theta_rr = theta * rr; /* omega == 0: roots and weights are used as rys_roots returns them. omega > 0: they are evaluated at theta_fac*theta_rr, then root slots are scaled by theta_fac and weight slots by sqrt(theta_fac). omega < 0: an unscaled set is stored at rw1 and a scaled set with negated weights at rw */ + if (omega == 0) { + rys_roots(2, theta_rr, rw, nst_per_block, 0, 1); + } else if (omega > 0) { + double omega2 = omega * omega; + double
theta_fac = omega2 / (omega2 + theta); + rys_roots(2, theta_fac*theta_rr, rw, nst_per_block, 0, 1); + double sqrt_theta_fac = sqrt(theta_fac); + for (int irys = 0; irys < 2; ++irys) { + rw[ irys*2 *nst_per_block] *= theta_fac; + rw[(irys*2+1)*nst_per_block] *= sqrt_theta_fac; + } + } else { + double omega2 = omega * omega; + double theta_fac = omega2 / (omega2 + theta); + double *rw1 = rw + 4*nst_per_block; + rys_roots(2, theta_rr, rw1, nst_per_block, 0, 1); + rys_roots(2, theta_fac*theta_rr, rw, nst_per_block, 0, 1); + double sqrt_theta_fac = -sqrt(theta_fac); + for (int irys = 0; irys < 2; ++irys) { + rw[ irys*2 *nst_per_block] *= theta_fac; + rw[(irys*2+1)*nst_per_block] *= sqrt_theta_fac; + } + } + for (int irys = 0; irys < bounds.nroots; ++irys) { /* even slot holds the root, odd slot the weight; fac1 is folded into the y factors and wt into the z factors, so every product below contains each exactly once */ + double wt = rw[(2*irys+1)*nst_per_block]; + double rt = rw[ 2*irys *nst_per_block]; + double rt_aa = rt / (aij + ak); + double rt_aij = rt_aa * ak; + double b10 = .5/aij * (1 - rt_aij); + double c0x = xjxi * aj_aij - xpq*rt_aij; + double trr_10x = c0x * 1; + double trr_20x = c0x * trr_10x + 1*b10 * 1; + gout0 += trr_20x * fac1 * wt; + double c0y = yjyi * aj_aij - ypq*rt_aij; + double trr_10y = c0y * fac1; + gout1 += trr_10x * trr_10y * wt; + double c0z = zjzi * aj_aij - zpq*rt_aij; + double trr_10z = c0z * wt; + gout2 += trr_10x * fac1 * trr_10z; + double trr_20y = c0y * trr_10y + 1*b10 * fac1; + gout3 += 1 * trr_20y * wt; + gout4 += 1 * trr_10y * trr_10z; + double trr_20z = c0z * trr_10z + 1*b10 * wt; + gout5 += 1 * fac1 * trr_20z; + } + } + int naux = bounds.naux; + double *eri_tensor = out + shl_pair_idx * 6 * naux + ksh_in_auxmol * 1; /* six consecutive rows of length naux per shell pair, column ksh_in_auxmol of the out pointer passed in */ + eri_tensor[0*naux + 0] = gout0; + eri_tensor[1*naux + 0] = gout1; + eri_tensor[2*naux + 0] = gout2; + eri_tensor[3*naux + 0] = gout3; + eri_tensor[4*naux + 0] = gout4; + eri_tensor[5*naux + 0] = gout5; + } +} + +#if CUDA_VERSION >= 12040 +__global__ __maxnreg__(128) +#else +__global__ +#endif +void int3c2e_210(double *out, Int3c2eEnvVars envs, Int3c2eBounds bounds) +{ /* int3c2e_210: accumulates eighteen values gout0..gout17 per (shell pair, auxiliary shell) task and writes them to eighteen consecutive rows of out. NOTE(review): presumably the (li,lj,lk) = (2,1,0) case - confirm against the dispatcher. */ + int
nst_per_block = blockDim.x; + int st_id = threadIdx.x; + int batch_id = blockIdx.x; + int iprim = bounds.iprim; + int jprim = bounds.jprim; + int kprim = bounds.kprim; + int ijprim = iprim * jprim; + int ijkprim = ijprim * kprim; + int *bas = envs.bas; + int nbas = envs.nbas; + double *env = envs.env; + double omega = env[PTR_RANGE_OMEGA]; + extern __shared__ double rw_cache[]; + double *rw = rw_cache + st_id; /* this thread's root/weight slots in dynamic shared memory; consecutive slots are nst_per_block doubles apart */ + + int nksh = bounds.nksh; + int nshl_pair = bounds.nshl_pair; + int nst = nksh * nshl_pair; + int st0 = batch_id * nst_per_block * BATCHES_PER_BLOCK; + int st1 = MIN(nst, st0 + nst_per_block * BATCHES_PER_BLOCK); + for (int ijk_idx = st0 + st_id; ijk_idx < st1; ijk_idx += nst_per_block) { /* one (shell pair, auxiliary shell) task per iteration; the block owns tasks st0 up to st1 */ + int ksh_in_auxmol = ijk_idx % nksh; + int ksh = ksh_in_auxmol + bounds.ksh0; + int shl_pair_idx = ijk_idx / nksh; + int bas_ij = bounds.bas_ij_idx[shl_pair_idx]; + int ish = bas_ij / nbas; + int jsh = bas_ij % nbas; + double *expi = env + bas[ish*BAS_SLOTS+PTR_EXP]; + double *expj = env + bas[jsh*BAS_SLOTS+PTR_EXP]; + double *expk = env + bas[ksh*BAS_SLOTS+PTR_EXP]; + double *ci = env + bas[ish*BAS_SLOTS+PTR_COEFF]; + double *cj = env + bas[jsh*BAS_SLOTS+PTR_COEFF]; + double *ck = env + bas[ksh*BAS_SLOTS+PTR_COEFF]; + double *ri = env + bas[ish*BAS_SLOTS+PTR_BAS_COORD]; + double *rj = env + bas[jsh*BAS_SLOTS+PTR_BAS_COORD]; + double *rk = env + bas[ksh*BAS_SLOTS+PTR_BAS_COORD]; + double xi = ri[0]; + double yi = ri[1]; + double zi = ri[2]; + double xk = rk[0]; + double yk = rk[1]; + double zk = rk[2]; + double xjxi = rj[0] - xi; + double yjyi = rj[1] - yi; + double zjzi = rj[2] - zi; + double rr_ij = xjxi*xjxi + yjyi*yjyi + zjzi*zjzi; + double gout0 = 0; + double gout1 = 0; + double gout2 = 0; + double gout3 = 0; + double gout4 = 0; + double gout5 = 0; + double gout6 = 0; + double gout7 = 0; + double gout8 = 0; + double gout9 = 0; + double gout10 = 0; + double gout11 = 0; + double gout12 = 0; + double gout13 = 0; + double gout14 = 0; + double gout15 = 0; + double gout16
= 0; + double gout17 = 0; + for (int ijkp = 0; ijkp < ijkprim; ++ijkp) { /* single flattened loop over all (ip, jp, kp) primitive combinations */ + int ijp = ijkp / kprim; + int kp = ijkp % kprim; + int ip = ijp / jprim; + int jp = ijp % jprim; + double ai = expi[ip]; + double aj = expj[jp]; + double ak = expk[kp]; + double aij = ai + aj; + double cijk = ci[ip] * cj[jp] * ck[kp]; + double fac = PI_FAC * cijk / (aij*ak*sqrt(aij+ak)); + double aj_aij = aj / aij; + double theta_ij = ai * aj_aij; + double Kab = theta_ij * rr_ij; + double fac1 = fac * exp(-Kab); + double xij = xjxi * aj_aij + xi; + double yij = yjyi * aj_aij + yi; + double zij = zjzi * aj_aij + zi; + double xpq = xij - xk; + double ypq = yij - yk; + double zpq = zij - zk; + double rr = xpq * xpq + ypq * ypq + zpq * zpq; + double theta = aij * ak / (aij + ak); + double theta_rr = theta * rr; /* omega == 0: roots and weights are used as rys_roots returns them. omega > 0: they are evaluated at theta_fac*theta_rr, then root slots are scaled by theta_fac and weight slots by sqrt(theta_fac). omega < 0: an unscaled set is stored at rw1 and a scaled set with negated weights at rw */ + if (omega == 0) { + rys_roots(2, theta_rr, rw, nst_per_block, 0, 1); + } else if (omega > 0) { + double omega2 = omega * omega; + double theta_fac = omega2 / (omega2 + theta); + rys_roots(2, theta_fac*theta_rr, rw, nst_per_block, 0, 1); + double sqrt_theta_fac = sqrt(theta_fac); + for (int irys = 0; irys < 2; ++irys) { + rw[ irys*2 *nst_per_block] *= theta_fac; + rw[(irys*2+1)*nst_per_block] *= sqrt_theta_fac; + } + } else { + double omega2 = omega * omega; + double theta_fac = omega2 / (omega2 + theta); + double *rw1 = rw + 4*nst_per_block; + rys_roots(2, theta_rr, rw1, nst_per_block, 0, 1); + rys_roots(2, theta_fac*theta_rr, rw, nst_per_block, 0, 1); + double sqrt_theta_fac = -sqrt(theta_fac); + for (int irys = 0; irys < 2; ++irys) { + rw[ irys*2 *nst_per_block] *= theta_fac; + rw[(irys*2+1)*nst_per_block] *= sqrt_theta_fac; + } + } + for (int irys = 0; irys < bounds.nroots; ++irys) { /* even slot holds the root, odd slot the weight; fac1 is folded into the y factors and wt into the z factors, so every product below contains each exactly once */ + double wt = rw[(2*irys+1)*nst_per_block]; + double rt = rw[ 2*irys *nst_per_block]; + double rt_aa = rt / (aij + ak); + double rt_aij = rt_aa * ak; + double b10 = .5/aij * (1 - rt_aij); /* trr values are built from c0 and b10; each hrr value is the difference trr(n+1) - xjxi*trr(n), likewise with yjyi and zjzi */ + double c0x = xjxi * aj_aij - xpq*rt_aij; + double trr_10x = c0x * 1; + double trr_20x = c0x * trr_10x + 1*b10 * 1;
+ double trr_30x = c0x * trr_20x + 2*b10 * trr_10x; + double hrr_210x = trr_30x - xjxi * trr_20x; + gout0 += hrr_210x * fac1 * wt; + double hrr_110x = trr_20x - xjxi * trr_10x; + double c0y = yjyi * aj_aij - ypq*rt_aij; + double trr_10y = c0y * fac1; + gout1 += hrr_110x * trr_10y * wt; + double c0z = zjzi * aj_aij - zpq*rt_aij; + double trr_10z = c0z * wt; + gout2 += hrr_110x * fac1 * trr_10z; + double hrr_010x = trr_10x - xjxi * 1; + double trr_20y = c0y * trr_10y + 1*b10 * fac1; + gout3 += hrr_010x * trr_20y * wt; + gout4 += hrr_010x * trr_10y * trr_10z; + double trr_20z = c0z * trr_10z + 1*b10 * wt; + gout5 += hrr_010x * fac1 * trr_20z; + double hrr_010y = trr_10y - yjyi * fac1; + gout6 += trr_20x * hrr_010y * wt; + double hrr_110y = trr_20y - yjyi * trr_10y; + gout7 += trr_10x * hrr_110y * wt; + gout8 += trr_10x * hrr_010y * trr_10z; + double trr_30y = c0y * trr_20y + 2*b10 * trr_10y; + double hrr_210y = trr_30y - yjyi * trr_20y; + gout9 += 1 * hrr_210y * wt; + gout10 += 1 * hrr_110y * trr_10z; + gout11 += 1 * hrr_010y * trr_20z; + double hrr_010z = trr_10z - zjzi * wt; + gout12 += trr_20x * fac1 * hrr_010z; + gout13 += trr_10x * trr_10y * hrr_010z; + double hrr_110z = trr_20z - zjzi * trr_10z; + gout14 += trr_10x * fac1 * hrr_110z; + gout15 += 1 * trr_20y * hrr_010z; + gout16 += 1 * trr_10y * hrr_110z; + double trr_30z = c0z * trr_20z + 2*b10 * trr_10z; + double hrr_210z = trr_30z - zjzi * trr_20z; + gout17 += 1 * fac1 * hrr_210z; + } + } + int naux = bounds.naux; + double *eri_tensor = out + shl_pair_idx * 18 * naux + ksh_in_auxmol * 1; /* eighteen consecutive rows of length naux per shell pair, column ksh_in_auxmol of the out pointer passed in */ + eri_tensor[0*naux + 0] = gout0; + eri_tensor[1*naux + 0] = gout1; + eri_tensor[2*naux + 0] = gout2; + eri_tensor[3*naux + 0] = gout3; + eri_tensor[4*naux + 0] = gout4; + eri_tensor[5*naux + 0] = gout5; + eri_tensor[6*naux + 0] = gout6; + eri_tensor[7*naux + 0] = gout7; + eri_tensor[8*naux + 0] = gout8; + eri_tensor[9*naux + 0] = gout9; + eri_tensor[10*naux + 0] = gout10; + eri_tensor[11*naux + 0] = gout11; +
eri_tensor[12*naux + 0] = gout12; + eri_tensor[13*naux + 0] = gout13; + eri_tensor[14*naux + 0] = gout14; + eri_tensor[15*naux + 0] = gout15; + eri_tensor[16*naux + 0] = gout16; + eri_tensor[17*naux + 0] = gout17; + } +} + +__global__ +void int3c2e_220(double *out, Int3c2eEnvVars envs, Int3c2eBounds bounds) +{ + int nst_per_block = blockDim.x; + int st_id = threadIdx.x; + int batch_id = blockIdx.x; + int iprim = bounds.iprim; + int jprim = bounds.jprim; + int kprim = bounds.kprim; + int ijprim = iprim * jprim; + int ijkprim = ijprim * kprim; + int *bas = envs.bas; + int nbas = envs.nbas; + double *env = envs.env; + double omega = env[PTR_RANGE_OMEGA]; + extern __shared__ double rw_cache[]; + double *rw = rw_cache + st_id; + + int nksh = bounds.nksh; + int nshl_pair = bounds.nshl_pair; + int nst = nksh * nshl_pair; + int st0 = batch_id * nst_per_block * BATCHES_PER_BLOCK; + int st1 = MIN(nst, st0 + nst_per_block * BATCHES_PER_BLOCK); + for (int ijk_idx = st0 + st_id; ijk_idx < st1; ijk_idx += nst_per_block) { + int ksh_in_auxmol = ijk_idx % nksh; + int ksh = ksh_in_auxmol + bounds.ksh0; + int shl_pair_idx = ijk_idx / nksh; + int bas_ij = bounds.bas_ij_idx[shl_pair_idx]; + int ish = bas_ij / nbas; + int jsh = bas_ij % nbas; + double *expi = env + bas[ish*BAS_SLOTS+PTR_EXP]; + double *expj = env + bas[jsh*BAS_SLOTS+PTR_EXP]; + double *expk = env + bas[ksh*BAS_SLOTS+PTR_EXP]; + double *ci = env + bas[ish*BAS_SLOTS+PTR_COEFF]; + double *cj = env + bas[jsh*BAS_SLOTS+PTR_COEFF]; + double *ck = env + bas[ksh*BAS_SLOTS+PTR_COEFF]; + double *ri = env + bas[ish*BAS_SLOTS+PTR_BAS_COORD]; + double *rj = env + bas[jsh*BAS_SLOTS+PTR_BAS_COORD]; + double *rk = env + bas[ksh*BAS_SLOTS+PTR_BAS_COORD]; + double xi = ri[0]; + double yi = ri[1]; + double zi = ri[2]; + double xk = rk[0]; + double yk = rk[1]; + double zk = rk[2]; + double xjxi = rj[0] - xi; + double yjyi = rj[1] - yi; + double zjzi = rj[2] - zi; + double rr_ij = xjxi*xjxi + yjyi*yjyi + zjzi*zjzi; + double gout0 = 0; +
double gout1 = 0; + double gout2 = 0; + double gout3 = 0; + double gout4 = 0; + double gout5 = 0; + double gout6 = 0; + double gout7 = 0; + double gout8 = 0; + double gout9 = 0; + double gout10 = 0; + double gout11 = 0; + double gout12 = 0; + double gout13 = 0; + double gout14 = 0; + double gout15 = 0; + double gout16 = 0; + double gout17 = 0; + double gout18 = 0; + double gout19 = 0; + double gout20 = 0; + double gout21 = 0; + double gout22 = 0; + double gout23 = 0; + double gout24 = 0; + double gout25 = 0; + double gout26 = 0; + double gout27 = 0; + double gout28 = 0; + double gout29 = 0; + double gout30 = 0; + double gout31 = 0; + double gout32 = 0; + double gout33 = 0; + double gout34 = 0; + double gout35 = 0; + for (int ijkp = 0; ijkp < ijkprim; ++ijkp) { + int ijp = ijkp / kprim; + int kp = ijkp % kprim; + int ip = ijp / jprim; + int jp = ijp % jprim; + double ai = expi[ip]; + double aj = expj[jp]; + double ak = expk[kp]; + double aij = ai + aj; + double cijk = ci[ip] * cj[jp] * ck[kp]; + double fac = PI_FAC * cijk / (aij*ak*sqrt(aij+ak)); + double aj_aij = aj / aij; + double theta_ij = ai * aj_aij; + double Kab = theta_ij * rr_ij; + double fac1 = fac * exp(-Kab); + double xij = xjxi * aj_aij + xi; + double yij = yjyi * aj_aij + yi; + double zij = zjzi * aj_aij + zi; + double xpq = xij - xk; + double ypq = yij - yk; + double zpq = zij - zk; + double rr = xpq * xpq + ypq * ypq + zpq * zpq; + double theta = aij * ak / (aij + ak); + double theta_rr = theta * rr; + if (omega == 0) { + rys_roots(3, theta_rr, rw, nst_per_block, 0, 1); + } else if (omega > 0) { + double omega2 = omega * omega; + double theta_fac = omega2 / (omega2 + theta); + rys_roots(3, theta_fac*theta_rr, rw, nst_per_block, 0, 1); + double sqrt_theta_fac = sqrt(theta_fac); + for (int irys = 0; irys < 3; ++irys) { + rw[ irys*2 *nst_per_block] *= theta_fac; + rw[(irys*2+1)*nst_per_block] *= sqrt_theta_fac; + } + } else { + double omega2 = omega * omega; + double theta_fac = omega2 / (omega2 + 
theta); + double *rw1 = rw + 6*nst_per_block; + rys_roots(3, theta_rr, rw1, nst_per_block, 0, 1); + rys_roots(3, theta_fac*theta_rr, rw, nst_per_block, 0, 1); + double sqrt_theta_fac = -sqrt(theta_fac); + for (int irys = 0; irys < 3; ++irys) { + rw[ irys*2 *nst_per_block] *= theta_fac; + rw[(irys*2+1)*nst_per_block] *= sqrt_theta_fac; + } + } + for (int irys = 0; irys < bounds.nroots; ++irys) { + double wt = rw[(2*irys+1)*nst_per_block]; + double rt = rw[ 2*irys *nst_per_block]; + double rt_aa = rt / (aij + ak); + double rt_aij = rt_aa * ak; + double b10 = .5/aij * (1 - rt_aij); + double c0x = xjxi * aj_aij - xpq*rt_aij; + double trr_10x = c0x * 1; + double trr_20x = c0x * trr_10x + 1*b10 * 1; + double trr_30x = c0x * trr_20x + 2*b10 * trr_10x; + double trr_40x = c0x * trr_30x + 3*b10 * trr_20x; + double hrr_310x = trr_40x - xjxi * trr_30x; + double hrr_210x = trr_30x - xjxi * trr_20x; + double hrr_220x = hrr_310x - xjxi * hrr_210x; + gout0 += hrr_220x * fac1 * wt; + double hrr_110x = trr_20x - xjxi * trr_10x; + double hrr_120x = hrr_210x - xjxi * hrr_110x; + double c0y = yjyi * aj_aij - ypq*rt_aij; + double trr_10y = c0y * fac1; + gout1 += hrr_120x * trr_10y * wt; + double c0z = zjzi * aj_aij - zpq*rt_aij; + double trr_10z = c0z * wt; + gout2 += hrr_120x * fac1 * trr_10z; + double hrr_010x = trr_10x - xjxi * 1; + double hrr_020x = hrr_110x - xjxi * hrr_010x; + double trr_20y = c0y * trr_10y + 1*b10 * fac1; + gout3 += hrr_020x * trr_20y * wt; + gout4 += hrr_020x * trr_10y * trr_10z; + double trr_20z = c0z * trr_10z + 1*b10 * wt; + gout5 += hrr_020x * fac1 * trr_20z; + double hrr_010y = trr_10y - yjyi * fac1; + gout6 += hrr_210x * hrr_010y * wt; + double hrr_110y = trr_20y - yjyi * trr_10y; + gout7 += hrr_110x * hrr_110y * wt; + gout8 += hrr_110x * hrr_010y * trr_10z; + double trr_30y = c0y * trr_20y + 2*b10 * trr_10y; + double hrr_210y = trr_30y - yjyi * trr_20y; + gout9 += hrr_010x * hrr_210y * wt; + gout10 += hrr_010x * hrr_110y * trr_10z; + gout11 += hrr_010x * 
hrr_010y * trr_20z; + double hrr_010z = trr_10z - zjzi * wt; + gout12 += hrr_210x * fac1 * hrr_010z; + gout13 += hrr_110x * trr_10y * hrr_010z; + double hrr_110z = trr_20z - zjzi * trr_10z; + gout14 += hrr_110x * fac1 * hrr_110z; + gout15 += hrr_010x * trr_20y * hrr_010z; + gout16 += hrr_010x * trr_10y * hrr_110z; + double trr_30z = c0z * trr_20z + 2*b10 * trr_10z; + double hrr_210z = trr_30z - zjzi * trr_20z; + gout17 += hrr_010x * fac1 * hrr_210z; + double hrr_020y = hrr_110y - yjyi * hrr_010y; + gout18 += trr_20x * hrr_020y * wt; + double hrr_120y = hrr_210y - yjyi * hrr_110y; + gout19 += trr_10x * hrr_120y * wt; + gout20 += trr_10x * hrr_020y * trr_10z; + double trr_40y = c0y * trr_30y + 3*b10 * trr_20y; + double hrr_310y = trr_40y - yjyi * trr_30y; + double hrr_220y = hrr_310y - yjyi * hrr_210y; + gout21 += 1 * hrr_220y * wt; + gout22 += 1 * hrr_120y * trr_10z; + gout23 += 1 * hrr_020y * trr_20z; + gout24 += trr_20x * hrr_010y * hrr_010z; + gout25 += trr_10x * hrr_110y * hrr_010z; + gout26 += trr_10x * hrr_010y * hrr_110z; + gout27 += 1 * hrr_210y * hrr_010z; + gout28 += 1 * hrr_110y * hrr_110z; + gout29 += 1 * hrr_010y * hrr_210z; + double hrr_020z = hrr_110z - zjzi * hrr_010z; + gout30 += trr_20x * fac1 * hrr_020z; + gout31 += trr_10x * trr_10y * hrr_020z; + double hrr_120z = hrr_210z - zjzi * hrr_110z; + gout32 += trr_10x * fac1 * hrr_120z; + gout33 += 1 * trr_20y * hrr_020z; + gout34 += 1 * trr_10y * hrr_120z; + double trr_40z = c0z * trr_30z + 3*b10 * trr_20z; + double hrr_310z = trr_40z - zjzi * trr_30z; + double hrr_220z = hrr_310z - zjzi * hrr_210z; + gout35 += 1 * fac1 * hrr_220z; + } + } + int naux = bounds.naux; + double *eri_tensor = out + shl_pair_idx * 36 * naux + ksh_in_auxmol * 1; + eri_tensor[0*naux + 0] = gout0; + eri_tensor[1*naux + 0] = gout1; + eri_tensor[2*naux + 0] = gout2; + eri_tensor[3*naux + 0] = gout3; + eri_tensor[4*naux + 0] = gout4; + eri_tensor[5*naux + 0] = gout5; + eri_tensor[6*naux + 0] = gout6; + eri_tensor[7*naux + 0] = 
gout7; + eri_tensor[8*naux + 0] = gout8; + eri_tensor[9*naux + 0] = gout9; + eri_tensor[10*naux + 0] = gout10; + eri_tensor[11*naux + 0] = gout11; + eri_tensor[12*naux + 0] = gout12; + eri_tensor[13*naux + 0] = gout13; + eri_tensor[14*naux + 0] = gout14; + eri_tensor[15*naux + 0] = gout15; + eri_tensor[16*naux + 0] = gout16; + eri_tensor[17*naux + 0] = gout17; + eri_tensor[18*naux + 0] = gout18; + eri_tensor[19*naux + 0] = gout19; + eri_tensor[20*naux + 0] = gout20; + eri_tensor[21*naux + 0] = gout21; + eri_tensor[22*naux + 0] = gout22; + eri_tensor[23*naux + 0] = gout23; + eri_tensor[24*naux + 0] = gout24; + eri_tensor[25*naux + 0] = gout25; + eri_tensor[26*naux + 0] = gout26; + eri_tensor[27*naux + 0] = gout27; + eri_tensor[28*naux + 0] = gout28; + eri_tensor[29*naux + 0] = gout29; + eri_tensor[30*naux + 0] = gout30; + eri_tensor[31*naux + 0] = gout31; + eri_tensor[32*naux + 0] = gout32; + eri_tensor[33*naux + 0] = gout33; + eri_tensor[34*naux + 0] = gout34; + eri_tensor[35*naux + 0] = gout35; + } +} + +#if CUDA_VERSION >= 12040 +__global__ __maxnreg__(128) +#else +__global__ +#endif +void int3c2e_001(double *out, Int3c2eEnvVars envs, Int3c2eBounds bounds) +{ + int nst_per_block = blockDim.x; + int st_id = threadIdx.x; + int batch_id = blockIdx.x; + int iprim = bounds.iprim; + int jprim = bounds.jprim; + int kprim = bounds.kprim; + int ijprim = iprim * jprim; + int ijkprim = ijprim * kprim; + int *bas = envs.bas; + int nbas = envs.nbas; + double *env = envs.env; + double omega = env[PTR_RANGE_OMEGA]; + extern __shared__ double rw_cache[]; + double *rw = rw_cache + st_id; + + int nksh = bounds.nksh; + int nshl_pair = bounds.nshl_pair; + int nst = nksh * nshl_pair; + int st0 = batch_id * nst_per_block * BATCHES_PER_BLOCK; + int st1 = MIN(nst, st0 + nst_per_block * BATCHES_PER_BLOCK); + for (int ijk_idx = st0 + st_id; ijk_idx < st1; ijk_idx += nst_per_block) { + int ksh_in_auxmol = ijk_idx % nksh; + int ksh = ksh_in_auxmol + bounds.ksh0; + int shl_pair_idx = ijk_idx 
/ nksh; + int bas_ij = bounds.bas_ij_idx[shl_pair_idx]; + int ish = bas_ij / nbas; + int jsh = bas_ij % nbas; + double *expi = env + bas[ish*BAS_SLOTS+PTR_EXP]; + double *expj = env + bas[jsh*BAS_SLOTS+PTR_EXP]; + double *expk = env + bas[ksh*BAS_SLOTS+PTR_EXP]; + double *ci = env + bas[ish*BAS_SLOTS+PTR_COEFF]; + double *cj = env + bas[jsh*BAS_SLOTS+PTR_COEFF]; + double *ck = env + bas[ksh*BAS_SLOTS+PTR_COEFF]; + double *ri = env + bas[ish*BAS_SLOTS+PTR_BAS_COORD]; + double *rj = env + bas[jsh*BAS_SLOTS+PTR_BAS_COORD]; + double *rk = env + bas[ksh*BAS_SLOTS+PTR_BAS_COORD]; + double xi = ri[0]; + double yi = ri[1]; + double zi = ri[2]; + double xk = rk[0]; + double yk = rk[1]; + double zk = rk[2]; + double xjxi = rj[0] - xi; + double yjyi = rj[1] - yi; + double zjzi = rj[2] - zi; + double rr_ij = xjxi*xjxi + yjyi*yjyi + zjzi*zjzi; + double gout0 = 0; + double gout1 = 0; + double gout2 = 0; + for (int ijkp = 0; ijkp < ijkprim; ++ijkp) { + int ijp = ijkp / kprim; + int kp = ijkp % kprim; + int ip = ijp / jprim; + int jp = ijp % jprim; + double ai = expi[ip]; + double aj = expj[jp]; + double ak = expk[kp]; + double aij = ai + aj; + double cijk = ci[ip] * cj[jp] * ck[kp]; + double fac = PI_FAC * cijk / (aij*ak*sqrt(aij+ak)); + double aj_aij = aj / aij; + double theta_ij = ai * aj_aij; + double Kab = theta_ij * rr_ij; + double fac1 = fac * exp(-Kab); + double xij = xjxi * aj_aij + xi; + double yij = yjyi * aj_aij + yi; + double zij = zjzi * aj_aij + zi; + double xpq = xij - xk; + double ypq = yij - yk; + double zpq = zij - zk; + double rr = xpq * xpq + ypq * ypq + zpq * zpq; + double theta = aij * ak / (aij + ak); + double theta_rr = theta * rr; + if (omega == 0) { + rys_roots(1, theta_rr, rw, nst_per_block, 0, 1); + } else if (omega > 0) { + double omega2 = omega * omega; + double theta_fac = omega2 / (omega2 + theta); + rys_roots(1, theta_fac*theta_rr, rw, nst_per_block, 0, 1); + double sqrt_theta_fac = sqrt(theta_fac); + for (int irys = 0; irys < 1; ++irys) { + rw[ 
irys*2 *nst_per_block] *= theta_fac; + rw[(irys*2+1)*nst_per_block] *= sqrt_theta_fac; + } + } else { + double omega2 = omega * omega; + double theta_fac = omega2 / (omega2 + theta); + double *rw1 = rw + 2*nst_per_block; + rys_roots(1, theta_rr, rw1, nst_per_block, 0, 1); + rys_roots(1, theta_fac*theta_rr, rw, nst_per_block, 0, 1); + double sqrt_theta_fac = -sqrt(theta_fac); + for (int irys = 0; irys < 1; ++irys) { + rw[ irys*2 *nst_per_block] *= theta_fac; + rw[(irys*2+1)*nst_per_block] *= sqrt_theta_fac; + } + } + for (int irys = 0; irys < bounds.nroots; ++irys) { + double wt = rw[(2*irys+1)*nst_per_block]; + double rt = rw[ 2*irys *nst_per_block]; + double rt_aa = rt / (aij + ak); + double rt_ak = rt_aa * aij; + double cpx = xpq*rt_ak; + double trr_01x = cpx * 1; + gout0 += trr_01x * fac1 * wt; + double cpy = ypq*rt_ak; + double trr_01y = cpy * fac1; + gout1 += 1 * trr_01y * wt; + double cpz = zpq*rt_ak; + double trr_01z = cpz * wt; + gout2 += 1 * fac1 * trr_01z; + } + } + int naux = bounds.naux; + double *eri_tensor = out + shl_pair_idx * 1 * naux + ksh_in_auxmol * 3; + eri_tensor[0*naux + 0] = gout0; + eri_tensor[0*naux + 1] = gout1; + eri_tensor[0*naux + 2] = gout2; + } +} + +#if CUDA_VERSION >= 12040 +__global__ __maxnreg__(128) +#else +__global__ +#endif +void int3c2e_101(double *out, Int3c2eEnvVars envs, Int3c2eBounds bounds) +{ + int nst_per_block = blockDim.x; + int st_id = threadIdx.x; + int batch_id = blockIdx.x; + int iprim = bounds.iprim; + int jprim = bounds.jprim; + int kprim = bounds.kprim; + int ijprim = iprim * jprim; + int ijkprim = ijprim * kprim; + int *bas = envs.bas; + int nbas = envs.nbas; + double *env = envs.env; + double omega = env[PTR_RANGE_OMEGA]; + extern __shared__ double rw_cache[]; + double *rw = rw_cache + st_id; + + int nksh = bounds.nksh; + int nshl_pair = bounds.nshl_pair; + int nst = nksh * nshl_pair; + int st0 = batch_id * nst_per_block * BATCHES_PER_BLOCK; + int st1 = MIN(nst, st0 + nst_per_block * BATCHES_PER_BLOCK); + 
for (int ijk_idx = st0 + st_id; ijk_idx < st1; ijk_idx += nst_per_block) { + int ksh_in_auxmol = ijk_idx % nksh; + int ksh = ksh_in_auxmol + bounds.ksh0; + int shl_pair_idx = ijk_idx / nksh; + int bas_ij = bounds.bas_ij_idx[shl_pair_idx]; + int ish = bas_ij / nbas; + int jsh = bas_ij % nbas; + double *expi = env + bas[ish*BAS_SLOTS+PTR_EXP]; + double *expj = env + bas[jsh*BAS_SLOTS+PTR_EXP]; + double *expk = env + bas[ksh*BAS_SLOTS+PTR_EXP]; + double *ci = env + bas[ish*BAS_SLOTS+PTR_COEFF]; + double *cj = env + bas[jsh*BAS_SLOTS+PTR_COEFF]; + double *ck = env + bas[ksh*BAS_SLOTS+PTR_COEFF]; + double *ri = env + bas[ish*BAS_SLOTS+PTR_BAS_COORD]; + double *rj = env + bas[jsh*BAS_SLOTS+PTR_BAS_COORD]; + double *rk = env + bas[ksh*BAS_SLOTS+PTR_BAS_COORD]; + double xi = ri[0]; + double yi = ri[1]; + double zi = ri[2]; + double xk = rk[0]; + double yk = rk[1]; + double zk = rk[2]; + double xjxi = rj[0] - xi; + double yjyi = rj[1] - yi; + double zjzi = rj[2] - zi; + double rr_ij = xjxi*xjxi + yjyi*yjyi + zjzi*zjzi; + double gout0 = 0; + double gout1 = 0; + double gout2 = 0; + double gout3 = 0; + double gout4 = 0; + double gout5 = 0; + double gout6 = 0; + double gout7 = 0; + double gout8 = 0; + for (int ijkp = 0; ijkp < ijkprim; ++ijkp) { + int ijp = ijkp / kprim; + int kp = ijkp % kprim; + int ip = ijp / jprim; + int jp = ijp % jprim; + double ai = expi[ip]; + double aj = expj[jp]; + double ak = expk[kp]; + double aij = ai + aj; + double cijk = ci[ip] * cj[jp] * ck[kp]; + double fac = PI_FAC * cijk / (aij*ak*sqrt(aij+ak)); + double aj_aij = aj / aij; + double theta_ij = ai * aj_aij; + double Kab = theta_ij * rr_ij; + double fac1 = fac * exp(-Kab); + double xij = xjxi * aj_aij + xi; + double yij = yjyi * aj_aij + yi; + double zij = zjzi * aj_aij + zi; + double xpq = xij - xk; + double ypq = yij - yk; + double zpq = zij - zk; + double rr = xpq * xpq + ypq * ypq + zpq * zpq; + double theta = aij * ak / (aij + ak); + double theta_rr = theta * rr; + if (omega == 0) { + 
rys_roots(2, theta_rr, rw, nst_per_block, 0, 1); + } else if (omega > 0) { + double omega2 = omega * omega; + double theta_fac = omega2 / (omega2 + theta); + rys_roots(2, theta_fac*theta_rr, rw, nst_per_block, 0, 1); + double sqrt_theta_fac = sqrt(theta_fac); + for (int irys = 0; irys < 2; ++irys) { + rw[ irys*2 *nst_per_block] *= theta_fac; + rw[(irys*2+1)*nst_per_block] *= sqrt_theta_fac; + } + } else { + double omega2 = omega * omega; + double theta_fac = omega2 / (omega2 + theta); + double *rw1 = rw + 4*nst_per_block; + rys_roots(2, theta_rr, rw1, nst_per_block, 0, 1); + rys_roots(2, theta_fac*theta_rr, rw, nst_per_block, 0, 1); + double sqrt_theta_fac = -sqrt(theta_fac); + for (int irys = 0; irys < 2; ++irys) { + rw[ irys*2 *nst_per_block] *= theta_fac; + rw[(irys*2+1)*nst_per_block] *= sqrt_theta_fac; + } + } + for (int irys = 0; irys < bounds.nroots; ++irys) { + double wt = rw[(2*irys+1)*nst_per_block]; + double rt = rw[ 2*irys *nst_per_block]; + double rt_aa = rt / (aij + ak); + double b00 = .5 * rt_aa; + double rt_ak = rt_aa * aij; + double cpx = xpq*rt_ak; + double rt_aij = rt_aa * ak; + double c0x = xjxi * aj_aij - xpq*rt_aij; + double trr_10x = c0x * 1; + double trr_11x = cpx * trr_10x + 1*b00 * 1; + gout0 += trr_11x * fac1 * wt; + double trr_01x = cpx * 1; + double c0y = yjyi * aj_aij - ypq*rt_aij; + double trr_10y = c0y * fac1; + gout1 += trr_01x * trr_10y * wt; + double c0z = zjzi * aj_aij - zpq*rt_aij; + double trr_10z = c0z * wt; + gout2 += trr_01x * fac1 * trr_10z; + double cpy = ypq*rt_ak; + double trr_01y = cpy * fac1; + gout3 += trr_10x * trr_01y * wt; + double trr_11y = cpy * trr_10y + 1*b00 * fac1; + gout4 += 1 * trr_11y * wt; + gout5 += 1 * trr_01y * trr_10z; + double cpz = zpq*rt_ak; + double trr_01z = cpz * wt; + gout6 += trr_10x * fac1 * trr_01z; + gout7 += 1 * trr_10y * trr_01z; + double trr_11z = cpz * trr_10z + 1*b00 * wt; + gout8 += 1 * fac1 * trr_11z; + } + } + int naux = bounds.naux; + double *eri_tensor = out + shl_pair_idx * 3 * 
naux + ksh_in_auxmol * 3; + eri_tensor[0*naux + 0] = gout0; + eri_tensor[0*naux + 1] = gout3; + eri_tensor[0*naux + 2] = gout6; + eri_tensor[1*naux + 0] = gout1; + eri_tensor[1*naux + 1] = gout4; + eri_tensor[1*naux + 2] = gout7; + eri_tensor[2*naux + 0] = gout2; + eri_tensor[2*naux + 1] = gout5; + eri_tensor[2*naux + 2] = gout8; + } +} + +__global__ +void int3c2e_111(double *out, Int3c2eEnvVars envs, Int3c2eBounds bounds) +{ + int nst_per_block = blockDim.x; + int st_id = threadIdx.x; + int batch_id = blockIdx.x; + int iprim = bounds.iprim; + int jprim = bounds.jprim; + int kprim = bounds.kprim; + int ijprim = iprim * jprim; + int ijkprim = ijprim * kprim; + int *bas = envs.bas; + int nbas = envs.nbas; + double *env = envs.env; + double omega = env[PTR_RANGE_OMEGA]; + extern __shared__ double rw_cache[]; + double *rw = rw_cache + st_id; + + int nksh = bounds.nksh; + int nshl_pair = bounds.nshl_pair; + int nst = nksh * nshl_pair; + int st0 = batch_id * nst_per_block * BATCHES_PER_BLOCK; + int st1 = MIN(nst, st0 + nst_per_block * BATCHES_PER_BLOCK); + for (int ijk_idx = st0 + st_id; ijk_idx < st1; ijk_idx += nst_per_block) { + int ksh_in_auxmol = ijk_idx % nksh; + int ksh = ksh_in_auxmol + bounds.ksh0; + int shl_pair_idx = ijk_idx / nksh; + int bas_ij = bounds.bas_ij_idx[shl_pair_idx]; + int ish = bas_ij / nbas; + int jsh = bas_ij % nbas; + double *expi = env + bas[ish*BAS_SLOTS+PTR_EXP]; + double *expj = env + bas[jsh*BAS_SLOTS+PTR_EXP]; + double *expk = env + bas[ksh*BAS_SLOTS+PTR_EXP]; + double *ci = env + bas[ish*BAS_SLOTS+PTR_COEFF]; + double *cj = env + bas[jsh*BAS_SLOTS+PTR_COEFF]; + double *ck = env + bas[ksh*BAS_SLOTS+PTR_COEFF]; + double *ri = env + bas[ish*BAS_SLOTS+PTR_BAS_COORD]; + double *rj = env + bas[jsh*BAS_SLOTS+PTR_BAS_COORD]; + double *rk = env + bas[ksh*BAS_SLOTS+PTR_BAS_COORD]; + double xi = ri[0]; + double yi = ri[1]; + double zi = ri[2]; + double xk = rk[0]; + double yk = rk[1]; + double zk = rk[2]; + double xjxi = rj[0] - xi; + double yjyi 
= rj[1] - yi; + double zjzi = rj[2] - zi; + double rr_ij = xjxi*xjxi + yjyi*yjyi + zjzi*zjzi; + double gout0 = 0; + double gout1 = 0; + double gout2 = 0; + double gout3 = 0; + double gout4 = 0; + double gout5 = 0; + double gout6 = 0; + double gout7 = 0; + double gout8 = 0; + double gout9 = 0; + double gout10 = 0; + double gout11 = 0; + double gout12 = 0; + double gout13 = 0; + double gout14 = 0; + double gout15 = 0; + double gout16 = 0; + double gout17 = 0; + double gout18 = 0; + double gout19 = 0; + double gout20 = 0; + double gout21 = 0; + double gout22 = 0; + double gout23 = 0; + double gout24 = 0; + double gout25 = 0; + double gout26 = 0; + for (int ijkp = 0; ijkp < ijkprim; ++ijkp) { + int ijp = ijkp / kprim; + int kp = ijkp % kprim; + int ip = ijp / jprim; + int jp = ijp % jprim; + double ai = expi[ip]; + double aj = expj[jp]; + double ak = expk[kp]; + double aij = ai + aj; + double cijk = ci[ip] * cj[jp] * ck[kp]; + double fac = PI_FAC * cijk / (aij*ak*sqrt(aij+ak)); + double aj_aij = aj / aij; + double theta_ij = ai * aj_aij; + double Kab = theta_ij * rr_ij; + double fac1 = fac * exp(-Kab); + double xij = xjxi * aj_aij + xi; + double yij = yjyi * aj_aij + yi; + double zij = zjzi * aj_aij + zi; + double xpq = xij - xk; + double ypq = yij - yk; + double zpq = zij - zk; + double rr = xpq * xpq + ypq * ypq + zpq * zpq; + double theta = aij * ak / (aij + ak); + double theta_rr = theta * rr; + if (omega == 0) { + rys_roots(2, theta_rr, rw, nst_per_block, 0, 1); + } else if (omega > 0) { + double omega2 = omega * omega; + double theta_fac = omega2 / (omega2 + theta); + rys_roots(2, theta_fac*theta_rr, rw, nst_per_block, 0, 1); + double sqrt_theta_fac = sqrt(theta_fac); + for (int irys = 0; irys < 2; ++irys) { + rw[ irys*2 *nst_per_block] *= theta_fac; + rw[(irys*2+1)*nst_per_block] *= sqrt_theta_fac; + } + } else { + double omega2 = omega * omega; + double theta_fac = omega2 / (omega2 + theta); + double *rw1 = rw + 4*nst_per_block; + rys_roots(2, theta_rr, rw1, 
nst_per_block, 0, 1); + rys_roots(2, theta_fac*theta_rr, rw, nst_per_block, 0, 1); + double sqrt_theta_fac = -sqrt(theta_fac); + for (int irys = 0; irys < 2; ++irys) { + rw[ irys*2 *nst_per_block] *= theta_fac; + rw[(irys*2+1)*nst_per_block] *= sqrt_theta_fac; + } + } + for (int irys = 0; irys < bounds.nroots; ++irys) { + double wt = rw[(2*irys+1)*nst_per_block]; + double rt = rw[ 2*irys *nst_per_block]; + double rt_aa = rt / (aij + ak); + double b00 = .5 * rt_aa; + double rt_ak = rt_aa * aij; + double cpx = xpq*rt_ak; + double rt_aij = rt_aa * ak; + double b10 = .5/aij * (1 - rt_aij); + double c0x = xjxi * aj_aij - xpq*rt_aij; + double trr_10x = c0x * 1; + double trr_20x = c0x * trr_10x + 1*b10 * 1; + double trr_21x = cpx * trr_20x + 2*b00 * trr_10x; + double trr_11x = cpx * trr_10x + 1*b00 * 1; + double hrr_111x = trr_21x - xjxi * trr_11x; + gout0 += hrr_111x * fac1 * wt; + double trr_01x = cpx * 1; + double hrr_011x = trr_11x - xjxi * trr_01x; + double c0y = yjyi * aj_aij - ypq*rt_aij; + double trr_10y = c0y * fac1; + gout1 += hrr_011x * trr_10y * wt; + double c0z = zjzi * aj_aij - zpq*rt_aij; + double trr_10z = c0z * wt; + gout2 += hrr_011x * fac1 * trr_10z; + double hrr_010y = trr_10y - yjyi * fac1; + gout3 += trr_11x * hrr_010y * wt; + double trr_20y = c0y * trr_10y + 1*b10 * fac1; + double hrr_110y = trr_20y - yjyi * trr_10y; + gout4 += trr_01x * hrr_110y * wt; + gout5 += trr_01x * hrr_010y * trr_10z; + double hrr_010z = trr_10z - zjzi * wt; + gout6 += trr_11x * fac1 * hrr_010z; + gout7 += trr_01x * trr_10y * hrr_010z; + double trr_20z = c0z * trr_10z + 1*b10 * wt; + double hrr_110z = trr_20z - zjzi * trr_10z; + gout8 += trr_01x * fac1 * hrr_110z; + double hrr_110x = trr_20x - xjxi * trr_10x; + double cpy = ypq*rt_ak; + double trr_01y = cpy * fac1; + gout9 += hrr_110x * trr_01y * wt; + double hrr_010x = trr_10x - xjxi * 1; + double trr_11y = cpy * trr_10y + 1*b00 * fac1; + gout10 += hrr_010x * trr_11y * wt; + gout11 += hrr_010x * trr_01y * trr_10z; + double 
hrr_011y = trr_11y - yjyi * trr_01y; + gout12 += trr_10x * hrr_011y * wt; + double trr_21y = cpy * trr_20y + 2*b00 * trr_10y; + double hrr_111y = trr_21y - yjyi * trr_11y; + gout13 += 1 * hrr_111y * wt; + gout14 += 1 * hrr_011y * trr_10z; + gout15 += trr_10x * trr_01y * hrr_010z; + gout16 += 1 * trr_11y * hrr_010z; + gout17 += 1 * trr_01y * hrr_110z; + double cpz = zpq*rt_ak; + double trr_01z = cpz * wt; + gout18 += hrr_110x * fac1 * trr_01z; + gout19 += hrr_010x * trr_10y * trr_01z; + double trr_11z = cpz * trr_10z + 1*b00 * wt; + gout20 += hrr_010x * fac1 * trr_11z; + gout21 += trr_10x * hrr_010y * trr_01z; + gout22 += 1 * hrr_110y * trr_01z; + gout23 += 1 * hrr_010y * trr_11z; + double hrr_011z = trr_11z - zjzi * trr_01z; + gout24 += trr_10x * fac1 * hrr_011z; + gout25 += 1 * trr_10y * hrr_011z; + double trr_21z = cpz * trr_20z + 2*b00 * trr_10z; + double hrr_111z = trr_21z - zjzi * trr_11z; + gout26 += 1 * fac1 * hrr_111z; + } + } + int naux = bounds.naux; + double *eri_tensor = out + shl_pair_idx * 9 * naux + ksh_in_auxmol * 3; + eri_tensor[0*naux + 0] = gout0; + eri_tensor[0*naux + 1] = gout9; + eri_tensor[0*naux + 2] = gout18; + eri_tensor[1*naux + 0] = gout1; + eri_tensor[1*naux + 1] = gout10; + eri_tensor[1*naux + 2] = gout19; + eri_tensor[2*naux + 0] = gout2; + eri_tensor[2*naux + 1] = gout11; + eri_tensor[2*naux + 2] = gout20; + eri_tensor[3*naux + 0] = gout3; + eri_tensor[3*naux + 1] = gout12; + eri_tensor[3*naux + 2] = gout21; + eri_tensor[4*naux + 0] = gout4; + eri_tensor[4*naux + 1] = gout13; + eri_tensor[4*naux + 2] = gout22; + eri_tensor[5*naux + 0] = gout5; + eri_tensor[5*naux + 1] = gout14; + eri_tensor[5*naux + 2] = gout23; + eri_tensor[6*naux + 0] = gout6; + eri_tensor[6*naux + 1] = gout15; + eri_tensor[6*naux + 2] = gout24; + eri_tensor[7*naux + 0] = gout7; + eri_tensor[7*naux + 1] = gout16; + eri_tensor[7*naux + 2] = gout25; + eri_tensor[8*naux + 0] = gout8; + eri_tensor[8*naux + 1] = gout17; + eri_tensor[8*naux + 2] = gout26; + } +} + +#if 
CUDA_VERSION >= 12040 +__global__ __maxnreg__(128) +#else +__global__ +#endif +void int3c2e_201(double *out, Int3c2eEnvVars envs, Int3c2eBounds bounds) +{ + int nst_per_block = blockDim.x; + int st_id = threadIdx.x; + int batch_id = blockIdx.x; + int iprim = bounds.iprim; + int jprim = bounds.jprim; + int kprim = bounds.kprim; + int ijprim = iprim * jprim; + int ijkprim = ijprim * kprim; + int *bas = envs.bas; + int nbas = envs.nbas; + double *env = envs.env; + double omega = env[PTR_RANGE_OMEGA]; + extern __shared__ double rw_cache[]; + double *rw = rw_cache + st_id; + + int nksh = bounds.nksh; + int nshl_pair = bounds.nshl_pair; + int nst = nksh * nshl_pair; + int st0 = batch_id * nst_per_block * BATCHES_PER_BLOCK; + int st1 = MIN(nst, st0 + nst_per_block * BATCHES_PER_BLOCK); + for (int ijk_idx = st0 + st_id; ijk_idx < st1; ijk_idx += nst_per_block) { + int ksh_in_auxmol = ijk_idx % nksh; + int ksh = ksh_in_auxmol + bounds.ksh0; + int shl_pair_idx = ijk_idx / nksh; + int bas_ij = bounds.bas_ij_idx[shl_pair_idx]; + int ish = bas_ij / nbas; + int jsh = bas_ij % nbas; + double *expi = env + bas[ish*BAS_SLOTS+PTR_EXP]; + double *expj = env + bas[jsh*BAS_SLOTS+PTR_EXP]; + double *expk = env + bas[ksh*BAS_SLOTS+PTR_EXP]; + double *ci = env + bas[ish*BAS_SLOTS+PTR_COEFF]; + double *cj = env + bas[jsh*BAS_SLOTS+PTR_COEFF]; + double *ck = env + bas[ksh*BAS_SLOTS+PTR_COEFF]; + double *ri = env + bas[ish*BAS_SLOTS+PTR_BAS_COORD]; + double *rj = env + bas[jsh*BAS_SLOTS+PTR_BAS_COORD]; + double *rk = env + bas[ksh*BAS_SLOTS+PTR_BAS_COORD]; + double xi = ri[0]; + double yi = ri[1]; + double zi = ri[2]; + double xk = rk[0]; + double yk = rk[1]; + double zk = rk[2]; + double xjxi = rj[0] - xi; + double yjyi = rj[1] - yi; + double zjzi = rj[2] - zi; + double rr_ij = xjxi*xjxi + yjyi*yjyi + zjzi*zjzi; + double gout0 = 0; + double gout1 = 0; + double gout2 = 0; + double gout3 = 0; + double gout4 = 0; + double gout5 = 0; + double gout6 = 0; + double gout7 = 0; + double gout8 = 0; + 
double gout9 = 0; + double gout10 = 0; + double gout11 = 0; + double gout12 = 0; + double gout13 = 0; + double gout14 = 0; + double gout15 = 0; + double gout16 = 0; + double gout17 = 0; + for (int ijkp = 0; ijkp < ijkprim; ++ijkp) { + int ijp = ijkp / kprim; + int kp = ijkp % kprim; + int ip = ijp / jprim; + int jp = ijp % jprim; + double ai = expi[ip]; + double aj = expj[jp]; + double ak = expk[kp]; + double aij = ai + aj; + double cijk = ci[ip] * cj[jp] * ck[kp]; + double fac = PI_FAC * cijk / (aij*ak*sqrt(aij+ak)); + double aj_aij = aj / aij; + double theta_ij = ai * aj_aij; + double Kab = theta_ij * rr_ij; + double fac1 = fac * exp(-Kab); + double xij = xjxi * aj_aij + xi; + double yij = yjyi * aj_aij + yi; + double zij = zjzi * aj_aij + zi; + double xpq = xij - xk; + double ypq = yij - yk; + double zpq = zij - zk; + double rr = xpq * xpq + ypq * ypq + zpq * zpq; + double theta = aij * ak / (aij + ak); + double theta_rr = theta * rr; + if (omega == 0) { + rys_roots(2, theta_rr, rw, nst_per_block, 0, 1); + } else if (omega > 0) { + double omega2 = omega * omega; + double theta_fac = omega2 / (omega2 + theta); + rys_roots(2, theta_fac*theta_rr, rw, nst_per_block, 0, 1); + double sqrt_theta_fac = sqrt(theta_fac); + for (int irys = 0; irys < 2; ++irys) { + rw[ irys*2 *nst_per_block] *= theta_fac; + rw[(irys*2+1)*nst_per_block] *= sqrt_theta_fac; + } + } else { + double omega2 = omega * omega; + double theta_fac = omega2 / (omega2 + theta); + double *rw1 = rw + 4*nst_per_block; + rys_roots(2, theta_rr, rw1, nst_per_block, 0, 1); + rys_roots(2, theta_fac*theta_rr, rw, nst_per_block, 0, 1); + double sqrt_theta_fac = -sqrt(theta_fac); + for (int irys = 0; irys < 2; ++irys) { + rw[ irys*2 *nst_per_block] *= theta_fac; + rw[(irys*2+1)*nst_per_block] *= sqrt_theta_fac; + } + } + for (int irys = 0; irys < bounds.nroots; ++irys) { + double wt = rw[(2*irys+1)*nst_per_block]; + double rt = rw[ 2*irys *nst_per_block]; + double rt_aa = rt / (aij + ak); + double b00 = .5 * 
rt_aa; + double rt_ak = rt_aa * aij; + double cpx = xpq*rt_ak; + double rt_aij = rt_aa * ak; + double b10 = .5/aij * (1 - rt_aij); + double c0x = xjxi * aj_aij - xpq*rt_aij; + double trr_10x = c0x * 1; + double trr_20x = c0x * trr_10x + 1*b10 * 1; + double trr_21x = cpx * trr_20x + 2*b00 * trr_10x; + gout0 += trr_21x * fac1 * wt; + double trr_11x = cpx * trr_10x + 1*b00 * 1; + double c0y = yjyi * aj_aij - ypq*rt_aij; + double trr_10y = c0y * fac1; + gout1 += trr_11x * trr_10y * wt; + double c0z = zjzi * aj_aij - zpq*rt_aij; + double trr_10z = c0z * wt; + gout2 += trr_11x * fac1 * trr_10z; + double trr_01x = cpx * 1; + double trr_20y = c0y * trr_10y + 1*b10 * fac1; + gout3 += trr_01x * trr_20y * wt; + gout4 += trr_01x * trr_10y * trr_10z; + double trr_20z = c0z * trr_10z + 1*b10 * wt; + gout5 += trr_01x * fac1 * trr_20z; + double cpy = ypq*rt_ak; + double trr_01y = cpy * fac1; + gout6 += trr_20x * trr_01y * wt; + double trr_11y = cpy * trr_10y + 1*b00 * fac1; + gout7 += trr_10x * trr_11y * wt; + gout8 += trr_10x * trr_01y * trr_10z; + double trr_21y = cpy * trr_20y + 2*b00 * trr_10y; + gout9 += 1 * trr_21y * wt; + gout10 += 1 * trr_11y * trr_10z; + gout11 += 1 * trr_01y * trr_20z; + double cpz = zpq*rt_ak; + double trr_01z = cpz * wt; + gout12 += trr_20x * fac1 * trr_01z; + gout13 += trr_10x * trr_10y * trr_01z; + double trr_11z = cpz * trr_10z + 1*b00 * wt; + gout14 += trr_10x * fac1 * trr_11z; + gout15 += 1 * trr_20y * trr_01z; + gout16 += 1 * trr_10y * trr_11z; + double trr_21z = cpz * trr_20z + 2*b00 * trr_10z; + gout17 += 1 * fac1 * trr_21z; + } + } + int naux = bounds.naux; + double *eri_tensor = out + shl_pair_idx * 6 * naux + ksh_in_auxmol * 3; + eri_tensor[0*naux + 0] = gout0; + eri_tensor[0*naux + 1] = gout6; + eri_tensor[0*naux + 2] = gout12; + eri_tensor[1*naux + 0] = gout1; + eri_tensor[1*naux + 1] = gout7; + eri_tensor[1*naux + 2] = gout13; + eri_tensor[2*naux + 0] = gout2; + eri_tensor[2*naux + 1] = gout8; + eri_tensor[2*naux + 2] = gout14; + 
eri_tensor[3*naux + 0] = gout3; + eri_tensor[3*naux + 1] = gout9; + eri_tensor[3*naux + 2] = gout15; + eri_tensor[4*naux + 0] = gout4; + eri_tensor[4*naux + 1] = gout10; + eri_tensor[4*naux + 2] = gout16; + eri_tensor[5*naux + 0] = gout5; + eri_tensor[5*naux + 1] = gout11; + eri_tensor[5*naux + 2] = gout17; + } +} + +__global__ +void int3c2e_211(double *out, Int3c2eEnvVars envs, Int3c2eBounds bounds) +{ + int nst_per_block = blockDim.x; + int st_id = threadIdx.x; + int batch_id = blockIdx.x; + int iprim = bounds.iprim; + int jprim = bounds.jprim; + int kprim = bounds.kprim; + int ijprim = iprim * jprim; + int ijkprim = ijprim * kprim; + int *bas = envs.bas; + int nbas = envs.nbas; + double *env = envs.env; + double omega = env[PTR_RANGE_OMEGA]; + extern __shared__ double rw_cache[]; + double *rw = rw_cache + st_id; + + int nksh = bounds.nksh; + int nshl_pair = bounds.nshl_pair; + int nst = nksh * nshl_pair; + int st0 = batch_id * nst_per_block * BATCHES_PER_BLOCK; + int st1 = MIN(nst, st0 + nst_per_block * BATCHES_PER_BLOCK); + for (int ijk_idx = st0 + st_id; ijk_idx < st1; ijk_idx += nst_per_block) { + int ksh_in_auxmol = ijk_idx % nksh; + int ksh = ksh_in_auxmol + bounds.ksh0; + int shl_pair_idx = ijk_idx / nksh; + int bas_ij = bounds.bas_ij_idx[shl_pair_idx]; + int ish = bas_ij / nbas; + int jsh = bas_ij % nbas; + double *expi = env + bas[ish*BAS_SLOTS+PTR_EXP]; + double *expj = env + bas[jsh*BAS_SLOTS+PTR_EXP]; + double *expk = env + bas[ksh*BAS_SLOTS+PTR_EXP]; + double *ci = env + bas[ish*BAS_SLOTS+PTR_COEFF]; + double *cj = env + bas[jsh*BAS_SLOTS+PTR_COEFF]; + double *ck = env + bas[ksh*BAS_SLOTS+PTR_COEFF]; + double *ri = env + bas[ish*BAS_SLOTS+PTR_BAS_COORD]; + double *rj = env + bas[jsh*BAS_SLOTS+PTR_BAS_COORD]; + double *rk = env + bas[ksh*BAS_SLOTS+PTR_BAS_COORD]; + double xi = ri[0]; + double yi = ri[1]; + double zi = ri[2]; + double xk = rk[0]; + double yk = rk[1]; + double zk = rk[2]; + double xjxi = rj[0] - xi; + double yjyi = rj[1] - yi; + double 
zjzi = rj[2] - zi; + double rr_ij = xjxi*xjxi + yjyi*yjyi + zjzi*zjzi; + double gout0 = 0; + double gout1 = 0; + double gout2 = 0; + double gout3 = 0; + double gout4 = 0; + double gout5 = 0; + double gout6 = 0; + double gout7 = 0; + double gout8 = 0; + double gout9 = 0; + double gout10 = 0; + double gout11 = 0; + double gout12 = 0; + double gout13 = 0; + double gout14 = 0; + double gout15 = 0; + double gout16 = 0; + double gout17 = 0; + double gout18 = 0; + double gout19 = 0; + double gout20 = 0; + double gout21 = 0; + double gout22 = 0; + double gout23 = 0; + double gout24 = 0; + double gout25 = 0; + double gout26 = 0; + double gout27 = 0; + double gout28 = 0; + double gout29 = 0; + double gout30 = 0; + double gout31 = 0; + double gout32 = 0; + double gout33 = 0; + double gout34 = 0; + double gout35 = 0; + double gout36 = 0; + double gout37 = 0; + double gout38 = 0; + double gout39 = 0; + double gout40 = 0; + double gout41 = 0; + double gout42 = 0; + double gout43 = 0; + double gout44 = 0; + double gout45 = 0; + double gout46 = 0; + double gout47 = 0; + double gout48 = 0; + double gout49 = 0; + double gout50 = 0; + double gout51 = 0; + double gout52 = 0; + double gout53 = 0; + for (int ijkp = 0; ijkp < ijkprim; ++ijkp) { + int ijp = ijkp / kprim; + int kp = ijkp % kprim; + int ip = ijp / jprim; + int jp = ijp % jprim; + double ai = expi[ip]; + double aj = expj[jp]; + double ak = expk[kp]; + double aij = ai + aj; + double cijk = ci[ip] * cj[jp] * ck[kp]; + double fac = PI_FAC * cijk / (aij*ak*sqrt(aij+ak)); + double aj_aij = aj / aij; + double theta_ij = ai * aj_aij; + double Kab = theta_ij * rr_ij; + double fac1 = fac * exp(-Kab); + double xij = xjxi * aj_aij + xi; + double yij = yjyi * aj_aij + yi; + double zij = zjzi * aj_aij + zi; + double xpq = xij - xk; + double ypq = yij - yk; + double zpq = zij - zk; + double rr = xpq * xpq + ypq * ypq + zpq * zpq; + double theta = aij * ak / (aij + ak); + double theta_rr = theta * rr; + if (omega == 0) { + rys_roots(3, 
theta_rr, rw, nst_per_block, 0, 1); + } else if (omega > 0) { + double omega2 = omega * omega; + double theta_fac = omega2 / (omega2 + theta); + rys_roots(3, theta_fac*theta_rr, rw, nst_per_block, 0, 1); + double sqrt_theta_fac = sqrt(theta_fac); + for (int irys = 0; irys < 3; ++irys) { + rw[ irys*2 *nst_per_block] *= theta_fac; + rw[(irys*2+1)*nst_per_block] *= sqrt_theta_fac; + } + } else { + double omega2 = omega * omega; + double theta_fac = omega2 / (omega2 + theta); + double *rw1 = rw + 6*nst_per_block; + rys_roots(3, theta_rr, rw1, nst_per_block, 0, 1); + rys_roots(3, theta_fac*theta_rr, rw, nst_per_block, 0, 1); + double sqrt_theta_fac = -sqrt(theta_fac); + for (int irys = 0; irys < 3; ++irys) { + rw[ irys*2 *nst_per_block] *= theta_fac; + rw[(irys*2+1)*nst_per_block] *= sqrt_theta_fac; + } + } + for (int irys = 0; irys < bounds.nroots; ++irys) { + double wt = rw[(2*irys+1)*nst_per_block]; + double rt = rw[ 2*irys *nst_per_block]; + double rt_aa = rt / (aij + ak); + double b00 = .5 * rt_aa; + double rt_ak = rt_aa * aij; + double cpx = xpq*rt_ak; + double rt_aij = rt_aa * ak; + double b10 = .5/aij * (1 - rt_aij); + double c0x = xjxi * aj_aij - xpq*rt_aij; + double trr_10x = c0x * 1; + double trr_20x = c0x * trr_10x + 1*b10 * 1; + double trr_30x = c0x * trr_20x + 2*b10 * trr_10x; + double trr_31x = cpx * trr_30x + 3*b00 * trr_20x; + double trr_21x = cpx * trr_20x + 2*b00 * trr_10x; + double hrr_211x = trr_31x - xjxi * trr_21x; + gout0 += hrr_211x * fac1 * wt; + double trr_11x = cpx * trr_10x + 1*b00 * 1; + double hrr_111x = trr_21x - xjxi * trr_11x; + double c0y = yjyi * aj_aij - ypq*rt_aij; + double trr_10y = c0y * fac1; + gout1 += hrr_111x * trr_10y * wt; + double c0z = zjzi * aj_aij - zpq*rt_aij; + double trr_10z = c0z * wt; + gout2 += hrr_111x * fac1 * trr_10z; + double trr_01x = cpx * 1; + double hrr_011x = trr_11x - xjxi * trr_01x; + double trr_20y = c0y * trr_10y + 1*b10 * fac1; + gout3 += hrr_011x * trr_20y * wt; + gout4 += hrr_011x * trr_10y * 
trr_10z; + double trr_20z = c0z * trr_10z + 1*b10 * wt; + gout5 += hrr_011x * fac1 * trr_20z; + double hrr_010y = trr_10y - yjyi * fac1; + gout6 += trr_21x * hrr_010y * wt; + double hrr_110y = trr_20y - yjyi * trr_10y; + gout7 += trr_11x * hrr_110y * wt; + gout8 += trr_11x * hrr_010y * trr_10z; + double trr_30y = c0y * trr_20y + 2*b10 * trr_10y; + double hrr_210y = trr_30y - yjyi * trr_20y; + gout9 += trr_01x * hrr_210y * wt; + gout10 += trr_01x * hrr_110y * trr_10z; + gout11 += trr_01x * hrr_010y * trr_20z; + double hrr_010z = trr_10z - zjzi * wt; + gout12 += trr_21x * fac1 * hrr_010z; + gout13 += trr_11x * trr_10y * hrr_010z; + double hrr_110z = trr_20z - zjzi * trr_10z; + gout14 += trr_11x * fac1 * hrr_110z; + gout15 += trr_01x * trr_20y * hrr_010z; + gout16 += trr_01x * trr_10y * hrr_110z; + double trr_30z = c0z * trr_20z + 2*b10 * trr_10z; + double hrr_210z = trr_30z - zjzi * trr_20z; + gout17 += trr_01x * fac1 * hrr_210z; + double hrr_210x = trr_30x - xjxi * trr_20x; + double cpy = ypq*rt_ak; + double trr_01y = cpy * fac1; + gout18 += hrr_210x * trr_01y * wt; + double hrr_110x = trr_20x - xjxi * trr_10x; + double trr_11y = cpy * trr_10y + 1*b00 * fac1; + gout19 += hrr_110x * trr_11y * wt; + gout20 += hrr_110x * trr_01y * trr_10z; + double hrr_010x = trr_10x - xjxi * 1; + double trr_21y = cpy * trr_20y + 2*b00 * trr_10y; + gout21 += hrr_010x * trr_21y * wt; + gout22 += hrr_010x * trr_11y * trr_10z; + gout23 += hrr_010x * trr_01y * trr_20z; + double hrr_011y = trr_11y - yjyi * trr_01y; + gout24 += trr_20x * hrr_011y * wt; + double hrr_111y = trr_21y - yjyi * trr_11y; + gout25 += trr_10x * hrr_111y * wt; + gout26 += trr_10x * hrr_011y * trr_10z; + double trr_31y = cpy * trr_30y + 3*b00 * trr_20y; + double hrr_211y = trr_31y - yjyi * trr_21y; + gout27 += 1 * hrr_211y * wt; + gout28 += 1 * hrr_111y * trr_10z; + gout29 += 1 * hrr_011y * trr_20z; + gout30 += trr_20x * trr_01y * hrr_010z; + gout31 += trr_10x * trr_11y * hrr_010z; + gout32 += trr_10x * trr_01y * 
hrr_110z; + gout33 += 1 * trr_21y * hrr_010z; + gout34 += 1 * trr_11y * hrr_110z; + gout35 += 1 * trr_01y * hrr_210z; + double cpz = zpq*rt_ak; + double trr_01z = cpz * wt; + gout36 += hrr_210x * fac1 * trr_01z; + gout37 += hrr_110x * trr_10y * trr_01z; + double trr_11z = cpz * trr_10z + 1*b00 * wt; + gout38 += hrr_110x * fac1 * trr_11z; + gout39 += hrr_010x * trr_20y * trr_01z; + gout40 += hrr_010x * trr_10y * trr_11z; + double trr_21z = cpz * trr_20z + 2*b00 * trr_10z; + gout41 += hrr_010x * fac1 * trr_21z; + gout42 += trr_20x * hrr_010y * trr_01z; + gout43 += trr_10x * hrr_110y * trr_01z; + gout44 += trr_10x * hrr_010y * trr_11z; + gout45 += 1 * hrr_210y * trr_01z; + gout46 += 1 * hrr_110y * trr_11z; + gout47 += 1 * hrr_010y * trr_21z; + double hrr_011z = trr_11z - zjzi * trr_01z; + gout48 += trr_20x * fac1 * hrr_011z; + gout49 += trr_10x * trr_10y * hrr_011z; + double hrr_111z = trr_21z - zjzi * trr_11z; + gout50 += trr_10x * fac1 * hrr_111z; + gout51 += 1 * trr_20y * hrr_011z; + gout52 += 1 * trr_10y * hrr_111z; + double trr_31z = cpz * trr_30z + 3*b00 * trr_20z; + double hrr_211z = trr_31z - zjzi * trr_21z; + gout53 += 1 * fac1 * hrr_211z; + } + } + int naux = bounds.naux; + double *eri_tensor = out + shl_pair_idx * 18 * naux + ksh_in_auxmol * 3; + eri_tensor[0*naux + 0] = gout0; + eri_tensor[0*naux + 1] = gout18; + eri_tensor[0*naux + 2] = gout36; + eri_tensor[1*naux + 0] = gout1; + eri_tensor[1*naux + 1] = gout19; + eri_tensor[1*naux + 2] = gout37; + eri_tensor[2*naux + 0] = gout2; + eri_tensor[2*naux + 1] = gout20; + eri_tensor[2*naux + 2] = gout38; + eri_tensor[3*naux + 0] = gout3; + eri_tensor[3*naux + 1] = gout21; + eri_tensor[3*naux + 2] = gout39; + eri_tensor[4*naux + 0] = gout4; + eri_tensor[4*naux + 1] = gout22; + eri_tensor[4*naux + 2] = gout40; + eri_tensor[5*naux + 0] = gout5; + eri_tensor[5*naux + 1] = gout23; + eri_tensor[5*naux + 2] = gout41; + eri_tensor[6*naux + 0] = gout6; + eri_tensor[6*naux + 1] = gout24; + eri_tensor[6*naux + 2] = 
gout42; + eri_tensor[7*naux + 0] = gout7; + eri_tensor[7*naux + 1] = gout25; + eri_tensor[7*naux + 2] = gout43; + eri_tensor[8*naux + 0] = gout8; + eri_tensor[8*naux + 1] = gout26; + eri_tensor[8*naux + 2] = gout44; + eri_tensor[9*naux + 0] = gout9; + eri_tensor[9*naux + 1] = gout27; + eri_tensor[9*naux + 2] = gout45; + eri_tensor[10*naux + 0] = gout10; + eri_tensor[10*naux + 1] = gout28; + eri_tensor[10*naux + 2] = gout46; + eri_tensor[11*naux + 0] = gout11; + eri_tensor[11*naux + 1] = gout29; + eri_tensor[11*naux + 2] = gout47; + eri_tensor[12*naux + 0] = gout12; + eri_tensor[12*naux + 1] = gout30; + eri_tensor[12*naux + 2] = gout48; + eri_tensor[13*naux + 0] = gout13; + eri_tensor[13*naux + 1] = gout31; + eri_tensor[13*naux + 2] = gout49; + eri_tensor[14*naux + 0] = gout14; + eri_tensor[14*naux + 1] = gout32; + eri_tensor[14*naux + 2] = gout50; + eri_tensor[15*naux + 0] = gout15; + eri_tensor[15*naux + 1] = gout33; + eri_tensor[15*naux + 2] = gout51; + eri_tensor[16*naux + 0] = gout16; + eri_tensor[16*naux + 1] = gout34; + eri_tensor[16*naux + 2] = gout52; + eri_tensor[17*naux + 0] = gout17; + eri_tensor[17*naux + 1] = gout35; + eri_tensor[17*naux + 2] = gout53; + } +} + +__global__ +void int3c2e_221(double *out, Int3c2eEnvVars envs, Int3c2eBounds bounds) +{ + int st_id = threadIdx.x; + int gout_id = threadIdx.y; + int batch_id = blockIdx.x; + int iprim = bounds.iprim; + int jprim = bounds.jprim; + int kprim = bounds.kprim; + int ijprim = iprim * jprim; + int ijkprim = ijprim * kprim; + int nroots = bounds.nroots; + int *bas = envs.bas; + int nbas = envs.nbas; + double *env = envs.env; + double omega = env[PTR_RANGE_OMEGA]; + extern __shared__ double rw_cache[]; + double *rw = rw_cache + st_id; + double *gx = rw + nroots * 128; + double *gy = gx + 1152; + double *gz = gy + 1152; + double *Rpq = gz + 1152; + double *rjri = Rpq + 192; + if (gout_id == 0) { + gx[0] = 1.; + } + + int nksh = bounds.nksh; + int nshl_pair = bounds.nshl_pair; + int nst = nksh * 
nshl_pair; + int st0 = batch_id * 64 * BATCHES_PER_BLOCK; + int st1 = MIN(nst, st0 + 64 * BATCHES_PER_BLOCK); + for (int ijk_idx = st0+st_id; ijk_idx < st1+st_id; ijk_idx += 64) { + int ksh_in_auxmol = ijk_idx % nksh; + int ksh = ksh_in_auxmol + bounds.ksh0; + int shl_pair_idx = ijk_idx / nksh; + __syncthreads(); + if (ijk_idx >= nst) { + shl_pair_idx = st0 / nksh; + if (gout_id == 0) { + gx[0] = 0.; + } + } + int bas_ij = bounds.bas_ij_idx[shl_pair_idx]; + int ish = bas_ij / nbas; + int jsh = bas_ij % nbas; + double *expi = env + bas[ish*BAS_SLOTS+PTR_EXP]; + double *expj = env + bas[jsh*BAS_SLOTS+PTR_EXP]; + double *expk = env + bas[ksh*BAS_SLOTS+PTR_EXP]; + double *ci = env + bas[ish*BAS_SLOTS+PTR_COEFF]; + double *cj = env + bas[jsh*BAS_SLOTS+PTR_COEFF]; + double *ck = env + bas[ksh*BAS_SLOTS+PTR_COEFF]; + double *ri = env + bas[ish*BAS_SLOTS+PTR_BAS_COORD]; + double *rj = env + bas[jsh*BAS_SLOTS+PTR_BAS_COORD]; + double *rk = env + bas[ksh*BAS_SLOTS+PTR_BAS_COORD]; + if (gout_id == 0) { + double xjxi = rj[0] - ri[0]; + double yjyi = rj[1] - ri[1]; + double zjzi = rj[2] - ri[2]; + double rr_ij = xjxi*xjxi + yjyi*yjyi + zjzi*zjzi; + rjri[0] = xjxi; + rjri[64] = yjyi; + rjri[128] = zjzi; + rjri[192] = rr_ij; + } + double gout0 = 0; + double gout1 = 0; + double gout2 = 0; + double gout3 = 0; + double gout4 = 0; + double gout5 = 0; + double gout6 = 0; + double gout7 = 0; + double gout8 = 0; + double gout9 = 0; + double gout10 = 0; + double gout11 = 0; + double gout12 = 0; + double gout13 = 0; + double gout14 = 0; + double gout15 = 0; + double gout16 = 0; + double gout17 = 0; + double gout18 = 0; + double gout19 = 0; + double gout20 = 0; + double gout21 = 0; + double gout22 = 0; + double gout23 = 0; + double gout24 = 0; + double gout25 = 0; + double gout26 = 0; + double s0, s1, s2; + for (int ijkp = 0; ijkp < ijkprim; ++ijkp) { + int ijp = ijkp / kprim; + int kp = ijkp % kprim; + int ip = ijp / jprim; + int jp = ijp % jprim; + double ai = expi[ip]; + double aj = 
expj[jp]; + double ak = expk[kp]; + double aij = ai + aj; + double aj_aij = aj / aij; + __syncthreads(); + double xij = rjri[0] * aj_aij + ri[0]; + double yij = rjri[64] * aj_aij + ri[1]; + double zij = rjri[128] * aj_aij + ri[2]; + double xpq = xij - rk[0]; + double ypq = yij - rk[1]; + double zpq = zij - rk[2]; + if (gout_id == 0) { + double cijk = ci[ip] * cj[jp] * ck[kp]; + double fac = PI_FAC * cijk / (aij*ak*sqrt(aij+ak)); + double theta_ij = ai * aj_aij; + double Kab = theta_ij * rjri[192]; + gy[0] = fac * exp(-Kab); + Rpq[0] = xpq; + Rpq[64] = ypq; + Rpq[128] = zpq; + } + double rr = xpq * xpq + ypq * ypq + zpq * zpq; + double theta = aij * ak / (aij + ak); + double theta_rr = theta * rr; + if (omega == 0) { + rys_roots(3, theta_rr, rw, 64, gout_id, 4); + } else if (omega > 0) { + double omega2 = omega * omega; + double theta_fac = omega2 / (omega2 + theta); + rys_roots(3, theta_fac*theta_rr, rw, 64, gout_id, 4); + __syncthreads(); + double sqrt_theta_fac = sqrt(theta_fac); + for (int irys = gout_id; irys < 3; irys+=4) { + rw[ irys*2 *64] *= theta_fac; + rw[(irys*2+1)*64] *= sqrt_theta_fac; + } + } else { + double omega2 = omega * omega; + double theta_fac = omega2 / (omega2 + theta); + double *rw1 = rw + 384; + rys_roots(3, theta_rr, rw1, 64, gout_id, 4); + rys_roots(3, theta_fac*theta_rr, rw, 64, gout_id, 4); + __syncthreads(); + double sqrt_theta_fac = -sqrt(theta_fac); + for (int irys = gout_id; irys < 3; irys+=4) { + rw[ irys*2 *64] *= theta_fac; + rw[(irys*2+1)*64] *= sqrt_theta_fac; + } + } + for (int irys = 0; irys < nroots; ++irys) { + __syncthreads(); + double rt = rw[irys*128]; + double rt_aa = rt / (aij + ak); + double rt_aij = rt_aa * ak; + double b10 = .5/aij * (1 - rt_aij); + double rt_ak = rt_aa * aij; + double b00 = .5 * rt_aa; + for (int n = gout_id; n < 3; n += 4) { + if (n == 2) { + gz[0] = rw[irys*128+64]; + } + double *_gx = gx + n * 1152; + double xjxi = rjri[n * 64]; + double Rpa = xjxi * aj_aij; + double c0x = Rpa - rt_aij * Rpq[n * 
64]; + s0 = _gx[0]; + s1 = c0x * s0; + _gx[64] = s1; + s2 = c0x * s1 + 1 * b10 * s0; + _gx[128] = s2; + s0 = s1; + s1 = s2; + s2 = c0x * s1 + 2 * b10 * s0; + _gx[192] = s2; + s0 = s1; + s1 = s2; + s2 = c0x * s1 + 3 * b10 * s0; + _gx[256] = s2; + double cpx = rt_ak * Rpq[n * 64]; + s0 = _gx[0]; + s1 = cpx * s0; + _gx[576] = s1; + s0 = _gx[64]; + s1 = cpx * s0; + s1 += 1 * b00 * _gx[0]; + _gx[640] = s1; + s0 = _gx[128]; + s1 = cpx * s0; + s1 += 2 * b00 * _gx[64]; + _gx[704] = s1; + s0 = _gx[192]; + s1 = cpx * s0; + s1 += 3 * b00 * _gx[128]; + _gx[768] = s1; + s0 = _gx[256]; + s1 = cpx * s0; + s1 += 4 * b00 * _gx[192]; + _gx[832] = s1; + s1 = _gx[256]; + s0 = _gx[192]; + _gx[384] = s1 - xjxi * s0; + s1 = s0; + s0 = _gx[128]; + _gx[320] = s1 - xjxi * s0; + s1 = s0; + s0 = _gx[64]; + _gx[256] = s1 - xjxi * s0; + s1 = s0; + s0 = _gx[0]; + _gx[192] = s1 - xjxi * s0; + s1 = _gx[384]; + s0 = _gx[320]; + _gx[512] = s1 - xjxi * s0; + s1 = s0; + s0 = _gx[256]; + _gx[448] = s1 - xjxi * s0; + s1 = s0; + s0 = _gx[192]; + _gx[384] = s1 - xjxi * s0; + s1 = _gx[832]; + s0 = _gx[768]; + _gx[960] = s1 - xjxi * s0; + s1 = s0; + s0 = _gx[704]; + _gx[896] = s1 - xjxi * s0; + s1 = s0; + s0 = _gx[640]; + _gx[832] = s1 - xjxi * s0; + s1 = s0; + s0 = _gx[576]; + _gx[768] = s1 - xjxi * s0; + s1 = _gx[960]; + s0 = _gx[896]; + _gx[1088] = s1 - xjxi * s0; + s1 = s0; + s0 = _gx[832]; + _gx[1024] = s1 - xjxi * s0; + s1 = s0; + s0 = _gx[768]; + _gx[960] = s1 - xjxi * s0; + } + __syncthreads(); + switch (gout_id) { + case 0: + gout0 += gx[1088] * gy[0] * gz[0]; + gout1 += gx[448] * gy[640] * gz[0]; + gout2 += gx[448] * gy[0] * gz[640]; + gout3 += gx[960] * gy[64] * gz[64]; + gout4 += gx[384] * gy[576] * gz[128]; + gout5 += gx[320] * gy[192] * gz[576]; + gout6 += gx[832] * gy[192] * gz[64]; + gout7 += gx[192] * gy[896] * gz[0]; + gout8 += gx[192] * gy[256] * gz[640]; + gout9 += gx[896] * gy[0] * gz[192]; + gout10 += gx[256] * gy[640] * gz[192]; + gout11 += gx[256] * gy[0] * gz[832]; + gout12 += 
gx[768] * gy[64] * gz[256]; + gout13 += gx[192] * gy[576] * gz[320]; + gout14 += gx[128] * gy[384] * gz[576]; + gout15 += gx[640] * gy[384] * gz[64]; + gout16 += gx[0] * gy[1088] * gz[0]; + gout17 += gx[0] * gy[448] * gz[640]; + gout18 += gx[704] * gy[192] * gz[192]; + gout19 += gx[64] * gy[832] * gz[192]; + gout20 += gx[64] * gy[192] * gz[832]; + gout21 += gx[576] * gy[256] * gz[256]; + gout22 += gx[0] * gy[768] * gz[320]; + gout23 += gx[128] * gy[0] * gz[960]; + gout24 += gx[640] * gy[0] * gz[448]; + gout25 += gx[0] * gy[704] * gz[384]; + gout26 += gx[0] * gy[64] * gz[1024]; + break; + case 1: + gout0 += gx[512] * gy[576] * gz[0]; + gout1 += gx[448] * gy[64] * gz[576]; + gout2 += gx[960] * gy[128] * gz[0]; + gout3 += gx[384] * gy[640] * gz[64]; + gout4 += gx[384] * gy[0] * gz[704]; + gout5 += gx[832] * gy[256] * gz[0]; + gout6 += gx[256] * gy[768] * gz[64]; + gout7 += gx[192] * gy[320] * gz[576]; + gout8 += gx[768] * gy[192] * gz[128]; + gout9 += gx[320] * gy[576] * gz[192]; + gout10 += gx[256] * gy[64] * gz[768]; + gout11 += gx[768] * gy[128] * gz[192]; + gout12 += gx[192] * gy[640] * gz[256]; + gout13 += gx[192] * gy[0] * gz[896]; + gout14 += gx[640] * gy[448] * gz[0]; + gout15 += gx[64] * gy[960] * gz[64]; + gout16 += gx[0] * gy[512] * gz[576]; + gout17 += gx[576] * gy[384] * gz[128]; + gout18 += gx[128] * gy[768] * gz[192]; + gout19 += gx[64] * gy[256] * gz[768]; + gout20 += gx[576] * gy[320] * gz[192]; + gout21 += gx[0] * gy[832] * gz[256]; + gout22 += gx[0] * gy[192] * gz[896]; + gout23 += gx[640] * gy[64] * gz[384]; + gout24 += gx[64] * gy[576] * gz[448]; + gout25 += gx[0] * gy[128] * gz[960]; + gout26 += gx[576] * gy[0] * gz[512]; + break; + case 2: + gout0 += gx[512] * gy[0] * gz[576]; + gout1 += gx[1024] * gy[0] * gz[64]; + gout2 += gx[384] * gy[704] * gz[0]; + gout3 += gx[384] * gy[64] * gz[640]; + gout4 += gx[896] * gy[192] * gz[0]; + gout5 += gx[256] * gy[832] * gz[0]; + gout6 += gx[256] * gy[192] * gz[640]; + gout7 += gx[768] * gy[256] * gz[64]; + 
gout8 += gx[192] * gy[768] * gz[128]; + gout9 += gx[320] * gy[0] * gz[768]; + gout10 += gx[832] * gy[0] * gz[256]; + gout11 += gx[192] * gy[704] * gz[192]; + gout12 += gx[192] * gy[64] * gz[832]; + gout13 += gx[704] * gy[384] * gz[0]; + gout14 += gx[64] * gy[1024] * gz[0]; + gout15 += gx[64] * gy[384] * gz[640]; + gout16 += gx[576] * gy[448] * gz[64]; + gout17 += gx[0] * gy[960] * gz[128]; + gout18 += gx[128] * gy[192] * gz[768]; + gout19 += gx[640] * gy[192] * gz[256]; + gout20 += gx[0] * gy[896] * gz[192]; + gout21 += gx[0] * gy[256] * gz[832]; + gout22 += gx[704] * gy[0] * gz[384]; + gout23 += gx[64] * gy[640] * gz[384]; + gout24 += gx[64] * gy[0] * gz[1024]; + gout25 += gx[576] * gy[64] * gz[448]; + gout26 += gx[0] * gy[576] * gz[512]; + break; + case 3: + gout0 += gx[1024] * gy[64] * gz[0]; + gout1 += gx[448] * gy[576] * gz[64]; + gout2 += gx[384] * gy[128] * gz[576]; + gout3 += gx[960] * gy[0] * gz[128]; + gout4 += gx[320] * gy[768] * gz[0]; + gout5 += gx[256] * gy[256] * gz[576]; + gout6 += gx[768] * gy[320] * gz[0]; + gout7 += gx[192] * gy[832] * gz[64]; + gout8 += gx[192] * gy[192] * gz[704]; + gout9 += gx[832] * gy[64] * gz[192]; + gout10 += gx[256] * gy[576] * gz[256]; + gout11 += gx[192] * gy[128] * gz[768]; + gout12 += gx[768] * gy[0] * gz[320]; + gout13 += gx[128] * gy[960] * gz[0]; + gout14 += gx[64] * gy[448] * gz[576]; + gout15 += gx[576] * gy[512] * gz[0]; + gout16 += gx[0] * gy[1024] * gz[64]; + gout17 += gx[0] * gy[384] * gz[704]; + gout18 += gx[640] * gy[256] * gz[192]; + gout19 += gx[64] * gy[768] * gz[256]; + gout20 += gx[0] * gy[320] * gz[768]; + gout21 += gx[576] * gy[192] * gz[320]; + gout22 += gx[128] * gy[576] * gz[384]; + gout23 += gx[64] * gy[64] * gz[960]; + gout24 += gx[576] * gy[128] * gz[384]; + gout25 += gx[0] * gy[640] * gz[448]; + gout26 += gx[0] * gy[0] * gz[1088]; + break; + } + } + } + if (ijk_idx < nst) { + int naux = bounds.naux; + double *eri_tensor = out + shl_pair_idx * 36 * naux + ksh_in_auxmol * 3; + switch (gout_id) { 
+ case 0: + eri_tensor[0*naux + 0] = gout0; + eri_tensor[1*naux + 1] = gout1; + eri_tensor[2*naux + 2] = gout2; + eri_tensor[4*naux + 0] = gout3; + eri_tensor[5*naux + 1] = gout4; + eri_tensor[6*naux + 2] = gout5; + eri_tensor[8*naux + 0] = gout6; + eri_tensor[9*naux + 1] = gout7; + eri_tensor[10*naux + 2] = gout8; + eri_tensor[12*naux + 0] = gout9; + eri_tensor[13*naux + 1] = gout10; + eri_tensor[14*naux + 2] = gout11; + eri_tensor[16*naux + 0] = gout12; + eri_tensor[17*naux + 1] = gout13; + eri_tensor[18*naux + 2] = gout14; + eri_tensor[20*naux + 0] = gout15; + eri_tensor[21*naux + 1] = gout16; + eri_tensor[22*naux + 2] = gout17; + eri_tensor[24*naux + 0] = gout18; + eri_tensor[25*naux + 1] = gout19; + eri_tensor[26*naux + 2] = gout20; + eri_tensor[28*naux + 0] = gout21; + eri_tensor[29*naux + 1] = gout22; + eri_tensor[30*naux + 2] = gout23; + eri_tensor[32*naux + 0] = gout24; + eri_tensor[33*naux + 1] = gout25; + eri_tensor[34*naux + 2] = gout26; + break; + case 1: + eri_tensor[0*naux + 1] = gout0; + eri_tensor[1*naux + 2] = gout1; + eri_tensor[3*naux + 0] = gout2; + eri_tensor[4*naux + 1] = gout3; + eri_tensor[5*naux + 2] = gout4; + eri_tensor[7*naux + 0] = gout5; + eri_tensor[8*naux + 1] = gout6; + eri_tensor[9*naux + 2] = gout7; + eri_tensor[11*naux + 0] = gout8; + eri_tensor[12*naux + 1] = gout9; + eri_tensor[13*naux + 2] = gout10; + eri_tensor[15*naux + 0] = gout11; + eri_tensor[16*naux + 1] = gout12; + eri_tensor[17*naux + 2] = gout13; + eri_tensor[19*naux + 0] = gout14; + eri_tensor[20*naux + 1] = gout15; + eri_tensor[21*naux + 2] = gout16; + eri_tensor[23*naux + 0] = gout17; + eri_tensor[24*naux + 1] = gout18; + eri_tensor[25*naux + 2] = gout19; + eri_tensor[27*naux + 0] = gout20; + eri_tensor[28*naux + 1] = gout21; + eri_tensor[29*naux + 2] = gout22; + eri_tensor[31*naux + 0] = gout23; + eri_tensor[32*naux + 1] = gout24; + eri_tensor[33*naux + 2] = gout25; + eri_tensor[35*naux + 0] = gout26; + break; + case 2: + eri_tensor[0*naux + 2] = gout0; + 
eri_tensor[2*naux + 0] = gout1; + eri_tensor[3*naux + 1] = gout2; + eri_tensor[4*naux + 2] = gout3; + eri_tensor[6*naux + 0] = gout4; + eri_tensor[7*naux + 1] = gout5; + eri_tensor[8*naux + 2] = gout6; + eri_tensor[10*naux + 0] = gout7; + eri_tensor[11*naux + 1] = gout8; + eri_tensor[12*naux + 2] = gout9; + eri_tensor[14*naux + 0] = gout10; + eri_tensor[15*naux + 1] = gout11; + eri_tensor[16*naux + 2] = gout12; + eri_tensor[18*naux + 0] = gout13; + eri_tensor[19*naux + 1] = gout14; + eri_tensor[20*naux + 2] = gout15; + eri_tensor[22*naux + 0] = gout16; + eri_tensor[23*naux + 1] = gout17; + eri_tensor[24*naux + 2] = gout18; + eri_tensor[26*naux + 0] = gout19; + eri_tensor[27*naux + 1] = gout20; + eri_tensor[28*naux + 2] = gout21; + eri_tensor[30*naux + 0] = gout22; + eri_tensor[31*naux + 1] = gout23; + eri_tensor[32*naux + 2] = gout24; + eri_tensor[34*naux + 0] = gout25; + eri_tensor[35*naux + 1] = gout26; + break; + case 3: + eri_tensor[1*naux + 0] = gout0; + eri_tensor[2*naux + 1] = gout1; + eri_tensor[3*naux + 2] = gout2; + eri_tensor[5*naux + 0] = gout3; + eri_tensor[6*naux + 1] = gout4; + eri_tensor[7*naux + 2] = gout5; + eri_tensor[9*naux + 0] = gout6; + eri_tensor[10*naux + 1] = gout7; + eri_tensor[11*naux + 2] = gout8; + eri_tensor[13*naux + 0] = gout9; + eri_tensor[14*naux + 1] = gout10; + eri_tensor[15*naux + 2] = gout11; + eri_tensor[17*naux + 0] = gout12; + eri_tensor[18*naux + 1] = gout13; + eri_tensor[19*naux + 2] = gout14; + eri_tensor[21*naux + 0] = gout15; + eri_tensor[22*naux + 1] = gout16; + eri_tensor[23*naux + 2] = gout17; + eri_tensor[25*naux + 0] = gout18; + eri_tensor[26*naux + 1] = gout19; + eri_tensor[27*naux + 2] = gout20; + eri_tensor[29*naux + 0] = gout21; + eri_tensor[30*naux + 1] = gout22; + eri_tensor[31*naux + 2] = gout23; + eri_tensor[33*naux + 0] = gout24; + eri_tensor[34*naux + 1] = gout25; + eri_tensor[35*naux + 2] = gout26; + break; + } + } + } +} + +#if CUDA_VERSION >= 12040 +__global__ __maxnreg__(128) +#else +__global__ 
+#endif +void int3c2e_002(double *out, Int3c2eEnvVars envs, Int3c2eBounds bounds) +{ + int nst_per_block = blockDim.x; + int st_id = threadIdx.x; + int batch_id = blockIdx.x; + int iprim = bounds.iprim; + int jprim = bounds.jprim; + int kprim = bounds.kprim; + int ijprim = iprim * jprim; + int ijkprim = ijprim * kprim; + int *bas = envs.bas; + int nbas = envs.nbas; + double *env = envs.env; + double omega = env[PTR_RANGE_OMEGA]; + extern __shared__ double rw_cache[]; + double *rw = rw_cache + st_id; + + int nksh = bounds.nksh; + int nshl_pair = bounds.nshl_pair; + int nst = nksh * nshl_pair; + int st0 = batch_id * nst_per_block * BATCHES_PER_BLOCK; + int st1 = MIN(nst, st0 + nst_per_block * BATCHES_PER_BLOCK); + for (int ijk_idx = st0 + st_id; ijk_idx < st1; ijk_idx += nst_per_block) { + int ksh_in_auxmol = ijk_idx % nksh; + int ksh = ksh_in_auxmol + bounds.ksh0; + int shl_pair_idx = ijk_idx / nksh; + int bas_ij = bounds.bas_ij_idx[shl_pair_idx]; + int ish = bas_ij / nbas; + int jsh = bas_ij % nbas; + double *expi = env + bas[ish*BAS_SLOTS+PTR_EXP]; + double *expj = env + bas[jsh*BAS_SLOTS+PTR_EXP]; + double *expk = env + bas[ksh*BAS_SLOTS+PTR_EXP]; + double *ci = env + bas[ish*BAS_SLOTS+PTR_COEFF]; + double *cj = env + bas[jsh*BAS_SLOTS+PTR_COEFF]; + double *ck = env + bas[ksh*BAS_SLOTS+PTR_COEFF]; + double *ri = env + bas[ish*BAS_SLOTS+PTR_BAS_COORD]; + double *rj = env + bas[jsh*BAS_SLOTS+PTR_BAS_COORD]; + double *rk = env + bas[ksh*BAS_SLOTS+PTR_BAS_COORD]; + double xi = ri[0]; + double yi = ri[1]; + double zi = ri[2]; + double xk = rk[0]; + double yk = rk[1]; + double zk = rk[2]; + double xjxi = rj[0] - xi; + double yjyi = rj[1] - yi; + double zjzi = rj[2] - zi; + double rr_ij = xjxi*xjxi + yjyi*yjyi + zjzi*zjzi; + double gout0 = 0; + double gout1 = 0; + double gout2 = 0; + double gout3 = 0; + double gout4 = 0; + double gout5 = 0; + for (int ijkp = 0; ijkp < ijkprim; ++ijkp) { + int ijp = ijkp / kprim; + int kp = ijkp % kprim; + int ip = ijp / jprim; + int jp 
= ijp % jprim; + double ai = expi[ip]; + double aj = expj[jp]; + double ak = expk[kp]; + double aij = ai + aj; + double cijk = ci[ip] * cj[jp] * ck[kp]; + double fac = PI_FAC * cijk / (aij*ak*sqrt(aij+ak)); + double aj_aij = aj / aij; + double theta_ij = ai * aj_aij; + double Kab = theta_ij * rr_ij; + double fac1 = fac * exp(-Kab); + double xij = xjxi * aj_aij + xi; + double yij = yjyi * aj_aij + yi; + double zij = zjzi * aj_aij + zi; + double xpq = xij - xk; + double ypq = yij - yk; + double zpq = zij - zk; + double rr = xpq * xpq + ypq * ypq + zpq * zpq; + double theta = aij * ak / (aij + ak); + double theta_rr = theta * rr; + if (omega == 0) { + rys_roots(2, theta_rr, rw, nst_per_block, 0, 1); + } else if (omega > 0) { + double omega2 = omega * omega; + double theta_fac = omega2 / (omega2 + theta); + rys_roots(2, theta_fac*theta_rr, rw, nst_per_block, 0, 1); + double sqrt_theta_fac = sqrt(theta_fac); + for (int irys = 0; irys < 2; ++irys) { + rw[ irys*2 *nst_per_block] *= theta_fac; + rw[(irys*2+1)*nst_per_block] *= sqrt_theta_fac; + } + } else { + double omega2 = omega * omega; + double theta_fac = omega2 / (omega2 + theta); + double *rw1 = rw + 4*nst_per_block; + rys_roots(2, theta_rr, rw1, nst_per_block, 0, 1); + rys_roots(2, theta_fac*theta_rr, rw, nst_per_block, 0, 1); + double sqrt_theta_fac = -sqrt(theta_fac); + for (int irys = 0; irys < 2; ++irys) { + rw[ irys*2 *nst_per_block] *= theta_fac; + rw[(irys*2+1)*nst_per_block] *= sqrt_theta_fac; + } + } + for (int irys = 0; irys < bounds.nroots; ++irys) { + double wt = rw[(2*irys+1)*nst_per_block]; + double rt = rw[ 2*irys *nst_per_block]; + double rt_aa = rt / (aij + ak); + double rt_ak = rt_aa * aij; + double b01 = .5/ak * (1 - rt_ak); + double cpx = xpq*rt_ak; + double trr_01x = cpx * 1; + double trr_02x = cpx * trr_01x + 1*b01 * 1; + gout0 += trr_02x * fac1 * wt; + double cpy = ypq*rt_ak; + double trr_01y = cpy * fac1; + gout1 += trr_01x * trr_01y * wt; + double cpz = zpq*rt_ak; + double trr_01z = cpz * 
wt; + gout2 += trr_01x * fac1 * trr_01z; + double trr_02y = cpy * trr_01y + 1*b01 * fac1; + gout3 += 1 * trr_02y * wt; + gout4 += 1 * trr_01y * trr_01z; + double trr_02z = cpz * trr_01z + 1*b01 * wt; + gout5 += 1 * fac1 * trr_02z; + } + } + int naux = bounds.naux; + double *eri_tensor = out + shl_pair_idx * 1 * naux + ksh_in_auxmol * 6; + eri_tensor[0*naux + 0] = gout0; + eri_tensor[0*naux + 1] = gout1; + eri_tensor[0*naux + 2] = gout2; + eri_tensor[0*naux + 3] = gout3; + eri_tensor[0*naux + 4] = gout4; + eri_tensor[0*naux + 5] = gout5; + } +} + +#if CUDA_VERSION >= 12040 +__global__ __maxnreg__(128) +#else +__global__ +#endif +void int3c2e_102(double *out, Int3c2eEnvVars envs, Int3c2eBounds bounds) +{ + int nst_per_block = blockDim.x; + int st_id = threadIdx.x; + int batch_id = blockIdx.x; + int iprim = bounds.iprim; + int jprim = bounds.jprim; + int kprim = bounds.kprim; + int ijprim = iprim * jprim; + int ijkprim = ijprim * kprim; + int *bas = envs.bas; + int nbas = envs.nbas; + double *env = envs.env; + double omega = env[PTR_RANGE_OMEGA]; + extern __shared__ double rw_cache[]; + double *rw = rw_cache + st_id; + + int nksh = bounds.nksh; + int nshl_pair = bounds.nshl_pair; + int nst = nksh * nshl_pair; + int st0 = batch_id * nst_per_block * BATCHES_PER_BLOCK; + int st1 = MIN(nst, st0 + nst_per_block * BATCHES_PER_BLOCK); + for (int ijk_idx = st0 + st_id; ijk_idx < st1; ijk_idx += nst_per_block) { + int ksh_in_auxmol = ijk_idx % nksh; + int ksh = ksh_in_auxmol + bounds.ksh0; + int shl_pair_idx = ijk_idx / nksh; + int bas_ij = bounds.bas_ij_idx[shl_pair_idx]; + int ish = bas_ij / nbas; + int jsh = bas_ij % nbas; + double *expi = env + bas[ish*BAS_SLOTS+PTR_EXP]; + double *expj = env + bas[jsh*BAS_SLOTS+PTR_EXP]; + double *expk = env + bas[ksh*BAS_SLOTS+PTR_EXP]; + double *ci = env + bas[ish*BAS_SLOTS+PTR_COEFF]; + double *cj = env + bas[jsh*BAS_SLOTS+PTR_COEFF]; + double *ck = env + bas[ksh*BAS_SLOTS+PTR_COEFF]; + double *ri = env + 
bas[ish*BAS_SLOTS+PTR_BAS_COORD]; + double *rj = env + bas[jsh*BAS_SLOTS+PTR_BAS_COORD]; + double *rk = env + bas[ksh*BAS_SLOTS+PTR_BAS_COORD]; + double xi = ri[0]; + double yi = ri[1]; + double zi = ri[2]; + double xk = rk[0]; + double yk = rk[1]; + double zk = rk[2]; + double xjxi = rj[0] - xi; + double yjyi = rj[1] - yi; + double zjzi = rj[2] - zi; + double rr_ij = xjxi*xjxi + yjyi*yjyi + zjzi*zjzi; + double gout0 = 0; + double gout1 = 0; + double gout2 = 0; + double gout3 = 0; + double gout4 = 0; + double gout5 = 0; + double gout6 = 0; + double gout7 = 0; + double gout8 = 0; + double gout9 = 0; + double gout10 = 0; + double gout11 = 0; + double gout12 = 0; + double gout13 = 0; + double gout14 = 0; + double gout15 = 0; + double gout16 = 0; + double gout17 = 0; + for (int ijkp = 0; ijkp < ijkprim; ++ijkp) { + int ijp = ijkp / kprim; + int kp = ijkp % kprim; + int ip = ijp / jprim; + int jp = ijp % jprim; + double ai = expi[ip]; + double aj = expj[jp]; + double ak = expk[kp]; + double aij = ai + aj; + double cijk = ci[ip] * cj[jp] * ck[kp]; + double fac = PI_FAC * cijk / (aij*ak*sqrt(aij+ak)); + double aj_aij = aj / aij; + double theta_ij = ai * aj_aij; + double Kab = theta_ij * rr_ij; + double fac1 = fac * exp(-Kab); + double xij = xjxi * aj_aij + xi; + double yij = yjyi * aj_aij + yi; + double zij = zjzi * aj_aij + zi; + double xpq = xij - xk; + double ypq = yij - yk; + double zpq = zij - zk; + double rr = xpq * xpq + ypq * ypq + zpq * zpq; + double theta = aij * ak / (aij + ak); + double theta_rr = theta * rr; + if (omega == 0) { + rys_roots(2, theta_rr, rw, nst_per_block, 0, 1); + } else if (omega > 0) { + double omega2 = omega * omega; + double theta_fac = omega2 / (omega2 + theta); + rys_roots(2, theta_fac*theta_rr, rw, nst_per_block, 0, 1); + double sqrt_theta_fac = sqrt(theta_fac); + for (int irys = 0; irys < 2; ++irys) { + rw[ irys*2 *nst_per_block] *= theta_fac; + rw[(irys*2+1)*nst_per_block] *= sqrt_theta_fac; + } + } else { + double omega2 = omega * 
omega; + double theta_fac = omega2 / (omega2 + theta); + double *rw1 = rw + 4*nst_per_block; + rys_roots(2, theta_rr, rw1, nst_per_block, 0, 1); + rys_roots(2, theta_fac*theta_rr, rw, nst_per_block, 0, 1); + double sqrt_theta_fac = -sqrt(theta_fac); + for (int irys = 0; irys < 2; ++irys) { + rw[ irys*2 *nst_per_block] *= theta_fac; + rw[(irys*2+1)*nst_per_block] *= sqrt_theta_fac; + } + } + for (int irys = 0; irys < bounds.nroots; ++irys) { + double wt = rw[(2*irys+1)*nst_per_block]; + double rt = rw[ 2*irys *nst_per_block]; + double rt_aa = rt / (aij + ak); + double b00 = .5 * rt_aa; + double rt_ak = rt_aa * aij; + double b01 = .5/ak * (1 - rt_ak); + double cpx = xpq*rt_ak; + double rt_aij = rt_aa * ak; + double c0x = xjxi * aj_aij - xpq*rt_aij; + double trr_10x = c0x * 1; + double trr_11x = cpx * trr_10x + 1*b00 * 1; + double trr_01x = cpx * 1; + double trr_12x = cpx * trr_11x + 1*b01 * trr_10x + 1*b00 * trr_01x; + gout0 += trr_12x * fac1 * wt; + double trr_02x = cpx * trr_01x + 1*b01 * 1; + double c0y = yjyi * aj_aij - ypq*rt_aij; + double trr_10y = c0y * fac1; + gout1 += trr_02x * trr_10y * wt; + double c0z = zjzi * aj_aij - zpq*rt_aij; + double trr_10z = c0z * wt; + gout2 += trr_02x * fac1 * trr_10z; + double cpy = ypq*rt_ak; + double trr_01y = cpy * fac1; + gout3 += trr_11x * trr_01y * wt; + double trr_11y = cpy * trr_10y + 1*b00 * fac1; + gout4 += trr_01x * trr_11y * wt; + gout5 += trr_01x * trr_01y * trr_10z; + double cpz = zpq*rt_ak; + double trr_01z = cpz * wt; + gout6 += trr_11x * fac1 * trr_01z; + gout7 += trr_01x * trr_10y * trr_01z; + double trr_11z = cpz * trr_10z + 1*b00 * wt; + gout8 += trr_01x * fac1 * trr_11z; + double trr_02y = cpy * trr_01y + 1*b01 * fac1; + gout9 += trr_10x * trr_02y * wt; + double trr_12y = cpy * trr_11y + 1*b01 * trr_10y + 1*b00 * trr_01y; + gout10 += 1 * trr_12y * wt; + gout11 += 1 * trr_02y * trr_10z; + gout12 += trr_10x * trr_01y * trr_01z; + gout13 += 1 * trr_11y * trr_01z; + gout14 += 1 * trr_01y * trr_11z; + double 
trr_02z = cpz * trr_01z + 1*b01 * wt; + gout15 += trr_10x * fac1 * trr_02z; + gout16 += 1 * trr_10y * trr_02z; + double trr_12z = cpz * trr_11z + 1*b01 * trr_10z + 1*b00 * trr_01z; + gout17 += 1 * fac1 * trr_12z; + } + } + int naux = bounds.naux; + double *eri_tensor = out + shl_pair_idx * 3 * naux + ksh_in_auxmol * 6; + eri_tensor[0*naux + 0] = gout0; + eri_tensor[0*naux + 1] = gout3; + eri_tensor[0*naux + 2] = gout6; + eri_tensor[0*naux + 3] = gout9; + eri_tensor[0*naux + 4] = gout12; + eri_tensor[0*naux + 5] = gout15; + eri_tensor[1*naux + 0] = gout1; + eri_tensor[1*naux + 1] = gout4; + eri_tensor[1*naux + 2] = gout7; + eri_tensor[1*naux + 3] = gout10; + eri_tensor[1*naux + 4] = gout13; + eri_tensor[1*naux + 5] = gout16; + eri_tensor[2*naux + 0] = gout2; + eri_tensor[2*naux + 1] = gout5; + eri_tensor[2*naux + 2] = gout8; + eri_tensor[2*naux + 3] = gout11; + eri_tensor[2*naux + 4] = gout14; + eri_tensor[2*naux + 5] = gout17; + } +} + +__global__ +void int3c2e_112(double *out, Int3c2eEnvVars envs, Int3c2eBounds bounds) +{ + int nst_per_block = blockDim.x; + int st_id = threadIdx.x; + int batch_id = blockIdx.x; + int iprim = bounds.iprim; + int jprim = bounds.jprim; + int kprim = bounds.kprim; + int ijprim = iprim * jprim; + int ijkprim = ijprim * kprim; + int *bas = envs.bas; + int nbas = envs.nbas; + double *env = envs.env; + double omega = env[PTR_RANGE_OMEGA]; + extern __shared__ double rw_cache[]; + double *rw = rw_cache + st_id; + + int nksh = bounds.nksh; + int nshl_pair = bounds.nshl_pair; + int nst = nksh * nshl_pair; + int st0 = batch_id * nst_per_block * BATCHES_PER_BLOCK; + int st1 = MIN(nst, st0 + nst_per_block * BATCHES_PER_BLOCK); + for (int ijk_idx = st0 + st_id; ijk_idx < st1; ijk_idx += nst_per_block) { + int ksh_in_auxmol = ijk_idx % nksh; + int ksh = ksh_in_auxmol + bounds.ksh0; + int shl_pair_idx = ijk_idx / nksh; + int bas_ij = bounds.bas_ij_idx[shl_pair_idx]; + int ish = bas_ij / nbas; + int jsh = bas_ij % nbas; + double *expi = env + 
bas[ish*BAS_SLOTS+PTR_EXP]; + double *expj = env + bas[jsh*BAS_SLOTS+PTR_EXP]; + double *expk = env + bas[ksh*BAS_SLOTS+PTR_EXP]; + double *ci = env + bas[ish*BAS_SLOTS+PTR_COEFF]; + double *cj = env + bas[jsh*BAS_SLOTS+PTR_COEFF]; + double *ck = env + bas[ksh*BAS_SLOTS+PTR_COEFF]; + double *ri = env + bas[ish*BAS_SLOTS+PTR_BAS_COORD]; + double *rj = env + bas[jsh*BAS_SLOTS+PTR_BAS_COORD]; + double *rk = env + bas[ksh*BAS_SLOTS+PTR_BAS_COORD]; + double xi = ri[0]; + double yi = ri[1]; + double zi = ri[2]; + double xk = rk[0]; + double yk = rk[1]; + double zk = rk[2]; + double xjxi = rj[0] - xi; + double yjyi = rj[1] - yi; + double zjzi = rj[2] - zi; + double rr_ij = xjxi*xjxi + yjyi*yjyi + zjzi*zjzi; + double gout0 = 0; + double gout1 = 0; + double gout2 = 0; + double gout3 = 0; + double gout4 = 0; + double gout5 = 0; + double gout6 = 0; + double gout7 = 0; + double gout8 = 0; + double gout9 = 0; + double gout10 = 0; + double gout11 = 0; + double gout12 = 0; + double gout13 = 0; + double gout14 = 0; + double gout15 = 0; + double gout16 = 0; + double gout17 = 0; + double gout18 = 0; + double gout19 = 0; + double gout20 = 0; + double gout21 = 0; + double gout22 = 0; + double gout23 = 0; + double gout24 = 0; + double gout25 = 0; + double gout26 = 0; + double gout27 = 0; + double gout28 = 0; + double gout29 = 0; + double gout30 = 0; + double gout31 = 0; + double gout32 = 0; + double gout33 = 0; + double gout34 = 0; + double gout35 = 0; + double gout36 = 0; + double gout37 = 0; + double gout38 = 0; + double gout39 = 0; + double gout40 = 0; + double gout41 = 0; + double gout42 = 0; + double gout43 = 0; + double gout44 = 0; + double gout45 = 0; + double gout46 = 0; + double gout47 = 0; + double gout48 = 0; + double gout49 = 0; + double gout50 = 0; + double gout51 = 0; + double gout52 = 0; + double gout53 = 0; + for (int ijkp = 0; ijkp < ijkprim; ++ijkp) { + int ijp = ijkp / kprim; + int kp = ijkp % kprim; + int ip = ijp / jprim; + int jp = ijp % jprim; + double ai = 
expi[ip]; + double aj = expj[jp]; + double ak = expk[kp]; + double aij = ai + aj; + double cijk = ci[ip] * cj[jp] * ck[kp]; + double fac = PI_FAC * cijk / (aij*ak*sqrt(aij+ak)); + double aj_aij = aj / aij; + double theta_ij = ai * aj_aij; + double Kab = theta_ij * rr_ij; + double fac1 = fac * exp(-Kab); + double xij = xjxi * aj_aij + xi; + double yij = yjyi * aj_aij + yi; + double zij = zjzi * aj_aij + zi; + double xpq = xij - xk; + double ypq = yij - yk; + double zpq = zij - zk; + double rr = xpq * xpq + ypq * ypq + zpq * zpq; + double theta = aij * ak / (aij + ak); + double theta_rr = theta * rr; + if (omega == 0) { + rys_roots(3, theta_rr, rw, nst_per_block, 0, 1); + } else if (omega > 0) { + double omega2 = omega * omega; + double theta_fac = omega2 / (omega2 + theta); + rys_roots(3, theta_fac*theta_rr, rw, nst_per_block, 0, 1); + double sqrt_theta_fac = sqrt(theta_fac); + for (int irys = 0; irys < 3; ++irys) { + rw[ irys*2 *nst_per_block] *= theta_fac; + rw[(irys*2+1)*nst_per_block] *= sqrt_theta_fac; + } + } else { + double omega2 = omega * omega; + double theta_fac = omega2 / (omega2 + theta); + double *rw1 = rw + 6*nst_per_block; + rys_roots(3, theta_rr, rw1, nst_per_block, 0, 1); + rys_roots(3, theta_fac*theta_rr, rw, nst_per_block, 0, 1); + double sqrt_theta_fac = -sqrt(theta_fac); + for (int irys = 0; irys < 3; ++irys) { + rw[ irys*2 *nst_per_block] *= theta_fac; + rw[(irys*2+1)*nst_per_block] *= sqrt_theta_fac; + } + } + for (int irys = 0; irys < bounds.nroots; ++irys) { + double wt = rw[(2*irys+1)*nst_per_block]; + double rt = rw[ 2*irys *nst_per_block]; + double rt_aa = rt / (aij + ak); + double b00 = .5 * rt_aa; + double rt_ak = rt_aa * aij; + double b01 = .5/ak * (1 - rt_ak); + double cpx = xpq*rt_ak; + double rt_aij = rt_aa * ak; + double b10 = .5/aij * (1 - rt_aij); + double c0x = xjxi * aj_aij - xpq*rt_aij; + double trr_10x = c0x * 1; + double trr_20x = c0x * trr_10x + 1*b10 * 1; + double trr_21x = cpx * trr_20x + 2*b00 * trr_10x; + double 
trr_11x = cpx * trr_10x + 1*b00 * 1; + double trr_22x = cpx * trr_21x + 1*b01 * trr_20x + 2*b00 * trr_11x; + double trr_01x = cpx * 1; + double trr_12x = cpx * trr_11x + 1*b01 * trr_10x + 1*b00 * trr_01x; + double hrr_112x = trr_22x - xjxi * trr_12x; + gout0 += hrr_112x * fac1 * wt; + double trr_02x = cpx * trr_01x + 1*b01 * 1; + double hrr_012x = trr_12x - xjxi * trr_02x; + double c0y = yjyi * aj_aij - ypq*rt_aij; + double trr_10y = c0y * fac1; + gout1 += hrr_012x * trr_10y * wt; + double c0z = zjzi * aj_aij - zpq*rt_aij; + double trr_10z = c0z * wt; + gout2 += hrr_012x * fac1 * trr_10z; + double hrr_010y = trr_10y - yjyi * fac1; + gout3 += trr_12x * hrr_010y * wt; + double trr_20y = c0y * trr_10y + 1*b10 * fac1; + double hrr_110y = trr_20y - yjyi * trr_10y; + gout4 += trr_02x * hrr_110y * wt; + gout5 += trr_02x * hrr_010y * trr_10z; + double hrr_010z = trr_10z - zjzi * wt; + gout6 += trr_12x * fac1 * hrr_010z; + gout7 += trr_02x * trr_10y * hrr_010z; + double trr_20z = c0z * trr_10z + 1*b10 * wt; + double hrr_110z = trr_20z - zjzi * trr_10z; + gout8 += trr_02x * fac1 * hrr_110z; + double hrr_111x = trr_21x - xjxi * trr_11x; + double cpy = ypq*rt_ak; + double trr_01y = cpy * fac1; + gout9 += hrr_111x * trr_01y * wt; + double hrr_011x = trr_11x - xjxi * trr_01x; + double trr_11y = cpy * trr_10y + 1*b00 * fac1; + gout10 += hrr_011x * trr_11y * wt; + gout11 += hrr_011x * trr_01y * trr_10z; + double hrr_011y = trr_11y - yjyi * trr_01y; + gout12 += trr_11x * hrr_011y * wt; + double trr_21y = cpy * trr_20y + 2*b00 * trr_10y; + double hrr_111y = trr_21y - yjyi * trr_11y; + gout13 += trr_01x * hrr_111y * wt; + gout14 += trr_01x * hrr_011y * trr_10z; + gout15 += trr_11x * trr_01y * hrr_010z; + gout16 += trr_01x * trr_11y * hrr_010z; + gout17 += trr_01x * trr_01y * hrr_110z; + double cpz = zpq*rt_ak; + double trr_01z = cpz * wt; + gout18 += hrr_111x * fac1 * trr_01z; + gout19 += hrr_011x * trr_10y * trr_01z; + double trr_11z = cpz * trr_10z + 1*b00 * wt; + gout20 += 
hrr_011x * fac1 * trr_11z; + gout21 += trr_11x * hrr_010y * trr_01z; + gout22 += trr_01x * hrr_110y * trr_01z; + gout23 += trr_01x * hrr_010y * trr_11z; + double hrr_011z = trr_11z - zjzi * trr_01z; + gout24 += trr_11x * fac1 * hrr_011z; + gout25 += trr_01x * trr_10y * hrr_011z; + double trr_21z = cpz * trr_20z + 2*b00 * trr_10z; + double hrr_111z = trr_21z - zjzi * trr_11z; + gout26 += trr_01x * fac1 * hrr_111z; + double hrr_110x = trr_20x - xjxi * trr_10x; + double trr_02y = cpy * trr_01y + 1*b01 * fac1; + gout27 += hrr_110x * trr_02y * wt; + double hrr_010x = trr_10x - xjxi * 1; + double trr_12y = cpy * trr_11y + 1*b01 * trr_10y + 1*b00 * trr_01y; + gout28 += hrr_010x * trr_12y * wt; + gout29 += hrr_010x * trr_02y * trr_10z; + double hrr_012y = trr_12y - yjyi * trr_02y; + gout30 += trr_10x * hrr_012y * wt; + double trr_22y = cpy * trr_21y + 1*b01 * trr_20y + 2*b00 * trr_11y; + double hrr_112y = trr_22y - yjyi * trr_12y; + gout31 += 1 * hrr_112y * wt; + gout32 += 1 * hrr_012y * trr_10z; + gout33 += trr_10x * trr_02y * hrr_010z; + gout34 += 1 * trr_12y * hrr_010z; + gout35 += 1 * trr_02y * hrr_110z; + gout36 += hrr_110x * trr_01y * trr_01z; + gout37 += hrr_010x * trr_11y * trr_01z; + gout38 += hrr_010x * trr_01y * trr_11z; + gout39 += trr_10x * hrr_011y * trr_01z; + gout40 += 1 * hrr_111y * trr_01z; + gout41 += 1 * hrr_011y * trr_11z; + gout42 += trr_10x * trr_01y * hrr_011z; + gout43 += 1 * trr_11y * hrr_011z; + gout44 += 1 * trr_01y * hrr_111z; + double trr_02z = cpz * trr_01z + 1*b01 * wt; + gout45 += hrr_110x * fac1 * trr_02z; + gout46 += hrr_010x * trr_10y * trr_02z; + double trr_12z = cpz * trr_11z + 1*b01 * trr_10z + 1*b00 * trr_01z; + gout47 += hrr_010x * fac1 * trr_12z; + gout48 += trr_10x * hrr_010y * trr_02z; + gout49 += 1 * hrr_110y * trr_02z; + gout50 += 1 * hrr_010y * trr_12z; + double hrr_012z = trr_12z - zjzi * trr_02z; + gout51 += trr_10x * fac1 * hrr_012z; + gout52 += 1 * trr_10y * hrr_012z; + double trr_22z = cpz * trr_21z + 1*b01 * trr_20z + 
2*b00 * trr_11z; + double hrr_112z = trr_22z - zjzi * trr_12z; + gout53 += 1 * fac1 * hrr_112z; + } + } + int naux = bounds.naux; + double *eri_tensor = out + shl_pair_idx * 9 * naux + ksh_in_auxmol * 6; + eri_tensor[0*naux + 0] = gout0; + eri_tensor[0*naux + 1] = gout9; + eri_tensor[0*naux + 2] = gout18; + eri_tensor[0*naux + 3] = gout27; + eri_tensor[0*naux + 4] = gout36; + eri_tensor[0*naux + 5] = gout45; + eri_tensor[1*naux + 0] = gout1; + eri_tensor[1*naux + 1] = gout10; + eri_tensor[1*naux + 2] = gout19; + eri_tensor[1*naux + 3] = gout28; + eri_tensor[1*naux + 4] = gout37; + eri_tensor[1*naux + 5] = gout46; + eri_tensor[2*naux + 0] = gout2; + eri_tensor[2*naux + 1] = gout11; + eri_tensor[2*naux + 2] = gout20; + eri_tensor[2*naux + 3] = gout29; + eri_tensor[2*naux + 4] = gout38; + eri_tensor[2*naux + 5] = gout47; + eri_tensor[3*naux + 0] = gout3; + eri_tensor[3*naux + 1] = gout12; + eri_tensor[3*naux + 2] = gout21; + eri_tensor[3*naux + 3] = gout30; + eri_tensor[3*naux + 4] = gout39; + eri_tensor[3*naux + 5] = gout48; + eri_tensor[4*naux + 0] = gout4; + eri_tensor[4*naux + 1] = gout13; + eri_tensor[4*naux + 2] = gout22; + eri_tensor[4*naux + 3] = gout31; + eri_tensor[4*naux + 4] = gout40; + eri_tensor[4*naux + 5] = gout49; + eri_tensor[5*naux + 0] = gout5; + eri_tensor[5*naux + 1] = gout14; + eri_tensor[5*naux + 2] = gout23; + eri_tensor[5*naux + 3] = gout32; + eri_tensor[5*naux + 4] = gout41; + eri_tensor[5*naux + 5] = gout50; + eri_tensor[6*naux + 0] = gout6; + eri_tensor[6*naux + 1] = gout15; + eri_tensor[6*naux + 2] = gout24; + eri_tensor[6*naux + 3] = gout33; + eri_tensor[6*naux + 4] = gout42; + eri_tensor[6*naux + 5] = gout51; + eri_tensor[7*naux + 0] = gout7; + eri_tensor[7*naux + 1] = gout16; + eri_tensor[7*naux + 2] = gout25; + eri_tensor[7*naux + 3] = gout34; + eri_tensor[7*naux + 4] = gout43; + eri_tensor[7*naux + 5] = gout52; + eri_tensor[8*naux + 0] = gout8; + eri_tensor[8*naux + 1] = gout17; + eri_tensor[8*naux + 2] = gout26; + eri_tensor[8*naux 
/*
 * Unrolled 3-center integral kernel selected by int3c2e_unrolled for
 * kij == 60, i.e. (li, lj, lk) = (2, 0, 2).
 *
 * Each thread owns one (shell pair, auxiliary shell) task at a time and
 * accumulates 36 values (gout0..gout35) over all primitive triples and all
 * quadrature roots, then writes them to `out` as a 6 x 6 tile.
 *
 * out    : destination buffer; the tile for a task starts at
 *          out + shl_pair_idx*6*naux + ksh_in_auxmol*6 and is addressed as
 *          eri_tensor[row*naux + col].
 * envs   : provides bas / env / nbas (basis tables in the bas/env layout
 *          indexed with BAS_SLOTS, PTR_EXP, PTR_COEFF, PTR_BAS_COORD).
 * bounds : provides iprim/jprim/kprim, nroots, nksh, ksh0, nshl_pair,
 *          bas_ij_idx and naux.
 *
 * Requires dynamic shared memory of at least bounds.nroots*2*blockDim.x
 * doubles (roots and weights, interleaved across threads).
 */
__global__
void int3c2e_202(double *out, Int3c2eEnvVars envs, Int3c2eBounds bounds)
{
    int nst_per_block = blockDim.x;
    int st_id = threadIdx.x;
    int batch_id = blockIdx.x;
    int iprim = bounds.iprim;
    int jprim = bounds.jprim;
    int kprim = bounds.kprim;
    int ijprim = iprim * jprim;
    int ijkprim = ijprim * kprim;
    int *bas = envs.bas;
    int nbas = envs.nbas;
    double *env = envs.env;
    // presumably the range-separation parameter; only its sign and value
    // are used below -- confirm the convention with the caller
    double omega = env[PTR_RANGE_OMEGA];
    extern __shared__ double rw_cache[];
    // Per-thread view of the shared buffer: slot s of this thread lives at
    // rw[s*nst_per_block]. Even slots hold roots, odd slots hold weights
    // (see the reads of rt / wt in the root loop).
    double *rw = rw_cache + st_id;

    int nksh = bounds.nksh;
    int nshl_pair = bounds.nshl_pair;
    int nst = nksh * nshl_pair;
    // This block covers tasks [st0, st1); each thread walks them with a
    // stride of nst_per_block.
    int st0 = batch_id * nst_per_block * BATCHES_PER_BLOCK;
    int st1 = MIN(nst, st0 + nst_per_block * BATCHES_PER_BLOCK);
    for (int ijk_idx = st0 + st_id; ijk_idx < st1; ijk_idx += nst_per_block) {
        // Task index decomposition: the auxiliary shell is the fast index.
        int ksh_in_auxmol = ijk_idx % nksh;
        int ksh = ksh_in_auxmol + bounds.ksh0;
        int shl_pair_idx = ijk_idx / nksh;
        // bas_ij packs the two shell ids as ish*nbas + jsh.
        int bas_ij = bounds.bas_ij_idx[shl_pair_idx];
        int ish = bas_ij / nbas;
        int jsh = bas_ij % nbas;
        double *expi = env + bas[ish*BAS_SLOTS+PTR_EXP];
        double *expj = env + bas[jsh*BAS_SLOTS+PTR_EXP];
        double *expk = env + bas[ksh*BAS_SLOTS+PTR_EXP];
        double *ci = env + bas[ish*BAS_SLOTS+PTR_COEFF];
        double *cj = env + bas[jsh*BAS_SLOTS+PTR_COEFF];
        double *ck = env + bas[ksh*BAS_SLOTS+PTR_COEFF];
        double *ri = env + bas[ish*BAS_SLOTS+PTR_BAS_COORD];
        double *rj = env + bas[jsh*BAS_SLOTS+PTR_BAS_COORD];
        double *rk = env + bas[ksh*BAS_SLOTS+PTR_BAS_COORD];
        double xi = ri[0];
        double yi = ri[1];
        double zi = ri[2];
        double xk = rk[0];
        double yk = rk[1];
        double zk = rk[2];
        double xjxi = rj[0] - xi;
        double yjyi = rj[1] - yi;
        double zjzi = rj[2] - zi;
        double rr_ij = xjxi*xjxi + yjyi*yjyi + zjzi*zjzi;
        // 36 accumulators, one per output element; gout[c*6 + r] ends up in
        // eri_tensor[r*naux + c] (see the stores after the primitive loop).
        double gout0 = 0;
        double gout1 = 0;
        double gout2 = 0;
        double gout3 = 0;
        double gout4 = 0;
        double gout5 = 0;
        double gout6 = 0;
        double gout7 = 0;
        double gout8 = 0;
        double gout9 = 0;
        double gout10 = 0;
        double gout11 = 0;
        double gout12 = 0;
        double gout13 = 0;
        double gout14 = 0;
        double gout15 = 0;
        double gout16 = 0;
        double gout17 = 0;
        double gout18 = 0;
        double gout19 = 0;
        double gout20 = 0;
        double gout21 = 0;
        double gout22 = 0;
        double gout23 = 0;
        double gout24 = 0;
        double gout25 = 0;
        double gout26 = 0;
        double gout27 = 0;
        double gout28 = 0;
        double gout29 = 0;
        double gout30 = 0;
        double gout31 = 0;
        double gout32 = 0;
        double gout33 = 0;
        double gout34 = 0;
        double gout35 = 0;
        // Flattened loop over primitive triples; kp is the fastest index,
        // then jp, then ip.
        for (int ijkp = 0; ijkp < ijkprim; ++ijkp) {
            int ijp = ijkp / kprim;
            int kp = ijkp % kprim;
            int ip = ijp / jprim;
            int jp = ijp % jprim;
            double ai = expi[ip];
            double aj = expj[jp];
            double ak = expk[kp];
            double aij = ai + aj;
            double cijk = ci[ip] * cj[jp] * ck[kp];
            double fac = PI_FAC * cijk / (aij*ak*sqrt(aij+ak));
            double aj_aij = aj / aij;
            double theta_ij = ai * aj_aij;
            double Kab = theta_ij * rr_ij;
            // Prefactor including the Gaussian-product decay exp(-ai*aj/aij * |rj-ri|^2).
            double fac1 = fac * exp(-Kab);
            // (xij, yij, zij): exponent-weighted center of the i/j pair.
            double xij = xjxi * aj_aij + xi;
            double yij = yjyi * aj_aij + yi;
            double zij = zjzi * aj_aij + zi;
            double xpq = xij - xk;
            double ypq = yij - yk;
            double zpq = zij - zk;
            double rr = xpq * xpq + ypq * ypq + zpq * zpq;
            double theta = aij * ak / (aij + ak);
            double theta_rr = theta * rr;
            // Fill rw with 3 (root, weight) pairs. The trailing arguments
            // of rys_roots are presumably (stride, thread id, thread count)
            // -- confirm against rys_roots.cu.
            if (omega == 0) {
                rys_roots(3, theta_rr, rw, nst_per_block, 0, 1);
            } else if (omega > 0) {
                // Roots are evaluated at the scaled argument, then roots are
                // multiplied by theta_fac and weights by sqrt(theta_fac).
                double omega2 = omega * omega;
                double theta_fac = omega2 / (omega2 + theta);
                rys_roots(3, theta_fac*theta_rr, rw, nst_per_block, 0, 1);
                double sqrt_theta_fac = sqrt(theta_fac);
                for (int irys = 0; irys < 3; ++irys) {
                    rw[ irys*2 *nst_per_block] *= theta_fac;
                    rw[(irys*2+1)*nst_per_block] *= sqrt_theta_fac;
                }
            } else {
                // omega < 0: two root sets are stored back to back. Slots
                // 6..11 (rw1) hold the unscaled set; slots 0..5 hold the
                // scaled set with NEGATED weights, so summing over both sets
                // subtracts the scaled contribution from the unscaled one.
                // Presumably bounds.nroots is 6 in this case so that the
                // loop below consumes both sets -- confirm with the caller.
                double omega2 = omega * omega;
                double theta_fac = omega2 / (omega2 + theta);
                double *rw1 = rw + 6*nst_per_block;
                rys_roots(3, theta_rr, rw1, nst_per_block, 0, 1);
                rys_roots(3, theta_fac*theta_rr, rw, nst_per_block, 0, 1);
                double sqrt_theta_fac = -sqrt(theta_fac);
                for (int irys = 0; irys < 3; ++irys) {
                    rw[ irys*2 *nst_per_block] *= theta_fac;
                    rw[(irys*2+1)*nst_per_block] *= sqrt_theta_fac;
                }
            }
            for (int irys = 0; irys < bounds.nroots; ++irys) {
                double wt = rw[(2*irys+1)*nst_per_block];
                double rt = rw[ 2*irys *nst_per_block];
                double rt_aa = rt / (aij + ak);
                double b00 = .5 * rt_aa;
                double rt_ak = rt_aa * aij;
                double b01 = .5/ak * (1 - rt_ak);
                double cpx = xpq*rt_ak;
                double rt_aij = rt_aa * ak;
                double b10 = .5/aij * (1 - rt_aij);
                double c0x = xjxi * aj_aij - xpq*rt_aij;
                // Naming: trr_abX is the 1-D term along axis X; `a` is raised
                // with c0/b10 and `b` with cp/b01 (b00 couples the two).
                // The order-0 terms are 1 for x, fac1 for y and wt for z, so
                // every x*y*z product below carries fac1*wt exactly once.
                // Where a factor is an order-0 term it appears literally as
                // 1, fac1 or wt.
                double trr_10x = c0x * 1;
                double trr_20x = c0x * trr_10x + 1*b10 * 1;
                double trr_21x = cpx * trr_20x + 2*b00 * trr_10x;
                double trr_11x = cpx * trr_10x + 1*b00 * 1;
                double trr_22x = cpx * trr_21x + 1*b01 * trr_20x + 2*b00 * trr_11x;
                gout0 += trr_22x * fac1 * wt;
                double trr_01x = cpx * 1;
                double trr_12x = cpx * trr_11x + 1*b01 * trr_10x + 1*b00 * trr_01x;
                double c0y = yjyi * aj_aij - ypq*rt_aij;
                double trr_10y = c0y * fac1;
                gout1 += trr_12x * trr_10y * wt;
                double c0z = zjzi * aj_aij - zpq*rt_aij;
                double trr_10z = c0z * wt;
                gout2 += trr_12x * fac1 * trr_10z;
                double trr_02x = cpx * trr_01x + 1*b01 * 1;
                double trr_20y = c0y * trr_10y + 1*b10 * fac1;
                gout3 += trr_02x * trr_20y * wt;
                gout4 += trr_02x * trr_10y * trr_10z;
                double trr_20z = c0z * trr_10z + 1*b10 * wt;
                gout5 += trr_02x * fac1 * trr_20z;
                double cpy = ypq*rt_ak;
                double trr_01y = cpy * fac1;
                gout6 += trr_21x * trr_01y * wt;
                double trr_11y = cpy * trr_10y + 1*b00 * fac1;
                gout7 += trr_11x * trr_11y * wt;
                gout8 += trr_11x * trr_01y * trr_10z;
                double trr_21y = cpy * trr_20y + 2*b00 * trr_10y;
                gout9 += trr_01x * trr_21y * wt;
                gout10 += trr_01x * trr_11y * trr_10z;
                gout11 += trr_01x * trr_01y * trr_20z;
                double cpz = zpq*rt_ak;
                double trr_01z = cpz * wt;
                gout12 += trr_21x * fac1 * trr_01z;
                gout13 += trr_11x * trr_10y * trr_01z;
                double trr_11z = cpz * trr_10z + 1*b00 * wt;
                gout14 += trr_11x * fac1 * trr_11z;
                gout15 += trr_01x * trr_20y * trr_01z;
                gout16 += trr_01x * trr_10y * trr_11z;
                double trr_21z = cpz * trr_20z + 2*b00 * trr_10z;
                gout17 += trr_01x * fac1 * trr_21z;
                double trr_02y = cpy * trr_01y + 1*b01 * fac1;
                gout18 += trr_20x * trr_02y * wt;
                double trr_12y = cpy * trr_11y + 1*b01 * trr_10y + 1*b00 * trr_01y;
                gout19 += trr_10x * trr_12y * wt;
                gout20 += trr_10x * trr_02y * trr_10z;
                double trr_22y = cpy * trr_21y + 1*b01 * trr_20y + 2*b00 * trr_11y;
                gout21 += 1 * trr_22y * wt;
                gout22 += 1 * trr_12y * trr_10z;
                gout23 += 1 * trr_02y * trr_20z;
                gout24 += trr_20x * trr_01y * trr_01z;
                gout25 += trr_10x * trr_11y * trr_01z;
                gout26 += trr_10x * trr_01y * trr_11z;
                gout27 += 1 * trr_21y * trr_01z;
                gout28 += 1 * trr_11y * trr_11z;
                gout29 += 1 * trr_01y * trr_21z;
                double trr_02z = cpz * trr_01z + 1*b01 * wt;
                gout30 += trr_20x * fac1 * trr_02z;
                gout31 += trr_10x * trr_10y * trr_02z;
                double trr_12z = cpz * trr_11z + 1*b01 * trr_10z + 1*b00 * trr_01z;
                gout32 += trr_10x * fac1 * trr_12z;
                gout33 += 1 * trr_20y * trr_02z;
                gout34 += 1 * trr_10y * trr_12z;
                double trr_22z = cpz * trr_21z + 1*b01 * trr_20z + 2*b00 * trr_11z;
                gout35 += 1 * fac1 * trr_22z;
            }
        }
        // Scatter the 36 accumulators into a 6 x 6 tile: rows are spaced by
        // naux, columns are the 6 consecutive entries of this auxiliary
        // shell. Accumulators were generated column-major (gout[c*6 + r]).
        int naux = bounds.naux;
        double *eri_tensor = out + shl_pair_idx * 6 * naux + ksh_in_auxmol * 6;
        eri_tensor[0*naux + 0] = gout0;
        eri_tensor[0*naux + 1] = gout6;
        eri_tensor[0*naux + 2] = gout12;
        eri_tensor[0*naux + 3] = gout18;
        eri_tensor[0*naux + 4] = gout24;
        eri_tensor[0*naux + 5] = gout30;
        eri_tensor[1*naux + 0] = gout1;
        eri_tensor[1*naux + 1] = gout7;
        eri_tensor[1*naux + 2] = gout13;
        eri_tensor[1*naux + 3] = gout19;
        eri_tensor[1*naux + 4] = gout25;
        eri_tensor[1*naux + 5] = gout31;
        eri_tensor[2*naux + 0] = gout2;
        eri_tensor[2*naux + 1] = gout8;
        eri_tensor[2*naux + 2] = gout14;
        eri_tensor[2*naux + 3] = gout20;
        eri_tensor[2*naux + 4] = gout26;
        eri_tensor[2*naux + 5] = gout32;
        eri_tensor[3*naux + 0] = gout3;
        eri_tensor[3*naux + 1] = gout9;
        eri_tensor[3*naux + 2] = gout15;
        eri_tensor[3*naux + 3] = gout21;
        eri_tensor[3*naux + 4] = gout27;
        eri_tensor[3*naux + 5] = gout33;
        eri_tensor[4*naux + 0] = gout4;
        eri_tensor[4*naux + 1] = gout10;
        eri_tensor[4*naux + 2] = gout16;
        eri_tensor[4*naux + 3] = gout22;
        eri_tensor[4*naux + 4] = gout28;
        eri_tensor[4*naux + 5] = gout34;
        eri_tensor[5*naux + 0] = gout5;
        eri_tensor[5*naux + 1] = gout11;
        eri_tensor[5*naux + 2] = gout17;
        eri_tensor[5*naux + 3] = gout23;
        eri_tensor[5*naux + 4] = gout29;
        eri_tensor[5*naux + 5] = gout35;
    }
}
bas[ish*BAS_SLOTS+PTR_EXP]; + double *expj = env + bas[jsh*BAS_SLOTS+PTR_EXP]; + double *expk = env + bas[ksh*BAS_SLOTS+PTR_EXP]; + double *ci = env + bas[ish*BAS_SLOTS+PTR_COEFF]; + double *cj = env + bas[jsh*BAS_SLOTS+PTR_COEFF]; + double *ck = env + bas[ksh*BAS_SLOTS+PTR_COEFF]; + double *ri = env + bas[ish*BAS_SLOTS+PTR_BAS_COORD]; + double *rj = env + bas[jsh*BAS_SLOTS+PTR_BAS_COORD]; + double *rk = env + bas[ksh*BAS_SLOTS+PTR_BAS_COORD]; + if (gout_id == 0) { + double xjxi = rj[0] - ri[0]; + double yjyi = rj[1] - ri[1]; + double zjzi = rj[2] - ri[2]; + double rr_ij = xjxi*xjxi + yjyi*yjyi + zjzi*zjzi; + rjri[0] = xjxi; + rjri[64] = yjyi; + rjri[128] = zjzi; + rjri[192] = rr_ij; + } + double gout0 = 0; + double gout1 = 0; + double gout2 = 0; + double gout3 = 0; + double gout4 = 0; + double gout5 = 0; + double gout6 = 0; + double gout7 = 0; + double gout8 = 0; + double gout9 = 0; + double gout10 = 0; + double gout11 = 0; + double gout12 = 0; + double gout13 = 0; + double gout14 = 0; + double gout15 = 0; + double gout16 = 0; + double gout17 = 0; + double gout18 = 0; + double gout19 = 0; + double gout20 = 0; + double gout21 = 0; + double gout22 = 0; + double gout23 = 0; + double gout24 = 0; + double gout25 = 0; + double gout26 = 0; + double s0, s1, s2; + for (int ijkp = 0; ijkp < ijkprim; ++ijkp) { + int ijp = ijkp / kprim; + int kp = ijkp % kprim; + int ip = ijp / jprim; + int jp = ijp % jprim; + double ai = expi[ip]; + double aj = expj[jp]; + double ak = expk[kp]; + double aij = ai + aj; + double aj_aij = aj / aij; + __syncthreads(); + double xij = rjri[0] * aj_aij + ri[0]; + double yij = rjri[64] * aj_aij + ri[1]; + double zij = rjri[128] * aj_aij + ri[2]; + double xpq = xij - rk[0]; + double ypq = yij - rk[1]; + double zpq = zij - rk[2]; + if (gout_id == 0) { + double cijk = ci[ip] * cj[jp] * ck[kp]; + double fac = PI_FAC * cijk / (aij*ak*sqrt(aij+ak)); + double theta_ij = ai * aj_aij; + double Kab = theta_ij * rjri[192]; + gy[0] = fac * exp(-Kab); + Rpq[0] = 
xpq; + Rpq[64] = ypq; + Rpq[128] = zpq; + } + double rr = xpq * xpq + ypq * ypq + zpq * zpq; + double theta = aij * ak / (aij + ak); + double theta_rr = theta * rr; + if (omega == 0) { + rys_roots(3, theta_rr, rw, 64, gout_id, 4); + } else if (omega > 0) { + double omega2 = omega * omega; + double theta_fac = omega2 / (omega2 + theta); + rys_roots(3, theta_fac*theta_rr, rw, 64, gout_id, 4); + __syncthreads(); + double sqrt_theta_fac = sqrt(theta_fac); + for (int irys = gout_id; irys < 3; irys+=4) { + rw[ irys*2 *64] *= theta_fac; + rw[(irys*2+1)*64] *= sqrt_theta_fac; + } + } else { + double omega2 = omega * omega; + double theta_fac = omega2 / (omega2 + theta); + double *rw1 = rw + 384; + rys_roots(3, theta_rr, rw1, 64, gout_id, 4); + rys_roots(3, theta_fac*theta_rr, rw, 64, gout_id, 4); + __syncthreads(); + double sqrt_theta_fac = -sqrt(theta_fac); + for (int irys = gout_id; irys < 3; irys+=4) { + rw[ irys*2 *64] *= theta_fac; + rw[(irys*2+1)*64] *= sqrt_theta_fac; + } + } + for (int irys = 0; irys < nroots; ++irys) { + __syncthreads(); + double rt = rw[irys*128]; + double rt_aa = rt / (aij + ak); + double rt_aij = rt_aa * ak; + double b10 = .5/aij * (1 - rt_aij); + double rt_ak = rt_aa * aij; + double b00 = .5 * rt_aa; + double b01 = .5/ak * (1 - rt_ak); + for (int n = gout_id; n < 3; n += 4) { + if (n == 2) { + gz[0] = rw[irys*128+64]; + } + double *_gx = gx + n * 1152; + double xjxi = rjri[n * 64]; + double Rpa = xjxi * aj_aij; + double c0x = Rpa - rt_aij * Rpq[n * 64]; + s0 = _gx[0]; + s1 = c0x * s0; + _gx[64] = s1; + s2 = c0x * s1 + 1 * b10 * s0; + _gx[128] = s2; + s0 = s1; + s1 = s2; + s2 = c0x * s1 + 2 * b10 * s0; + _gx[192] = s2; + double cpx = rt_ak * Rpq[n * 64]; + s0 = _gx[0]; + s1 = cpx * s0; + _gx[384] = s1; + s2 = cpx*s1 + 1 * b01 *s0; + _gx[768] = s2; + s0 = _gx[64]; + s1 = cpx * s0; + s1 += 1 * b00 * _gx[0]; + _gx[448] = s1; + s2 = cpx*s1 + 1 * b01 *s0; + s2 += 1 * b00 * _gx[384]; + _gx[832] = s2; + s0 = _gx[128]; + s1 = cpx * s0; + s1 += 2 * b00 
* _gx[64]; + _gx[512] = s1; + s2 = cpx*s1 + 1 * b01 *s0; + s2 += 2 * b00 * _gx[448]; + _gx[896] = s2; + s0 = _gx[192]; + s1 = cpx * s0; + s1 += 3 * b00 * _gx[128]; + _gx[576] = s1; + s2 = cpx*s1 + 1 * b01 *s0; + s2 += 3 * b00 * _gx[512]; + _gx[960] = s2; + s1 = _gx[192]; + s0 = _gx[128]; + _gx[320] = s1 - xjxi * s0; + s1 = s0; + s0 = _gx[64]; + _gx[256] = s1 - xjxi * s0; + s1 = s0; + s0 = _gx[0]; + _gx[192] = s1 - xjxi * s0; + s1 = _gx[576]; + s0 = _gx[512]; + _gx[704] = s1 - xjxi * s0; + s1 = s0; + s0 = _gx[448]; + _gx[640] = s1 - xjxi * s0; + s1 = s0; + s0 = _gx[384]; + _gx[576] = s1 - xjxi * s0; + s1 = _gx[960]; + s0 = _gx[896]; + _gx[1088] = s1 - xjxi * s0; + s1 = s0; + s0 = _gx[832]; + _gx[1024] = s1 - xjxi * s0; + s1 = s0; + s0 = _gx[768]; + _gx[960] = s1 - xjxi * s0; + } + __syncthreads(); + switch (gout_id) { + case 0: + gout0 += gx[1088] * gy[0] * gz[0]; + gout1 += gx[320] * gy[384] * gz[384]; + gout2 += gx[640] * gy[64] * gz[384]; + gout3 += gx[1024] * gy[0] * gz[64]; + gout4 += gx[256] * gy[384] * gz[448]; + gout5 += gx[576] * gy[128] * gz[384]; + gout6 += gx[960] * gy[64] * gz[64]; + gout7 += gx[192] * gy[448] * gz[448]; + gout8 += gx[576] * gy[0] * gz[512]; + gout9 += gx[896] * gy[192] * gz[0]; + gout10 += gx[128] * gy[576] * gz[384]; + gout11 += gx[448] * gy[256] * gz[384]; + gout12 += gx[832] * gy[192] * gz[64]; + gout13 += gx[64] * gy[576] * gz[448]; + gout14 += gx[384] * gy[320] * gz[384]; + gout15 += gx[768] * gy[256] * gz[64]; + gout16 += gx[0] * gy[640] * gz[448]; + gout17 += gx[384] * gy[192] * gz[512]; + gout18 += gx[896] * gy[0] * gz[192]; + gout19 += gx[128] * gy[384] * gz[576]; + gout20 += gx[448] * gy[64] * gz[576]; + gout21 += gx[832] * gy[0] * gz[256]; + gout22 += gx[64] * gy[384] * gz[640]; + gout23 += gx[384] * gy[128] * gz[576]; + gout24 += gx[768] * gy[64] * gz[256]; + gout25 += gx[0] * gy[448] * gz[640]; + gout26 += gx[384] * gy[0] * gz[704]; + break; + case 1: + gout0 += gx[704] * gy[384] * gz[0]; + gout1 += gx[320] * gy[0] * 
gz[768]; + gout2 += gx[256] * gy[832] * gz[0]; + gout3 += gx[640] * gy[384] * gz[64]; + gout4 += gx[256] * gy[0] * gz[832]; + gout5 += gx[192] * gy[896] * gz[0]; + gout6 += gx[576] * gy[448] * gz[64]; + gout7 += gx[192] * gy[64] * gz[832]; + gout8 += gx[192] * gy[768] * gz[128]; + gout9 += gx[512] * gy[576] * gz[0]; + gout10 += gx[128] * gy[192] * gz[768]; + gout11 += gx[64] * gy[1024] * gz[0]; + gout12 += gx[448] * gy[576] * gz[64]; + gout13 += gx[64] * gy[192] * gz[832]; + gout14 += gx[0] * gy[1088] * gz[0]; + gout15 += gx[384] * gy[640] * gz[64]; + gout16 += gx[0] * gy[256] * gz[832]; + gout17 += gx[0] * gy[960] * gz[128]; + gout18 += gx[512] * gy[384] * gz[192]; + gout19 += gx[128] * gy[0] * gz[960]; + gout20 += gx[64] * gy[832] * gz[192]; + gout21 += gx[448] * gy[384] * gz[256]; + gout22 += gx[64] * gy[0] * gz[1024]; + gout23 += gx[0] * gy[896] * gz[192]; + gout24 += gx[384] * gy[448] * gz[256]; + gout25 += gx[0] * gy[64] * gz[1024]; + gout26 += gx[0] * gy[768] * gz[320]; + break; + case 2: + gout0 += gx[704] * gy[0] * gz[384]; + gout1 += gx[1024] * gy[64] * gz[0]; + gout2 += gx[256] * gy[448] * gz[384]; + gout3 += gx[640] * gy[0] * gz[448]; + gout4 += gx[960] * gy[128] * gz[0]; + gout5 += gx[192] * gy[512] * gz[384]; + gout6 += gx[576] * gy[64] * gz[448]; + gout7 += gx[960] * gy[0] * gz[128]; + gout8 += gx[192] * gy[384] * gz[512]; + gout9 += gx[512] * gy[192] * gz[384]; + gout10 += gx[832] * gy[256] * gz[0]; + gout11 += gx[64] * gy[640] * gz[384]; + gout12 += gx[448] * gy[192] * gz[448]; + gout13 += gx[768] * gy[320] * gz[0]; + gout14 += gx[0] * gy[704] * gz[384]; + gout15 += gx[384] * gy[256] * gz[448]; + gout16 += gx[768] * gy[192] * gz[128]; + gout17 += gx[0] * gy[576] * gz[512]; + gout18 += gx[512] * gy[0] * gz[576]; + gout19 += gx[832] * gy[64] * gz[192]; + gout20 += gx[64] * gy[448] * gz[576]; + gout21 += gx[448] * gy[0] * gz[640]; + gout22 += gx[768] * gy[128] * gz[192]; + gout23 += gx[0] * gy[512] * gz[576]; + gout24 += gx[384] * gy[64] * gz[640]; + 
gout25 += gx[768] * gy[0] * gz[320]; + gout26 += gx[0] * gy[384] * gz[704]; + break; + case 3: + gout0 += gx[320] * gy[768] * gz[0]; + gout1 += gx[640] * gy[448] * gz[0]; + gout2 += gx[256] * gy[64] * gz[768]; + gout3 += gx[256] * gy[768] * gz[64]; + gout4 += gx[576] * gy[512] * gz[0]; + gout5 += gx[192] * gy[128] * gz[768]; + gout6 += gx[192] * gy[832] * gz[64]; + gout7 += gx[576] * gy[384] * gz[128]; + gout8 += gx[192] * gy[0] * gz[896]; + gout9 += gx[128] * gy[960] * gz[0]; + gout10 += gx[448] * gy[640] * gz[0]; + gout11 += gx[64] * gy[256] * gz[768]; + gout12 += gx[64] * gy[960] * gz[64]; + gout13 += gx[384] * gy[704] * gz[0]; + gout14 += gx[0] * gy[320] * gz[768]; + gout15 += gx[0] * gy[1024] * gz[64]; + gout16 += gx[384] * gy[576] * gz[128]; + gout17 += gx[0] * gy[192] * gz[896]; + gout18 += gx[128] * gy[768] * gz[192]; + gout19 += gx[448] * gy[448] * gz[192]; + gout20 += gx[64] * gy[64] * gz[960]; + gout21 += gx[64] * gy[768] * gz[256]; + gout22 += gx[384] * gy[512] * gz[192]; + gout23 += gx[0] * gy[128] * gz[960]; + gout24 += gx[0] * gy[832] * gz[256]; + gout25 += gx[384] * gy[384] * gz[320]; + gout26 += gx[0] * gy[0] * gz[1088]; + break; + } + } + } + if (ijk_idx < nst) { + int naux = bounds.naux; + double *eri_tensor = out + shl_pair_idx * 18 * naux + ksh_in_auxmol * 6; + switch (gout_id) { + case 0: + eri_tensor[0*naux + 0] = gout0; + eri_tensor[0*naux + 4] = gout1; + eri_tensor[1*naux + 2] = gout2; + eri_tensor[2*naux + 0] = gout3; + eri_tensor[2*naux + 4] = gout4; + eri_tensor[3*naux + 2] = gout5; + eri_tensor[4*naux + 0] = gout6; + eri_tensor[4*naux + 4] = gout7; + eri_tensor[5*naux + 2] = gout8; + eri_tensor[6*naux + 0] = gout9; + eri_tensor[6*naux + 4] = gout10; + eri_tensor[7*naux + 2] = gout11; + eri_tensor[8*naux + 0] = gout12; + eri_tensor[8*naux + 4] = gout13; + eri_tensor[9*naux + 2] = gout14; + eri_tensor[10*naux + 0] = gout15; + eri_tensor[10*naux + 4] = gout16; + eri_tensor[11*naux + 2] = gout17; + eri_tensor[12*naux + 0] = gout18; + 
eri_tensor[12*naux + 4] = gout19; + eri_tensor[13*naux + 2] = gout20; + eri_tensor[14*naux + 0] = gout21; + eri_tensor[14*naux + 4] = gout22; + eri_tensor[15*naux + 2] = gout23; + eri_tensor[16*naux + 0] = gout24; + eri_tensor[16*naux + 4] = gout25; + eri_tensor[17*naux + 2] = gout26; + break; + case 1: + eri_tensor[0*naux + 1] = gout0; + eri_tensor[0*naux + 5] = gout1; + eri_tensor[1*naux + 3] = gout2; + eri_tensor[2*naux + 1] = gout3; + eri_tensor[2*naux + 5] = gout4; + eri_tensor[3*naux + 3] = gout5; + eri_tensor[4*naux + 1] = gout6; + eri_tensor[4*naux + 5] = gout7; + eri_tensor[5*naux + 3] = gout8; + eri_tensor[6*naux + 1] = gout9; + eri_tensor[6*naux + 5] = gout10; + eri_tensor[7*naux + 3] = gout11; + eri_tensor[8*naux + 1] = gout12; + eri_tensor[8*naux + 5] = gout13; + eri_tensor[9*naux + 3] = gout14; + eri_tensor[10*naux + 1] = gout15; + eri_tensor[10*naux + 5] = gout16; + eri_tensor[11*naux + 3] = gout17; + eri_tensor[12*naux + 1] = gout18; + eri_tensor[12*naux + 5] = gout19; + eri_tensor[13*naux + 3] = gout20; + eri_tensor[14*naux + 1] = gout21; + eri_tensor[14*naux + 5] = gout22; + eri_tensor[15*naux + 3] = gout23; + eri_tensor[16*naux + 1] = gout24; + eri_tensor[16*naux + 5] = gout25; + eri_tensor[17*naux + 3] = gout26; + break; + case 2: + eri_tensor[0*naux + 2] = gout0; + eri_tensor[1*naux + 0] = gout1; + eri_tensor[1*naux + 4] = gout2; + eri_tensor[2*naux + 2] = gout3; + eri_tensor[3*naux + 0] = gout4; + eri_tensor[3*naux + 4] = gout5; + eri_tensor[4*naux + 2] = gout6; + eri_tensor[5*naux + 0] = gout7; + eri_tensor[5*naux + 4] = gout8; + eri_tensor[6*naux + 2] = gout9; + eri_tensor[7*naux + 0] = gout10; + eri_tensor[7*naux + 4] = gout11; + eri_tensor[8*naux + 2] = gout12; + eri_tensor[9*naux + 0] = gout13; + eri_tensor[9*naux + 4] = gout14; + eri_tensor[10*naux + 2] = gout15; + eri_tensor[11*naux + 0] = gout16; + eri_tensor[11*naux + 4] = gout17; + eri_tensor[12*naux + 2] = gout18; + eri_tensor[13*naux + 0] = gout19; + eri_tensor[13*naux + 4] = 
/*
 * Host-side dispatcher for the unrolled int3c2e kernels.
 *
 * Picks the kernel from the angular momenta in `bounds` using the key
 * kij = lk*25 + li*5 + lj (kernel name suffix digits are li, lj, lk; e.g.
 * kij == 61 -> int3c2e_212), chooses the thread-block shape, sizes the
 * dynamic shared memory and launches the kernel on the default stream.
 *
 * out    : device buffer the kernels write into.
 * envs   : basis/environment tables, passed to the kernel by value.
 * bounds : task description (li/lj/lk, nroots, nshl_pair, nksh, ...),
 *          passed to the kernel by value.
 *
 * Returns 1 when a kernel was launched and 0 when no unrolled kernel exists
 * for this (li, lj, lk), so the caller can use another code path. Launch
 * errors are not checked here.
 */
int int3c2e_unrolled(double *out, Int3c2eEnvVars *envs, Int3c2eBounds *bounds)
{
    int li = bounds->li;
    int lj = bounds->lj;
    int lk = bounds->lk;
    int kij = lk*25 + li*5 + lj;
    int nroots = bounds->nroots;
    int nshl_pair = bounds->nshl_pair;
    int nksh = bounds->nksh;
    // Default block shape: 256 tasks per block, one thread per task.
    int nst_per_block = 256;
    int gout_stride = 1;

    // int3c2e_221 / int3c2e_212 use 64 tasks per block with 4 threads
    // (threadIdx.y) cooperating on each task.
    switch (kij) {
    case 37:
    case 61:
        nst_per_block = 64;
        gout_stride = 4;
        break;
    }

#if CUDA_VERSION >= 12040
    // With newer toolkits the listed kernels are launched with twice as
    // many threads per block.
    switch (kij) {
    case 0:
    case 5:
    case 6:
    case 10:
    case 11:
    case 25:
    case 30:
    case 35:
    case 50:
    case 55:
        nst_per_block *= 2;
        break;
    }
#endif

    dim3 threads(nst_per_block, gout_stride);
    // Each block processes BATCHES_PER_BLOCK rounds of nst_per_block tasks.
    int tasks_per_block = BATCHES_PER_BLOCK * nst_per_block;
    int st_blocks = (nksh*nshl_pair + tasks_per_block - 1) / tasks_per_block;
    // Shared memory, in doubles: one (root, weight) pair per root per task.
    int buflen = nroots*2 * nst_per_block;

    // Every kernel declares `extern __shared__ double rw_cache[]`, so the
    // launch must supply the grid, the block shape and the dynamic
    // shared-memory size in bytes.
    switch (kij) {
    case 0:
        int3c2e_000<<<st_blocks, threads, buflen*sizeof(double)>>>(out, *envs, *bounds); break;
    case 5:
        int3c2e_100<<<st_blocks, threads, buflen*sizeof(double)>>>(out, *envs, *bounds); break;
    case 6:
        int3c2e_110<<<st_blocks, threads, buflen*sizeof(double)>>>(out, *envs, *bounds); break;
    case 10:
        int3c2e_200<<<st_blocks, threads, buflen*sizeof(double)>>>(out, *envs, *bounds); break;
    case 11:
        int3c2e_210<<<st_blocks, threads, buflen*sizeof(double)>>>(out, *envs, *bounds); break;
    case 12:
        int3c2e_220<<<st_blocks, threads, buflen*sizeof(double)>>>(out, *envs, *bounds); break;
    case 25:
        int3c2e_001<<<st_blocks, threads, buflen*sizeof(double)>>>(out, *envs, *bounds); break;
    case 30:
        int3c2e_101<<<st_blocks, threads, buflen*sizeof(double)>>>(out, *envs, *bounds); break;
    case 31:
        int3c2e_111<<<st_blocks, threads, buflen*sizeof(double)>>>(out, *envs, *bounds); break;
    case 35:
        int3c2e_201<<<st_blocks, threads, buflen*sizeof(double)>>>(out, *envs, *bounds); break;
    case 36:
        int3c2e_211<<<st_blocks, threads, buflen*sizeof(double)>>>(out, *envs, *bounds); break;
    case 37:
        // Extra scratch for the g arrays; presumably the same layout as
        // int3c2e_212 below -- confirm against int3c2e_221.
        buflen += 3904;
        int3c2e_221<<<st_blocks, threads, buflen*sizeof(double)>>>(out, *envs, *bounds); break;
    case 50:
        int3c2e_002<<<st_blocks, threads, buflen*sizeof(double)>>>(out, *envs, *bounds); break;
    case 55:
        int3c2e_102<<<st_blocks, threads, buflen*sizeof(double)>>>(out, *envs, *bounds); break;
    case 56:
        int3c2e_112<<<st_blocks, threads, buflen*sizeof(double)>>>(out, *envs, *bounds); break;
    case 60:
        int3c2e_202<<<st_blocks, threads, buflen*sizeof(double)>>>(out, *envs, *bounds); break;
    case 61:
        // 3904 = 3*1152 (gx, gy, gz) + 192 (Rpq) + 256 (rjri), matching the
        // pointer offsets used inside int3c2e_212.
        buflen += 3904;
        int3c2e_212<<<st_blocks, threads, buflen*sizeof(double)>>>(out, *envs, *bounds); break;
    default: return 0;
    }
    return 1;
}
blockIdx.x - 1; + int ksh_block_id = gridDim.y - blockIdx.y - 1; + int shl_pair0 = bounds.shl_pair_offsets[sp_block_id]; + int shl_pair1 = bounds.shl_pair_offsets[sp_block_id+1]; + int ksh0 = bounds.ksh_offsets[ksh_block_id]; + int ksh1 = bounds.ksh_offsets[ksh_block_id+1]; + int nksh = ksh1 - ksh0; + int nshl_pair = shl_pair1 - shl_pair0; + int bas_ij0 = bounds.bas_ij_idx[shl_pair0]; + int nbas = envs.nbas; + int ish0 = bas_ij0 / nbas; + int jsh0 = bas_ij0 % nbas; + + int nst_per_block = blockDim.x; + int st_id = threadIdx.x; + int *bas = envs.bas; + int iprim = bas[ish0*BAS_SLOTS+NPRIM_OF]; + int jprim = bas[jsh0*BAS_SLOTS+NPRIM_OF]; + int kprim = bas[ksh0*BAS_SLOTS+NPRIM_OF]; + int ijprim = iprim * jprim; + int ijkprim = ijprim * kprim; + int nroots = 1; + double *env = envs.env; + double omega = env[PTR_RANGE_OMEGA]; + if (omega < 0) { + nroots *= 2; + } + extern __shared__ double rw_buffer[]; + double *rw = rw_buffer + st_id; + double *rjri = rw + nst_per_block * nroots*2; + int naux = bounds.naux; + double *out_local = out + bounds.ao_pair_loc[sp_block_id] * naux; + + int nst = nshl_pair * nksh; + for (int ijk_idx = st_id; ijk_idx < nst; ijk_idx += nst_per_block) { + int shl_pair_in_block = ijk_idx / nksh; + int ksh_in_block = ijk_idx % nksh; + int ksh = ksh_in_block + ksh0; + int bas_ij = bounds.bas_ij_idx[shl_pair_in_block + shl_pair0]; + int ish = bas_ij / nbas; + int jsh = bas_ij % nbas; + double *expi = env + bas[ish*BAS_SLOTS+PTR_EXP]; + double *expj = env + bas[jsh*BAS_SLOTS+PTR_EXP]; + double *expk = env + bas[ksh*BAS_SLOTS+PTR_EXP]; + double *ci = env + bas[ish*BAS_SLOTS+PTR_COEFF]; + double *cj = env + bas[jsh*BAS_SLOTS+PTR_COEFF]; + double *ck = env + bas[ksh*BAS_SLOTS+PTR_COEFF]; + double *ri = env + bas[ish*BAS_SLOTS+PTR_BAS_COORD]; + double *rj = env + bas[jsh*BAS_SLOTS+PTR_BAS_COORD]; + double *rk = env + bas[ksh*BAS_SLOTS+PTR_BAS_COORD]; + double xjxi = rj[0] - ri[0]; + double yjyi = rj[1] - ri[1]; + double zjzi = rj[2] - ri[2]; + double rr_ij 
= xjxi*xjxi + yjyi*yjyi + zjzi*zjzi; + rjri[0*nst_per_block] = rj[0] - ri[0]; + rjri[1*nst_per_block] = rj[1] - ri[1]; + rjri[2*nst_per_block] = rj[2] - ri[2]; + rjri[3*nst_per_block] = rr_ij; + double gout0 = 0; + for (int ijkp = 0; ijkp < ijkprim; ++ijkp) { + int ijp = ijkp / kprim; + int kp = ijkp % kprim; + int ip = ijp / jprim; + int jp = ijp % jprim; + double ai = expi[ip]; + double aj = expj[jp]; + double ak = expk[kp]; + double aij = ai + aj; + double cijk = ci[ip] * cj[jp] * ck[kp]; + double fac = PI_FAC * cijk / (aij*ak*sqrt(aij+ak)); + double aj_aij = aj / aij; + double theta_ij = ai * aj_aij; + double Kab = theta_ij * rjri[3*nst_per_block]; + double fac1 = fac * exp(-Kab); + double xij = rjri[0*nst_per_block] * aj_aij + ri[0]; + double yij = rjri[1*nst_per_block] * aj_aij + ri[1]; + double zij = rjri[2*nst_per_block] * aj_aij + ri[2]; + double xpq = xij - rk[0]; + double ypq = yij - rk[1]; + double zpq = zij - rk[2]; + double rr = xpq * xpq + ypq * ypq + zpq * zpq; + double theta = aij * ak / (aij + ak); + double theta_rr = theta * rr; + if (omega == 0) { + rys_roots(1, theta_rr, rw, nst_per_block, 0, 1); + } else if (omega > 0) { + double omega2 = omega * omega; + double theta_fac = omega2 / (omega2 + theta); + rys_roots(1, theta_fac*theta_rr, rw, nst_per_block, 0, 1); + double sqrt_theta_fac = sqrt(theta_fac); + for (int irys = 0; irys < 1; ++irys) { + rw[ irys*2 *nst_per_block] *= theta_fac; + rw[(irys*2+1)*nst_per_block] *= sqrt_theta_fac; + } + } else { + double omega2 = omega * omega; + double theta_fac = omega2 / (omega2 + theta); + double *rw1 = rw + 2*nst_per_block; + rys_roots(1, theta_rr, rw1, nst_per_block, 0, 1); + rys_roots(1, theta_fac*theta_rr, rw, nst_per_block, 0, 1); + double sqrt_theta_fac = -sqrt(theta_fac); + for (int irys = 0; irys < 1; ++irys) { + rw[ irys*2 *nst_per_block] *= theta_fac; + rw[(irys*2+1)*nst_per_block] *= sqrt_theta_fac; + } + } + for (int irys = 0; irys < nroots; ++irys) { + double wt = 
/*
 * Device routine accumulating three integral components (first-order x, y, z
 * terms on the ij pair) per (shell-pair, auxiliary-shell) task and writing
 * them to rows 0..2 of the task's slot in `out`.
 * NOTE(review): the name suggests the (li, lj, lk) = (1, 0, 0) class -- confirm
 * against the code generator.
 *
 * out    : output buffer; each task writes 3 values spaced `bounds.naux` apart.
 * envs   : integral environment (nbas, bas, env, ao_loc are read).
 * bounds : block-divided task description (shl_pair_offsets, ksh_offsets,
 *          bas_ij_idx, ao_pair_loc, naux are read).
 *
 * Uses the dynamic shared array `rw_buffer`; every thread works on its own
 * column (offset threadIdx.x, stride blockDim.x).
 */
__device__
void int3c2e_bdiv_100(double *out, Int3c2eEnvVars envs, BDiv3c2eBounds bounds)
{
    // For better load balance, blocks are consumed in the reversed order.
    const int sp_block_id = gridDim.x - blockIdx.x - 1;
    const int ksh_block_id = gridDim.y - blockIdx.y - 1;
    const int pair_lo = bounds.shl_pair_offsets[sp_block_id];
    const int pair_hi = bounds.shl_pair_offsets[sp_block_id+1];
    const int ksh_lo = bounds.ksh_offsets[ksh_block_id];
    const int ksh_hi = bounds.ksh_offsets[ksh_block_id+1];
    const int nk = ksh_hi - ksh_lo;
    const int npair = pair_hi - pair_lo;

    const int nbas = envs.nbas;
    int *bas = envs.bas;
    double *env = envs.env;

    // Primitive counts are taken from the first shell pair / first auxiliary
    // shell of the block and reused for every task in it.
    const int first_ij = bounds.bas_ij_idx[pair_lo];
    const int iprim = bas[(first_ij / nbas)*BAS_SLOTS+NPRIM_OF];
    const int jprim = bas[(first_ij % nbas)*BAS_SLOTS+NPRIM_OF];
    const int kprim = bas[ksh_lo*BAS_SLOTS+NPRIM_OF];

    const int stride = blockDim.x;
    const int tid = threadIdx.x;
    const double omega = env[PTR_RANGE_OMEGA];
    // One Rys root; a second set is appended when omega < 0.
    const int nroots = (omega < 0) ? 2 : 1;

    extern __shared__ double rw_buffer[];
    double *rw = rw_buffer + tid;              // roots/weights, interleaved
    double *rjri = rw + stride * nroots*2;     // rj - ri (3 values) and |rj - ri|^2
    const int naux = bounds.naux;
    double *out_local = out + bounds.ao_pair_loc[sp_block_id] * naux;

    const int ntasks = npair * nk;
    for (int task = tid; task < ntasks; task += stride) {
        const int pair_in_block = task / nk;
        const int k_in_block = task % nk;
        const int ksh = k_in_block + ksh_lo;
        const int bas_ij = bounds.bas_ij_idx[pair_in_block + pair_lo];
        const int ish = bas_ij / nbas;
        const int jsh = bas_ij % nbas;
        double *expi = env + bas[ish*BAS_SLOTS+PTR_EXP];
        double *expj = env + bas[jsh*BAS_SLOTS+PTR_EXP];
        double *expk = env + bas[ksh*BAS_SLOTS+PTR_EXP];
        double *ci = env + bas[ish*BAS_SLOTS+PTR_COEFF];
        double *cj = env + bas[jsh*BAS_SLOTS+PTR_COEFF];
        double *ck = env + bas[ksh*BAS_SLOTS+PTR_COEFF];
        double *ri = env + bas[ish*BAS_SLOTS+PTR_BAS_COORD];
        double *rj = env + bas[jsh*BAS_SLOTS+PTR_BAS_COORD];
        double *rk = env + bas[ksh*BAS_SLOTS+PTR_BAS_COORD];

        const double xjxi = rj[0] - ri[0];
        const double yjyi = rj[1] - ri[1];
        const double zjzi = rj[2] - ri[2];
        rjri[0*stride] = xjxi;
        rjri[1*stride] = yjyi;
        rjri[2*stride] = zjzi;
        rjri[3*stride] = xjxi*xjxi + yjyi*yjyi + zjzi*zjzi;

        double gx = 0;
        double gy = 0;
        double gz = 0;
        // Same primitive order as a flat loop over ip*jprim*kprim + jp*kprim + kp.
        for (int ip = 0; ip < iprim; ++ip) {
            const double ai = expi[ip];
            for (int jp = 0; jp < jprim; ++jp) {
                const double aj = expj[jp];
                const double aij = ai + aj;
                const double cij = ci[ip] * cj[jp];
                const double aj_aij = aj / aij;
                const double theta_ij = ai * aj_aij;
                const double Kab = theta_ij * rjri[3*stride];
                const double exp_Kab = exp(-Kab);
                const double xij = rjri[0*stride] * aj_aij + ri[0];
                const double yij = rjri[1*stride] * aj_aij + ri[1];
                const double zij = rjri[2*stride] * aj_aij + ri[2];
                const double xpq = xij - rk[0];
                const double ypq = yij - rk[1];
                const double zpq = zij - rk[2];
                const double rr = xpq * xpq + ypq * ypq + zpq * zpq;
                for (int kp = 0; kp < kprim; ++kp) {
                    const double ak = expk[kp];
                    const double cijk = cij * ck[kp];
                    const double fac = PI_FAC * cijk / (aij*ak*sqrt(aij+ak));
                    const double fac1 = fac * exp_Kab;
                    const double theta = aij * ak / (aij + ak);
                    const double theta_rr = theta * rr;

                    if (omega == 0) {
                        rys_roots(1, theta_rr, rw, stride, 0, 1);
                    } else {
                        const double omega2 = omega * omega;
                        const double theta_fac = omega2 / (omega2 + theta);
                        double wt_scale = sqrt(theta_fac);
                        if (!(omega > 0)) {
                            // Unscaled roots go to the second half of rw; the
                            // scaled set below enters with a negated weight.
                            rys_roots(1, theta_rr, rw + 2*stride, stride, 0, 1);
                            wt_scale = -wt_scale;
                        }
                        rys_roots(1, theta_fac*theta_rr, rw, stride, 0, 1);
                        rw[0] *= theta_fac;
                        rw[stride] *= wt_scale;
                    }

                    for (int irys = 0; irys < nroots; ++irys) {
                        const double wt = rw[(2*irys+1)*stride];
                        const double rt = rw[ 2*irys   *stride];
                        const double rt_aa = rt / (aij + ak);
                        const double rt_aij = rt_aa * ak;
                        const double c0x = rjri[0*stride] * aj_aij - xpq*rt_aij;
                        gx += c0x * fac1 * wt;
                        const double c0y = rjri[1*stride] * aj_aij - ypq*rt_aij;
                        const double trr_10y = c0y * fac1;
                        gy += trr_10y * wt;
                        const double c0z = rjri[2*stride] * aj_aij - zpq*rt_aij;
                        const double trr_10z = c0z * wt;
                        gz += fac1 * trr_10z;
                    }
                }
            }
        }

        int *ao_loc = envs.ao_loc;
        // Column offset of this auxiliary block relative to ao_loc[nbas].
        const int k0 = ao_loc[ksh_lo] - ao_loc[nbas];
        double *eri_tensor = out_local + k0 + pair_in_block * 3 * naux + k_in_block;
        eri_tensor[0*naux] = gx;
        eri_tensor[1*naux] = gy;
        eri_tensor[2*naux] = gz;
    }
}
+ int iprim = bas[ish0*BAS_SLOTS+NPRIM_OF]; + int jprim = bas[jsh0*BAS_SLOTS+NPRIM_OF]; + int kprim = bas[ksh0*BAS_SLOTS+NPRIM_OF]; + int ijprim = iprim * jprim; + int ijkprim = ijprim * kprim; + int nroots = 2; + double *env = envs.env; + double omega = env[PTR_RANGE_OMEGA]; + if (omega < 0) { + nroots *= 2; + } + extern __shared__ double rw_buffer[]; + double *rw = rw_buffer + st_id; + double *rjri = rw + nst_per_block * nroots*2; + int naux = bounds.naux; + double *out_local = out + bounds.ao_pair_loc[sp_block_id] * naux; + + int nst = nshl_pair * nksh; + for (int ijk_idx = st_id; ijk_idx < nst; ijk_idx += nst_per_block) { + int shl_pair_in_block = ijk_idx / nksh; + int ksh_in_block = ijk_idx % nksh; + int ksh = ksh_in_block + ksh0; + int bas_ij = bounds.bas_ij_idx[shl_pair_in_block + shl_pair0]; + int ish = bas_ij / nbas; + int jsh = bas_ij % nbas; + double *expi = env + bas[ish*BAS_SLOTS+PTR_EXP]; + double *expj = env + bas[jsh*BAS_SLOTS+PTR_EXP]; + double *expk = env + bas[ksh*BAS_SLOTS+PTR_EXP]; + double *ci = env + bas[ish*BAS_SLOTS+PTR_COEFF]; + double *cj = env + bas[jsh*BAS_SLOTS+PTR_COEFF]; + double *ck = env + bas[ksh*BAS_SLOTS+PTR_COEFF]; + double *ri = env + bas[ish*BAS_SLOTS+PTR_BAS_COORD]; + double *rj = env + bas[jsh*BAS_SLOTS+PTR_BAS_COORD]; + double *rk = env + bas[ksh*BAS_SLOTS+PTR_BAS_COORD]; + double xjxi = rj[0] - ri[0]; + double yjyi = rj[1] - ri[1]; + double zjzi = rj[2] - ri[2]; + double rr_ij = xjxi*xjxi + yjyi*yjyi + zjzi*zjzi; + rjri[0*nst_per_block] = rj[0] - ri[0]; + rjri[1*nst_per_block] = rj[1] - ri[1]; + rjri[2*nst_per_block] = rj[2] - ri[2]; + rjri[3*nst_per_block] = rr_ij; + double gout0 = 0; + double gout1 = 0; + double gout2 = 0; + double gout3 = 0; + double gout4 = 0; + double gout5 = 0; + double gout6 = 0; + double gout7 = 0; + double gout8 = 0; + for (int ijkp = 0; ijkp < ijkprim; ++ijkp) { + int ijp = ijkp / kprim; + int kp = ijkp % kprim; + int ip = ijp / jprim; + int jp = ijp % jprim; + double ai = expi[ip]; + double aj 
= expj[jp]; + double ak = expk[kp]; + double aij = ai + aj; + double cijk = ci[ip] * cj[jp] * ck[kp]; + double fac = PI_FAC * cijk / (aij*ak*sqrt(aij+ak)); + double aj_aij = aj / aij; + double theta_ij = ai * aj_aij; + double Kab = theta_ij * rjri[3*nst_per_block]; + double fac1 = fac * exp(-Kab); + double xij = rjri[0*nst_per_block] * aj_aij + ri[0]; + double yij = rjri[1*nst_per_block] * aj_aij + ri[1]; + double zij = rjri[2*nst_per_block] * aj_aij + ri[2]; + double xpq = xij - rk[0]; + double ypq = yij - rk[1]; + double zpq = zij - rk[2]; + double rr = xpq * xpq + ypq * ypq + zpq * zpq; + double theta = aij * ak / (aij + ak); + double theta_rr = theta * rr; + if (omega == 0) { + rys_roots(2, theta_rr, rw, nst_per_block, 0, 1); + } else if (omega > 0) { + double omega2 = omega * omega; + double theta_fac = omega2 / (omega2 + theta); + rys_roots(2, theta_fac*theta_rr, rw, nst_per_block, 0, 1); + double sqrt_theta_fac = sqrt(theta_fac); + for (int irys = 0; irys < 2; ++irys) { + rw[ irys*2 *nst_per_block] *= theta_fac; + rw[(irys*2+1)*nst_per_block] *= sqrt_theta_fac; + } + } else { + double omega2 = omega * omega; + double theta_fac = omega2 / (omega2 + theta); + double *rw1 = rw + 4*nst_per_block; + rys_roots(2, theta_rr, rw1, nst_per_block, 0, 1); + rys_roots(2, theta_fac*theta_rr, rw, nst_per_block, 0, 1); + double sqrt_theta_fac = -sqrt(theta_fac); + for (int irys = 0; irys < 2; ++irys) { + rw[ irys*2 *nst_per_block] *= theta_fac; + rw[(irys*2+1)*nst_per_block] *= sqrt_theta_fac; + } + } + for (int irys = 0; irys < nroots; ++irys) { + double wt = rw[(2*irys+1)*nst_per_block]; + double rt = rw[ 2*irys *nst_per_block]; + double rt_aa = rt / (aij + ak); + double rt_aij = rt_aa * ak; + double b10 = .5/aij * (1 - rt_aij); + double c0x = rjri[0*nst_per_block] * aj_aij - xpq*rt_aij; + double trr_10x = c0x * 1; + double trr_20x = c0x * trr_10x + 1*b10 * 1; + double hrr_110x = trr_20x - xjxi * trr_10x; + gout0 += hrr_110x * fac1 * wt; + double hrr_010x = trr_10x - xjxi 
/*
 * Device routine accumulating six integral components (second-order terms
 * xx, xy, xz, yy, yz, zz on the ij pair) per (shell-pair, auxiliary-shell)
 * task and writing them to rows 0..5 of the task's slot in `out`.
 * NOTE(review): the name suggests the (li, lj, lk) = (2, 0, 0) class -- confirm
 * against the code generator.
 *
 * out    : output buffer; each task writes 6 values spaced `bounds.naux` apart.
 * envs   : integral environment (nbas, bas, env, ao_loc are read).
 * bounds : block-divided task description (shl_pair_offsets, ksh_offsets,
 *          bas_ij_idx, ao_pair_loc, naux are read).
 *
 * Uses the dynamic shared array `rw_buffer`; every thread works on its own
 * column (offset threadIdx.x, stride blockDim.x).
 */
__device__
void int3c2e_bdiv_200(double *out, Int3c2eEnvVars envs, BDiv3c2eBounds bounds)
{
    // For better load balance, blocks are consumed in the reversed order.
    const int sp_block_id = gridDim.x - blockIdx.x - 1;
    const int ksh_block_id = gridDim.y - blockIdx.y - 1;
    const int pair_lo = bounds.shl_pair_offsets[sp_block_id];
    const int pair_hi = bounds.shl_pair_offsets[sp_block_id+1];
    const int ksh_lo = bounds.ksh_offsets[ksh_block_id];
    const int ksh_hi = bounds.ksh_offsets[ksh_block_id+1];
    const int nk = ksh_hi - ksh_lo;
    const int npair = pair_hi - pair_lo;

    const int nbas = envs.nbas;
    int *bas = envs.bas;
    double *env = envs.env;

    // Primitive counts are taken from the first shell pair / first auxiliary
    // shell of the block and reused for every task in it.
    const int first_ij = bounds.bas_ij_idx[pair_lo];
    const int iprim = bas[(first_ij / nbas)*BAS_SLOTS+NPRIM_OF];
    const int jprim = bas[(first_ij % nbas)*BAS_SLOTS+NPRIM_OF];
    const int kprim = bas[ksh_lo*BAS_SLOTS+NPRIM_OF];

    const int stride = blockDim.x;
    const int tid = threadIdx.x;
    const double omega = env[PTR_RANGE_OMEGA];
    // Two Rys roots; a second set of two is appended when omega < 0.
    const int nroots = (omega < 0) ? 4 : 2;

    extern __shared__ double rw_buffer[];
    double *rw = rw_buffer + tid;              // roots/weights, interleaved
    double *rjri = rw + stride * nroots*2;     // rj - ri (3 values) and |rj - ri|^2
    const int naux = bounds.naux;
    double *out_local = out + bounds.ao_pair_loc[sp_block_id] * naux;

    const int ntasks = npair * nk;
    for (int task = tid; task < ntasks; task += stride) {
        const int pair_in_block = task / nk;
        const int k_in_block = task % nk;
        const int ksh = k_in_block + ksh_lo;
        const int bas_ij = bounds.bas_ij_idx[pair_in_block + pair_lo];
        const int ish = bas_ij / nbas;
        const int jsh = bas_ij % nbas;
        double *expi = env + bas[ish*BAS_SLOTS+PTR_EXP];
        double *expj = env + bas[jsh*BAS_SLOTS+PTR_EXP];
        double *expk = env + bas[ksh*BAS_SLOTS+PTR_EXP];
        double *ci = env + bas[ish*BAS_SLOTS+PTR_COEFF];
        double *cj = env + bas[jsh*BAS_SLOTS+PTR_COEFF];
        double *ck = env + bas[ksh*BAS_SLOTS+PTR_COEFF];
        double *ri = env + bas[ish*BAS_SLOTS+PTR_BAS_COORD];
        double *rj = env + bas[jsh*BAS_SLOTS+PTR_BAS_COORD];
        double *rk = env + bas[ksh*BAS_SLOTS+PTR_BAS_COORD];

        const double xjxi = rj[0] - ri[0];
        const double yjyi = rj[1] - ri[1];
        const double zjzi = rj[2] - ri[2];
        rjri[0*stride] = xjxi;
        rjri[1*stride] = yjyi;
        rjri[2*stride] = zjzi;
        rjri[3*stride] = xjxi*xjxi + yjyi*yjyi + zjzi*zjzi;

        // Accumulators in output-row order: xx, xy, xz, yy, yz, zz.
        double gout[6] = {0, 0, 0, 0, 0, 0};
        // Same primitive order as a flat loop over ip*jprim*kprim + jp*kprim + kp.
        for (int ip = 0; ip < iprim; ++ip) {
            const double ai = expi[ip];
            for (int jp = 0; jp < jprim; ++jp) {
                const double aj = expj[jp];
                const double aij = ai + aj;
                const double cij = ci[ip] * cj[jp];
                const double aj_aij = aj / aij;
                const double theta_ij = ai * aj_aij;
                const double Kab = theta_ij * rjri[3*stride];
                const double exp_Kab = exp(-Kab);
                const double xij = rjri[0*stride] * aj_aij + ri[0];
                const double yij = rjri[1*stride] * aj_aij + ri[1];
                const double zij = rjri[2*stride] * aj_aij + ri[2];
                const double xpq = xij - rk[0];
                const double ypq = yij - rk[1];
                const double zpq = zij - rk[2];
                const double rr = xpq * xpq + ypq * ypq + zpq * zpq;
                for (int kp = 0; kp < kprim; ++kp) {
                    const double ak = expk[kp];
                    const double cijk = cij * ck[kp];
                    const double fac = PI_FAC * cijk / (aij*ak*sqrt(aij+ak));
                    const double fac1 = fac * exp_Kab;
                    const double theta = aij * ak / (aij + ak);
                    const double theta_rr = theta * rr;

                    if (omega == 0) {
                        rys_roots(2, theta_rr, rw, stride, 0, 1);
                    } else {
                        const double omega2 = omega * omega;
                        const double theta_fac = omega2 / (omega2 + theta);
                        double wt_scale = sqrt(theta_fac);
                        if (!(omega > 0)) {
                            // Unscaled roots go to the second half of rw; the
                            // scaled set below enters with negated weights.
                            rys_roots(2, theta_rr, rw + 4*stride, stride, 0, 1);
                            wt_scale = -wt_scale;
                        }
                        rys_roots(2, theta_fac*theta_rr, rw, stride, 0, 1);
                        for (int irys = 0; irys < 2; ++irys) {
                            rw[ irys*2   *stride] *= theta_fac;
                            rw[(irys*2+1)*stride] *= wt_scale;
                        }
                    }

                    for (int irys = 0; irys < nroots; ++irys) {
                        const double wt = rw[(2*irys+1)*stride];
                        const double rt = rw[ 2*irys   *stride];
                        const double rt_aa = rt / (aij + ak);
                        const double rt_aij = rt_aa * ak;
                        const double b10 = .5/aij * (1 - rt_aij);

                        // x carries no prefactor, y carries fac1, z carries wt.
                        const double c0x = rjri[0*stride] * aj_aij - xpq*rt_aij;
                        const double trr_20x = c0x * c0x + b10;
                        gout[0] += trr_20x * fac1 * wt;

                        const double c0y = rjri[1*stride] * aj_aij - ypq*rt_aij;
                        const double trr_10y = c0y * fac1;
                        gout[1] += c0x * trr_10y * wt;

                        const double c0z = rjri[2*stride] * aj_aij - zpq*rt_aij;
                        const double trr_10z = c0z * wt;
                        gout[2] += c0x * fac1 * trr_10z;

                        const double trr_20y = c0y * trr_10y + b10 * fac1;
                        gout[3] += trr_20y * wt;
                        gout[4] += trr_10y * trr_10z;

                        const double trr_20z = c0z * trr_10z + b10 * wt;
                        gout[5] += fac1 * trr_20z;
                    }
                }
            }
        }

        int *ao_loc = envs.ao_loc;
        // Column offset of this auxiliary block relative to ao_loc[nbas].
        const int k0 = ao_loc[ksh_lo] - ao_loc[nbas];
        double *eri_tensor = out_local + k0 + pair_in_block * 6 * naux + k_in_block;
        for (int n = 0; n < 6; ++n) {
            eri_tensor[n*naux] = gout[n];
        }
    }
}
(int ijk_idx = st_id; ijk_idx < nst; ijk_idx += nst_per_block) { + int shl_pair_in_block = ijk_idx / nksh; + int ksh_in_block = ijk_idx % nksh; + int ksh = ksh_in_block + ksh0; + int bas_ij = bounds.bas_ij_idx[shl_pair_in_block + shl_pair0]; + int ish = bas_ij / nbas; + int jsh = bas_ij % nbas; + double *expi = env + bas[ish*BAS_SLOTS+PTR_EXP]; + double *expj = env + bas[jsh*BAS_SLOTS+PTR_EXP]; + double *expk = env + bas[ksh*BAS_SLOTS+PTR_EXP]; + double *ci = env + bas[ish*BAS_SLOTS+PTR_COEFF]; + double *cj = env + bas[jsh*BAS_SLOTS+PTR_COEFF]; + double *ck = env + bas[ksh*BAS_SLOTS+PTR_COEFF]; + double *ri = env + bas[ish*BAS_SLOTS+PTR_BAS_COORD]; + double *rj = env + bas[jsh*BAS_SLOTS+PTR_BAS_COORD]; + double *rk = env + bas[ksh*BAS_SLOTS+PTR_BAS_COORD]; + double xjxi = rj[0] - ri[0]; + double yjyi = rj[1] - ri[1]; + double zjzi = rj[2] - ri[2]; + double rr_ij = xjxi*xjxi + yjyi*yjyi + zjzi*zjzi; + rjri[0*nst_per_block] = rj[0] - ri[0]; + rjri[1*nst_per_block] = rj[1] - ri[1]; + rjri[2*nst_per_block] = rj[2] - ri[2]; + rjri[3*nst_per_block] = rr_ij; + double gout0 = 0; + double gout1 = 0; + double gout2 = 0; + double gout3 = 0; + double gout4 = 0; + double gout5 = 0; + double gout6 = 0; + double gout7 = 0; + double gout8 = 0; + double gout9 = 0; + double gout10 = 0; + double gout11 = 0; + double gout12 = 0; + double gout13 = 0; + double gout14 = 0; + double gout15 = 0; + double gout16 = 0; + double gout17 = 0; + for (int ijkp = 0; ijkp < ijkprim; ++ijkp) { + int ijp = ijkp / kprim; + int kp = ijkp % kprim; + int ip = ijp / jprim; + int jp = ijp % jprim; + double ai = expi[ip]; + double aj = expj[jp]; + double ak = expk[kp]; + double aij = ai + aj; + double cijk = ci[ip] * cj[jp] * ck[kp]; + double fac = PI_FAC * cijk / (aij*ak*sqrt(aij+ak)); + double aj_aij = aj / aij; + double theta_ij = ai * aj_aij; + double Kab = theta_ij * rjri[3*nst_per_block]; + double fac1 = fac * exp(-Kab); + double xij = rjri[0*nst_per_block] * aj_aij + ri[0]; + double yij = 
rjri[1*nst_per_block] * aj_aij + ri[1]; + double zij = rjri[2*nst_per_block] * aj_aij + ri[2]; + double xpq = xij - rk[0]; + double ypq = yij - rk[1]; + double zpq = zij - rk[2]; + double rr = xpq * xpq + ypq * ypq + zpq * zpq; + double theta = aij * ak / (aij + ak); + double theta_rr = theta * rr; + if (omega == 0) { + rys_roots(2, theta_rr, rw, nst_per_block, 0, 1); + } else if (omega > 0) { + double omega2 = omega * omega; + double theta_fac = omega2 / (omega2 + theta); + rys_roots(2, theta_fac*theta_rr, rw, nst_per_block, 0, 1); + double sqrt_theta_fac = sqrt(theta_fac); + for (int irys = 0; irys < 2; ++irys) { + rw[ irys*2 *nst_per_block] *= theta_fac; + rw[(irys*2+1)*nst_per_block] *= sqrt_theta_fac; + } + } else { + double omega2 = omega * omega; + double theta_fac = omega2 / (omega2 + theta); + double *rw1 = rw + 4*nst_per_block; + rys_roots(2, theta_rr, rw1, nst_per_block, 0, 1); + rys_roots(2, theta_fac*theta_rr, rw, nst_per_block, 0, 1); + double sqrt_theta_fac = -sqrt(theta_fac); + for (int irys = 0; irys < 2; ++irys) { + rw[ irys*2 *nst_per_block] *= theta_fac; + rw[(irys*2+1)*nst_per_block] *= sqrt_theta_fac; + } + } + for (int irys = 0; irys < nroots; ++irys) { + double wt = rw[(2*irys+1)*nst_per_block]; + double rt = rw[ 2*irys *nst_per_block]; + double rt_aa = rt / (aij + ak); + double rt_aij = rt_aa * ak; + double b10 = .5/aij * (1 - rt_aij); + double c0x = rjri[0*nst_per_block] * aj_aij - xpq*rt_aij; + double trr_10x = c0x * 1; + double trr_20x = c0x * trr_10x + 1*b10 * 1; + double trr_30x = c0x * trr_20x + 2*b10 * trr_10x; + double hrr_210x = trr_30x - xjxi * trr_20x; + gout0 += hrr_210x * fac1 * wt; + double hrr_110x = trr_20x - xjxi * trr_10x; + double c0y = rjri[1*nst_per_block] * aj_aij - ypq*rt_aij; + double trr_10y = c0y * fac1; + gout1 += hrr_110x * trr_10y * wt; + double c0z = rjri[2*nst_per_block] * aj_aij - zpq*rt_aij; + double trr_10z = c0z * wt; + gout2 += hrr_110x * fac1 * trr_10z; + double hrr_010x = trr_10x - xjxi * 1; + double 
trr_20y = c0y * trr_10y + 1*b10 * fac1; + gout3 += hrr_010x * trr_20y * wt; + gout4 += hrr_010x * trr_10y * trr_10z; + double trr_20z = c0z * trr_10z + 1*b10 * wt; + gout5 += hrr_010x * fac1 * trr_20z; + double hrr_010y = trr_10y - yjyi * fac1; + gout6 += trr_20x * hrr_010y * wt; + double hrr_110y = trr_20y - yjyi * trr_10y; + gout7 += trr_10x * hrr_110y * wt; + gout8 += trr_10x * hrr_010y * trr_10z; + double trr_30y = c0y * trr_20y + 2*b10 * trr_10y; + double hrr_210y = trr_30y - yjyi * trr_20y; + gout9 += 1 * hrr_210y * wt; + gout10 += 1 * hrr_110y * trr_10z; + gout11 += 1 * hrr_010y * trr_20z; + double hrr_010z = trr_10z - zjzi * wt; + gout12 += trr_20x * fac1 * hrr_010z; + gout13 += trr_10x * trr_10y * hrr_010z; + double hrr_110z = trr_20z - zjzi * trr_10z; + gout14 += trr_10x * fac1 * hrr_110z; + gout15 += 1 * trr_20y * hrr_010z; + gout16 += 1 * trr_10y * hrr_110z; + double trr_30z = c0z * trr_20z + 2*b10 * trr_10z; + double hrr_210z = trr_30z - zjzi * trr_20z; + gout17 += 1 * fac1 * hrr_210z; + } + } + int *ao_loc = envs.ao_loc; + int k0 = ao_loc[ksh0] - ao_loc[nbas]; + double *eri_tensor = out_local + k0 + shl_pair_in_block * 18 * naux + ksh_in_block * 1; + eri_tensor[0*naux + 0] = gout0; + eri_tensor[1*naux + 0] = gout1; + eri_tensor[2*naux + 0] = gout2; + eri_tensor[3*naux + 0] = gout3; + eri_tensor[4*naux + 0] = gout4; + eri_tensor[5*naux + 0] = gout5; + eri_tensor[6*naux + 0] = gout6; + eri_tensor[7*naux + 0] = gout7; + eri_tensor[8*naux + 0] = gout8; + eri_tensor[9*naux + 0] = gout9; + eri_tensor[10*naux + 0] = gout10; + eri_tensor[11*naux + 0] = gout11; + eri_tensor[12*naux + 0] = gout12; + eri_tensor[13*naux + 0] = gout13; + eri_tensor[14*naux + 0] = gout14; + eri_tensor[15*naux + 0] = gout15; + eri_tensor[16*naux + 0] = gout16; + eri_tensor[17*naux + 0] = gout17; + } +} + +__device__ +void int3c2e_bdiv_220(double *out, Int3c2eEnvVars envs, BDiv3c2eBounds bounds) +{ + // For better load balance, consume blocks in the reversed order + int sp_block_id = 
gridDim.x - blockIdx.x - 1; + int ksh_block_id = gridDim.y - blockIdx.y - 1; + int shl_pair0 = bounds.shl_pair_offsets[sp_block_id]; + int shl_pair1 = bounds.shl_pair_offsets[sp_block_id+1]; + int ksh0 = bounds.ksh_offsets[ksh_block_id]; + int ksh1 = bounds.ksh_offsets[ksh_block_id+1]; + int nksh = ksh1 - ksh0; + int nshl_pair = shl_pair1 - shl_pair0; + int bas_ij0 = bounds.bas_ij_idx[shl_pair0]; + int nbas = envs.nbas; + int ish0 = bas_ij0 / nbas; + int jsh0 = bas_ij0 % nbas; + + int nst_per_block = blockDim.x; + int st_id = threadIdx.x; + int *bas = envs.bas; + int iprim = bas[ish0*BAS_SLOTS+NPRIM_OF]; + int jprim = bas[jsh0*BAS_SLOTS+NPRIM_OF]; + int kprim = bas[ksh0*BAS_SLOTS+NPRIM_OF]; + int ijprim = iprim * jprim; + int ijkprim = ijprim * kprim; + int nroots = 3; + double *env = envs.env; + double omega = env[PTR_RANGE_OMEGA]; + if (omega < 0) { + nroots *= 2; + } + extern __shared__ double rw_buffer[]; + double *rw = rw_buffer + st_id; + double *rjri = rw + nst_per_block * nroots*2; + int naux = bounds.naux; + double *out_local = out + bounds.ao_pair_loc[sp_block_id] * naux; + + int nst = nshl_pair * nksh; + for (int ijk_idx = st_id; ijk_idx < nst; ijk_idx += nst_per_block) { + int shl_pair_in_block = ijk_idx / nksh; + int ksh_in_block = ijk_idx % nksh; + int ksh = ksh_in_block + ksh0; + int bas_ij = bounds.bas_ij_idx[shl_pair_in_block + shl_pair0]; + int ish = bas_ij / nbas; + int jsh = bas_ij % nbas; + double *expi = env + bas[ish*BAS_SLOTS+PTR_EXP]; + double *expj = env + bas[jsh*BAS_SLOTS+PTR_EXP]; + double *expk = env + bas[ksh*BAS_SLOTS+PTR_EXP]; + double *ci = env + bas[ish*BAS_SLOTS+PTR_COEFF]; + double *cj = env + bas[jsh*BAS_SLOTS+PTR_COEFF]; + double *ck = env + bas[ksh*BAS_SLOTS+PTR_COEFF]; + double *ri = env + bas[ish*BAS_SLOTS+PTR_BAS_COORD]; + double *rj = env + bas[jsh*BAS_SLOTS+PTR_BAS_COORD]; + double *rk = env + bas[ksh*BAS_SLOTS+PTR_BAS_COORD]; + double xjxi = rj[0] - ri[0]; + double yjyi = rj[1] - ri[1]; + double zjzi = rj[2] - ri[2]; + 
double rr_ij = xjxi*xjxi + yjyi*yjyi + zjzi*zjzi; + rjri[0*nst_per_block] = rj[0] - ri[0]; + rjri[1*nst_per_block] = rj[1] - ri[1]; + rjri[2*nst_per_block] = rj[2] - ri[2]; + rjri[3*nst_per_block] = rr_ij; + double gout0 = 0; + double gout1 = 0; + double gout2 = 0; + double gout3 = 0; + double gout4 = 0; + double gout5 = 0; + double gout6 = 0; + double gout7 = 0; + double gout8 = 0; + double gout9 = 0; + double gout10 = 0; + double gout11 = 0; + double gout12 = 0; + double gout13 = 0; + double gout14 = 0; + double gout15 = 0; + double gout16 = 0; + double gout17 = 0; + double gout18 = 0; + double gout19 = 0; + double gout20 = 0; + double gout21 = 0; + double gout22 = 0; + double gout23 = 0; + double gout24 = 0; + double gout25 = 0; + double gout26 = 0; + double gout27 = 0; + double gout28 = 0; + double gout29 = 0; + double gout30 = 0; + double gout31 = 0; + double gout32 = 0; + double gout33 = 0; + double gout34 = 0; + double gout35 = 0; + for (int ijkp = 0; ijkp < ijkprim; ++ijkp) { + int ijp = ijkp / kprim; + int kp = ijkp % kprim; + int ip = ijp / jprim; + int jp = ijp % jprim; + double ai = expi[ip]; + double aj = expj[jp]; + double ak = expk[kp]; + double aij = ai + aj; + double cijk = ci[ip] * cj[jp] * ck[kp]; + double fac = PI_FAC * cijk / (aij*ak*sqrt(aij+ak)); + double aj_aij = aj / aij; + double theta_ij = ai * aj_aij; + double Kab = theta_ij * rjri[3*nst_per_block]; + double fac1 = fac * exp(-Kab); + double xij = rjri[0*nst_per_block] * aj_aij + ri[0]; + double yij = rjri[1*nst_per_block] * aj_aij + ri[1]; + double zij = rjri[2*nst_per_block] * aj_aij + ri[2]; + double xpq = xij - rk[0]; + double ypq = yij - rk[1]; + double zpq = zij - rk[2]; + double rr = xpq * xpq + ypq * ypq + zpq * zpq; + double theta = aij * ak / (aij + ak); + double theta_rr = theta * rr; + if (omega == 0) { + rys_roots(3, theta_rr, rw, nst_per_block, 0, 1); + } else if (omega > 0) { + double omega2 = omega * omega; + double theta_fac = omega2 / (omega2 + theta); + rys_roots(3, 
theta_fac*theta_rr, rw, nst_per_block, 0, 1); + double sqrt_theta_fac = sqrt(theta_fac); + for (int irys = 0; irys < 3; ++irys) { + rw[ irys*2 *nst_per_block] *= theta_fac; + rw[(irys*2+1)*nst_per_block] *= sqrt_theta_fac; + } + } else { + double omega2 = omega * omega; + double theta_fac = omega2 / (omega2 + theta); + double *rw1 = rw + 6*nst_per_block; + rys_roots(3, theta_rr, rw1, nst_per_block, 0, 1); + rys_roots(3, theta_fac*theta_rr, rw, nst_per_block, 0, 1); + double sqrt_theta_fac = -sqrt(theta_fac); + for (int irys = 0; irys < 3; ++irys) { + rw[ irys*2 *nst_per_block] *= theta_fac; + rw[(irys*2+1)*nst_per_block] *= sqrt_theta_fac; + } + } + for (int irys = 0; irys < nroots; ++irys) { + double wt = rw[(2*irys+1)*nst_per_block]; + double rt = rw[ 2*irys *nst_per_block]; + double rt_aa = rt / (aij + ak); + double rt_aij = rt_aa * ak; + double b10 = .5/aij * (1 - rt_aij); + double c0x = rjri[0*nst_per_block] * aj_aij - xpq*rt_aij; + double trr_10x = c0x * 1; + double trr_20x = c0x * trr_10x + 1*b10 * 1; + double trr_30x = c0x * trr_20x + 2*b10 * trr_10x; + double trr_40x = c0x * trr_30x + 3*b10 * trr_20x; + double hrr_310x = trr_40x - xjxi * trr_30x; + double hrr_210x = trr_30x - xjxi * trr_20x; + double hrr_220x = hrr_310x - xjxi * hrr_210x; + gout0 += hrr_220x * fac1 * wt; + double hrr_110x = trr_20x - xjxi * trr_10x; + double hrr_120x = hrr_210x - xjxi * hrr_110x; + double c0y = rjri[1*nst_per_block] * aj_aij - ypq*rt_aij; + double trr_10y = c0y * fac1; + gout1 += hrr_120x * trr_10y * wt; + double c0z = rjri[2*nst_per_block] * aj_aij - zpq*rt_aij; + double trr_10z = c0z * wt; + gout2 += hrr_120x * fac1 * trr_10z; + double hrr_010x = trr_10x - xjxi * 1; + double hrr_020x = hrr_110x - xjxi * hrr_010x; + double trr_20y = c0y * trr_10y + 1*b10 * fac1; + gout3 += hrr_020x * trr_20y * wt; + gout4 += hrr_020x * trr_10y * trr_10z; + double trr_20z = c0z * trr_10z + 1*b10 * wt; + gout5 += hrr_020x * fac1 * trr_20z; + double hrr_010y = trr_10y - yjyi * fac1; + gout6 
+= hrr_210x * hrr_010y * wt; + double hrr_110y = trr_20y - yjyi * trr_10y; + gout7 += hrr_110x * hrr_110y * wt; + gout8 += hrr_110x * hrr_010y * trr_10z; + double trr_30y = c0y * trr_20y + 2*b10 * trr_10y; + double hrr_210y = trr_30y - yjyi * trr_20y; + gout9 += hrr_010x * hrr_210y * wt; + gout10 += hrr_010x * hrr_110y * trr_10z; + gout11 += hrr_010x * hrr_010y * trr_20z; + double hrr_010z = trr_10z - zjzi * wt; + gout12 += hrr_210x * fac1 * hrr_010z; + gout13 += hrr_110x * trr_10y * hrr_010z; + double hrr_110z = trr_20z - zjzi * trr_10z; + gout14 += hrr_110x * fac1 * hrr_110z; + gout15 += hrr_010x * trr_20y * hrr_010z; + gout16 += hrr_010x * trr_10y * hrr_110z; + double trr_30z = c0z * trr_20z + 2*b10 * trr_10z; + double hrr_210z = trr_30z - zjzi * trr_20z; + gout17 += hrr_010x * fac1 * hrr_210z; + double hrr_020y = hrr_110y - yjyi * hrr_010y; + gout18 += trr_20x * hrr_020y * wt; + double hrr_120y = hrr_210y - yjyi * hrr_110y; + gout19 += trr_10x * hrr_120y * wt; + gout20 += trr_10x * hrr_020y * trr_10z; + double trr_40y = c0y * trr_30y + 3*b10 * trr_20y; + double hrr_310y = trr_40y - yjyi * trr_30y; + double hrr_220y = hrr_310y - yjyi * hrr_210y; + gout21 += 1 * hrr_220y * wt; + gout22 += 1 * hrr_120y * trr_10z; + gout23 += 1 * hrr_020y * trr_20z; + gout24 += trr_20x * hrr_010y * hrr_010z; + gout25 += trr_10x * hrr_110y * hrr_010z; + gout26 += trr_10x * hrr_010y * hrr_110z; + gout27 += 1 * hrr_210y * hrr_010z; + gout28 += 1 * hrr_110y * hrr_110z; + gout29 += 1 * hrr_010y * hrr_210z; + double hrr_020z = hrr_110z - zjzi * hrr_010z; + gout30 += trr_20x * fac1 * hrr_020z; + gout31 += trr_10x * trr_10y * hrr_020z; + double hrr_120z = hrr_210z - zjzi * hrr_110z; + gout32 += trr_10x * fac1 * hrr_120z; + gout33 += 1 * trr_20y * hrr_020z; + gout34 += 1 * trr_10y * hrr_120z; + double trr_40z = c0z * trr_30z + 3*b10 * trr_20z; + double hrr_310z = trr_40z - zjzi * trr_30z; + double hrr_220z = hrr_310z - zjzi * hrr_210z; + gout35 += 1 * fac1 * hrr_220z; + } + } + int *ao_loc 
// Unrolled 3-center 2-electron integral kernel producing 1 (ij) x 3 (k)
// components per (shell pair, auxiliary shell) task.  The "001" suffix appears
// to encode the angular momenta (li, lj, lk) = (0, 0, 1) -- consistent with the
// 1x3 output written below.
//
// out    : output buffer; for each task this kernel writes 3 consecutive
//          doubles at offset
//          (ao_pair_loc[block] + shl_pair_in_block*1) * naux + k0 + ksh_in_block*3.
// envs   : basis description (bas, env, ao_loc, nbas).
// bounds : tiling information (shl_pair_offsets, ksh_offsets, bas_ij_idx,
//          ao_pair_loc, naux).
//
// One thread block handles one (shell-pair block, k-shell block) tile; the
// threads of the block stride over the nshl_pair*nksh tasks of that tile.
// Dynamic shared memory (rw_buffer) is laid out with stride nst_per_block so
// that thread st_id owns elements rw_buffer[st_id + k*nst_per_block].
__device__
void int3c2e_bdiv_001(double *out, Int3c2eEnvVars envs, BDiv3c2eBounds bounds)
{
    // For better load balance, consume blocks in the reversed order
    int sp_block_id = gridDim.x - blockIdx.x - 1;
    int ksh_block_id = gridDim.y - blockIdx.y - 1;
    int shl_pair0 = bounds.shl_pair_offsets[sp_block_id];
    int shl_pair1 = bounds.shl_pair_offsets[sp_block_id+1];
    int ksh0 = bounds.ksh_offsets[ksh_block_id];
    int ksh1 = bounds.ksh_offsets[ksh_block_id+1];
    int nksh = ksh1 - ksh0;
    int nshl_pair = shl_pair1 - shl_pair0;
    int bas_ij0 = bounds.bas_ij_idx[shl_pair0];
    int nbas = envs.nbas;
    int ish0 = bas_ij0 / nbas;
    int jsh0 = bas_ij0 % nbas;

    int nst_per_block = blockDim.x;
    int st_id = threadIdx.x;
    int *bas = envs.bas;
    // Primitive counts are read from the first shells of the tile and reused
    // for every task in it (assumes the tile is homogeneous -- TODO confirm
    // against the host-side batching code).
    int iprim = bas[ish0*BAS_SLOTS+NPRIM_OF];
    int jprim = bas[jsh0*BAS_SLOTS+NPRIM_OF];
    int kprim = bas[ksh0*BAS_SLOTS+NPRIM_OF];
    int ijprim = iprim * jprim;
    int ijkprim = ijprim * kprim;
    int nroots = 1;
    double *env = envs.env;
    double omega = env[PTR_RANGE_OMEGA];
    // Loop invariant: only consumed by the omega != 0 branches below.
    double omega2 = omega * omega;
    if (omega < 0) {
        // Negative omega combines two quadratures (see the last branch in the
        // primitive loop), so twice as many (root, weight) pairs are needed.
        nroots *= 2;
    }
    extern __shared__ double rw_buffer[];
    double *rw = rw_buffer + st_id;
    // Per-thread scratch for rj-ri and |rj-ri|^2, placed after the roots/weights.
    double *rjri = rw + nst_per_block * nroots*2;
    int naux = bounds.naux;
    // Widen to 64-bit before multiplying: the product of a pair offset and
    // naux can exceed INT_MAX for large output tensors.
    double *out_local = out + (long long)bounds.ao_pair_loc[sp_block_id] * naux;
    // Offset of the first k shell of this tile, relative to ao_loc[nbas]
    // (hoisted out of the task loop; it does not depend on the task).
    int *ao_loc = envs.ao_loc;
    int k0 = ao_loc[ksh0] - ao_loc[nbas];

    int nst = nshl_pair * nksh;
    for (int ijk_idx = st_id; ijk_idx < nst; ijk_idx += nst_per_block) {
        int shl_pair_in_block = ijk_idx / nksh;
        int ksh_in_block = ijk_idx % nksh;
        int ksh = ksh_in_block + ksh0;
        int bas_ij = bounds.bas_ij_idx[shl_pair_in_block + shl_pair0];
        int ish = bas_ij / nbas;
        int jsh = bas_ij % nbas;
        double *expi = env + bas[ish*BAS_SLOTS+PTR_EXP];
        double *expj = env + bas[jsh*BAS_SLOTS+PTR_EXP];
        double *expk = env + bas[ksh*BAS_SLOTS+PTR_EXP];
        double *ci = env + bas[ish*BAS_SLOTS+PTR_COEFF];
        double *cj = env + bas[jsh*BAS_SLOTS+PTR_COEFF];
        double *ck = env + bas[ksh*BAS_SLOTS+PTR_COEFF];
        double *ri = env + bas[ish*BAS_SLOTS+PTR_BAS_COORD];
        double *rj = env + bas[jsh*BAS_SLOTS+PTR_BAS_COORD];
        double *rk = env + bas[ksh*BAS_SLOTS+PTR_BAS_COORD];
        double xjxi = rj[0] - ri[0];
        double yjyi = rj[1] - ri[1];
        double zjzi = rj[2] - ri[2];
        double rr_ij = xjxi*xjxi + yjyi*yjyi + zjzi*zjzi;
        rjri[0*nst_per_block] = xjxi;
        rjri[1*nst_per_block] = yjyi;
        rjri[2*nst_per_block] = zjzi;
        rjri[3*nst_per_block] = rr_ij;
        double gout0 = 0;
        double gout1 = 0;
        double gout2 = 0;
        // Flattened loop over primitive triples; kp runs fastest, then jp, then ip.
        for (int ijkp = 0; ijkp < ijkprim; ++ijkp) {
            int ijp = ijkp / kprim;
            int kp = ijkp % kprim;
            int ip = ijp / jprim;
            int jp = ijp % jprim;
            double ai = expi[ip];
            double aj = expj[jp];
            double ak = expk[kp];
            double aij = ai + aj;
            double cijk = ci[ip] * cj[jp] * ck[kp];
            double fac = PI_FAC * cijk / (aij*ak*sqrt(aij+ak));
            double aj_aij = aj / aij;
            double theta_ij = ai * aj_aij;
            double Kab = theta_ij * rjri[3*nst_per_block];
            double fac1 = fac * exp(-Kab);
            // (xij, yij, zij): exponent-weighted centre of the ij pair.
            double xij = rjri[0*nst_per_block] * aj_aij + ri[0];
            double yij = rjri[1*nst_per_block] * aj_aij + ri[1];
            double zij = rjri[2*nst_per_block] * aj_aij + ri[2];
            double xpq = xij - rk[0];
            double ypq = yij - rk[1];
            double zpq = zij - rk[2];
            double rr = xpq * xpq + ypq * ypq + zpq * zpq;
            double theta = aij * ak / (aij + ak);
            double theta_rr = theta * rr;
            // rys_roots is defined elsewhere; judging from how rw is read back
            // below, it leaves (root, weight) pairs interleaved in rw with
            // stride nst_per_block.
            if (omega == 0) {
                rys_roots(1, theta_rr, rw, nst_per_block, 0, 1);
            } else if (omega > 0) {
                double theta_fac = omega2 / (omega2 + theta);
                rys_roots(1, theta_fac*theta_rr, rw, nst_per_block, 0, 1);
                double sqrt_theta_fac = sqrt(theta_fac);
                for (int irys = 0; irys < 1; ++irys) {
                    rw[ irys*2   *nst_per_block] *= theta_fac;
                    rw[(irys*2+1)*nst_per_block] *= sqrt_theta_fac;
                }
            } else {
                // omega < 0: the second half of rw holds the unscaled
                // quadrature, the first half the omega-scaled one with negated
                // weights, so the root loop below accumulates their difference.
                double theta_fac = omega2 / (omega2 + theta);
                double *rw1 = rw + 2*nst_per_block;
                rys_roots(1, theta_rr, rw1, nst_per_block, 0, 1);
                rys_roots(1, theta_fac*theta_rr, rw, nst_per_block, 0, 1);
                double sqrt_theta_fac = -sqrt(theta_fac);
                for (int irys = 0; irys < 1; ++irys) {
                    rw[ irys*2   *nst_per_block] *= theta_fac;
                    rw[(irys*2+1)*nst_per_block] *= sqrt_theta_fac;
                }
            }
            // Generated recurrences.  The order-zero terms are 1 (x), fac1 (y)
            // and wt (z); the literal `1` factors stand for the x term.
            for (int irys = 0; irys < nroots; ++irys) {
                double wt = rw[(2*irys+1)*nst_per_block];
                double rt = rw[ 2*irys   *nst_per_block];
                double rt_aa = rt / (aij + ak);
                double rt_ak = rt_aa * aij;
                double cpx = xpq*rt_ak;
                double trr_01x = cpx * 1;
                gout0 += trr_01x * fac1 * wt;
                double cpy = ypq*rt_ak;
                double trr_01y = cpy * fac1;
                gout1 += 1 * trr_01y * wt;
                double cpz = zpq*rt_ak;
                double trr_01z = cpz * wt;
                gout2 += 1 * fac1 * trr_01z;
            }
        }
        // Row stride of the output is naux; 64-bit arithmetic for the row offset.
        double *eri_tensor = out_local + k0 + (long long)shl_pair_in_block * 1 * naux + ksh_in_block * 3;
        eri_tensor[0*naux + 0] = gout0;
        eri_tensor[0*naux + 1] = gout1;
        eri_tensor[0*naux + 2] = gout2;
    }
}
// Unrolled 3-center 2-electron integral kernel producing 3 (ij) x 3 (k)
// components per (shell pair, auxiliary shell) task.  The "101" suffix appears
// to encode the angular momenta (li, lj, lk) = (1, 0, 1) -- consistent with the
// 3x3 output written below.
//
// out    : output buffer; for each task this kernel writes a 3x3 tile whose
//          rows are naux doubles apart, starting at offset
//          (ao_pair_loc[block] + shl_pair_in_block*3) * naux + k0 + ksh_in_block*3.
// envs   : basis description (bas, env, ao_loc, nbas).
// bounds : tiling information (shl_pair_offsets, ksh_offsets, bas_ij_idx,
//          ao_pair_loc, naux).
//
// One thread block handles one (shell-pair block, k-shell block) tile; the
// threads of the block stride over the nshl_pair*nksh tasks of that tile.
// Dynamic shared memory (rw_buffer) is laid out with stride nst_per_block so
// that thread st_id owns elements rw_buffer[st_id + k*nst_per_block].
__device__
void int3c2e_bdiv_101(double *out, Int3c2eEnvVars envs, BDiv3c2eBounds bounds)
{
    // For better load balance, consume blocks in the reversed order
    int sp_block_id = gridDim.x - blockIdx.x - 1;
    int ksh_block_id = gridDim.y - blockIdx.y - 1;
    int shl_pair0 = bounds.shl_pair_offsets[sp_block_id];
    int shl_pair1 = bounds.shl_pair_offsets[sp_block_id+1];
    int ksh0 = bounds.ksh_offsets[ksh_block_id];
    int ksh1 = bounds.ksh_offsets[ksh_block_id+1];
    int nksh = ksh1 - ksh0;
    int nshl_pair = shl_pair1 - shl_pair0;
    int bas_ij0 = bounds.bas_ij_idx[shl_pair0];
    int nbas = envs.nbas;
    int ish0 = bas_ij0 / nbas;
    int jsh0 = bas_ij0 % nbas;

    int nst_per_block = blockDim.x;
    int st_id = threadIdx.x;
    int *bas = envs.bas;
    // Primitive counts are read from the first shells of the tile and reused
    // for every task in it (assumes the tile is homogeneous -- TODO confirm
    // against the host-side batching code).
    int iprim = bas[ish0*BAS_SLOTS+NPRIM_OF];
    int jprim = bas[jsh0*BAS_SLOTS+NPRIM_OF];
    int kprim = bas[ksh0*BAS_SLOTS+NPRIM_OF];
    int ijprim = iprim * jprim;
    int ijkprim = ijprim * kprim;
    int nroots = 2;
    double *env = envs.env;
    double omega = env[PTR_RANGE_OMEGA];
    // Loop invariant: only consumed by the omega != 0 branches below.
    double omega2 = omega * omega;
    if (omega < 0) {
        // Negative omega combines two quadratures (see the last branch in the
        // primitive loop), so twice as many (root, weight) pairs are needed.
        nroots *= 2;
    }
    extern __shared__ double rw_buffer[];
    double *rw = rw_buffer + st_id;
    // Per-thread scratch for rj-ri and |rj-ri|^2, placed after the roots/weights.
    double *rjri = rw + nst_per_block * nroots*2;
    int naux = bounds.naux;
    // Widen to 64-bit before multiplying: the product of a pair offset and
    // naux can exceed INT_MAX for large output tensors.
    double *out_local = out + (long long)bounds.ao_pair_loc[sp_block_id] * naux;
    // Offset of the first k shell of this tile, relative to ao_loc[nbas]
    // (hoisted out of the task loop; it does not depend on the task).
    int *ao_loc = envs.ao_loc;
    int k0 = ao_loc[ksh0] - ao_loc[nbas];

    int nst = nshl_pair * nksh;
    for (int ijk_idx = st_id; ijk_idx < nst; ijk_idx += nst_per_block) {
        int shl_pair_in_block = ijk_idx / nksh;
        int ksh_in_block = ijk_idx % nksh;
        int ksh = ksh_in_block + ksh0;
        int bas_ij = bounds.bas_ij_idx[shl_pair_in_block + shl_pair0];
        int ish = bas_ij / nbas;
        int jsh = bas_ij % nbas;
        double *expi = env + bas[ish*BAS_SLOTS+PTR_EXP];
        double *expj = env + bas[jsh*BAS_SLOTS+PTR_EXP];
        double *expk = env + bas[ksh*BAS_SLOTS+PTR_EXP];
        double *ci = env + bas[ish*BAS_SLOTS+PTR_COEFF];
        double *cj = env + bas[jsh*BAS_SLOTS+PTR_COEFF];
        double *ck = env + bas[ksh*BAS_SLOTS+PTR_COEFF];
        double *ri = env + bas[ish*BAS_SLOTS+PTR_BAS_COORD];
        double *rj = env + bas[jsh*BAS_SLOTS+PTR_BAS_COORD];
        double *rk = env + bas[ksh*BAS_SLOTS+PTR_BAS_COORD];
        double xjxi = rj[0] - ri[0];
        double yjyi = rj[1] - ri[1];
        double zjzi = rj[2] - ri[2];
        double rr_ij = xjxi*xjxi + yjyi*yjyi + zjzi*zjzi;
        rjri[0*nst_per_block] = xjxi;
        rjri[1*nst_per_block] = yjyi;
        rjri[2*nst_per_block] = zjzi;
        rjri[3*nst_per_block] = rr_ij;
        double gout0 = 0;
        double gout1 = 0;
        double gout2 = 0;
        double gout3 = 0;
        double gout4 = 0;
        double gout5 = 0;
        double gout6 = 0;
        double gout7 = 0;
        double gout8 = 0;
        // Flattened loop over primitive triples; kp runs fastest, then jp, then ip.
        for (int ijkp = 0; ijkp < ijkprim; ++ijkp) {
            int ijp = ijkp / kprim;
            int kp = ijkp % kprim;
            int ip = ijp / jprim;
            int jp = ijp % jprim;
            double ai = expi[ip];
            double aj = expj[jp];
            double ak = expk[kp];
            double aij = ai + aj;
            double cijk = ci[ip] * cj[jp] * ck[kp];
            double fac = PI_FAC * cijk / (aij*ak*sqrt(aij+ak));
            double aj_aij = aj / aij;
            double theta_ij = ai * aj_aij;
            double Kab = theta_ij * rjri[3*nst_per_block];
            double fac1 = fac * exp(-Kab);
            // (xij, yij, zij): exponent-weighted centre of the ij pair.
            double xij = rjri[0*nst_per_block] * aj_aij + ri[0];
            double yij = rjri[1*nst_per_block] * aj_aij + ri[1];
            double zij = rjri[2*nst_per_block] * aj_aij + ri[2];
            double xpq = xij - rk[0];
            double ypq = yij - rk[1];
            double zpq = zij - rk[2];
            double rr = xpq * xpq + ypq * ypq + zpq * zpq;
            double theta = aij * ak / (aij + ak);
            double theta_rr = theta * rr;
            // rys_roots is defined elsewhere; judging from how rw is read back
            // below, it leaves (root, weight) pairs interleaved in rw with
            // stride nst_per_block.
            if (omega == 0) {
                rys_roots(2, theta_rr, rw, nst_per_block, 0, 1);
            } else if (omega > 0) {
                double theta_fac = omega2 / (omega2 + theta);
                rys_roots(2, theta_fac*theta_rr, rw, nst_per_block, 0, 1);
                double sqrt_theta_fac = sqrt(theta_fac);
                for (int irys = 0; irys < 2; ++irys) {
                    rw[ irys*2   *nst_per_block] *= theta_fac;
                    rw[(irys*2+1)*nst_per_block] *= sqrt_theta_fac;
                }
            } else {
                // omega < 0: the second half of rw holds the unscaled
                // quadrature, the first half the omega-scaled one with negated
                // weights, so the root loop below accumulates their difference.
                double theta_fac = omega2 / (omega2 + theta);
                double *rw1 = rw + 4*nst_per_block;
                rys_roots(2, theta_rr, rw1, nst_per_block, 0, 1);
                rys_roots(2, theta_fac*theta_rr, rw, nst_per_block, 0, 1);
                double sqrt_theta_fac = -sqrt(theta_fac);
                for (int irys = 0; irys < 2; ++irys) {
                    rw[ irys*2   *nst_per_block] *= theta_fac;
                    rw[(irys*2+1)*nst_per_block] *= sqrt_theta_fac;
                }
            }
            // Generated recurrences.  The order-zero terms are 1 (x), fac1 (y)
            // and wt (z); the literal `1` factors stand for the x term.
            for (int irys = 0; irys < nroots; ++irys) {
                double wt = rw[(2*irys+1)*nst_per_block];
                double rt = rw[ 2*irys   *nst_per_block];
                double rt_aa = rt / (aij + ak);
                double b00 = .5 * rt_aa;
                double rt_ak = rt_aa * aij;
                double cpx = xpq*rt_ak;
                double rt_aij = rt_aa * ak;
                double c0x = rjri[0*nst_per_block] * aj_aij - xpq*rt_aij;
                double trr_10x = c0x * 1;
                double trr_11x = cpx * trr_10x + 1*b00 * 1;
                gout0 += trr_11x * fac1 * wt;
                double trr_01x = cpx * 1;
                double c0y = rjri[1*nst_per_block] * aj_aij - ypq*rt_aij;
                double trr_10y = c0y * fac1;
                gout1 += trr_01x * trr_10y * wt;
                double c0z = rjri[2*nst_per_block] * aj_aij - zpq*rt_aij;
                double trr_10z = c0z * wt;
                gout2 += trr_01x * fac1 * trr_10z;
                double cpy = ypq*rt_ak;
                double trr_01y = cpy * fac1;
                gout3 += trr_10x * trr_01y * wt;
                double trr_11y = cpy * trr_10y + 1*b00 * fac1;
                gout4 += 1 * trr_11y * wt;
                gout5 += 1 * trr_01y * trr_10z;
                double cpz = zpq*rt_ak;
                double trr_01z = cpz * wt;
                gout6 += trr_10x * fac1 * trr_01z;
                gout7 += 1 * trr_10y * trr_01z;
                double trr_11z = cpz * trr_10z + 1*b00 * wt;
                gout8 += 1 * fac1 * trr_11z;
            }
        }
        // Row stride of the output is naux; 64-bit arithmetic for the row offset.
        // gout is accumulated k-major (gout[3*k + ij]) and stored as [ij][k].
        double *eri_tensor = out_local + k0 + (long long)shl_pair_in_block * 3 * naux + ksh_in_block * 3;
        eri_tensor[0*naux + 0] = gout0;
        eri_tensor[0*naux + 1] = gout3;
        eri_tensor[0*naux + 2] = gout6;
        eri_tensor[1*naux + 0] = gout1;
        eri_tensor[1*naux + 1] = gout4;
        eri_tensor[1*naux + 2] = gout7;
        eri_tensor[2*naux + 0] = gout2;
        eri_tensor[2*naux + 1] = gout5;
        eri_tensor[2*naux + 2] = gout8;
    }
}
// Unrolled 3-center 2-electron integral kernel producing 9 (ij) x 3 (k)
// components per (shell pair, auxiliary shell) task.  The "111" suffix appears
// to encode the angular momenta (li, lj, lk) = (1, 1, 1) -- consistent with the
// 9x3 output written below.
//
// out    : output buffer; for each task this kernel writes a 9x3 tile whose
//          rows are naux doubles apart, starting at offset
//          (ao_pair_loc[block] + shl_pair_in_block*9) * naux + k0 + ksh_in_block*3.
// envs   : basis description (bas, env, ao_loc, nbas).
// bounds : tiling information (shl_pair_offsets, ksh_offsets, bas_ij_idx,
//          ao_pair_loc, naux).
//
// One thread block handles one (shell-pair block, k-shell block) tile; the
// threads of the block stride over the nshl_pair*nksh tasks of that tile.
// Dynamic shared memory (rw_buffer) is laid out with stride nst_per_block so
// that thread st_id owns elements rw_buffer[st_id + k*nst_per_block].
__device__
void int3c2e_bdiv_111(double *out, Int3c2eEnvVars envs, BDiv3c2eBounds bounds)
{
    // For better load balance, consume blocks in the reversed order
    int sp_block_id = gridDim.x - blockIdx.x - 1;
    int ksh_block_id = gridDim.y - blockIdx.y - 1;
    int shl_pair0 = bounds.shl_pair_offsets[sp_block_id];
    int shl_pair1 = bounds.shl_pair_offsets[sp_block_id+1];
    int ksh0 = bounds.ksh_offsets[ksh_block_id];
    int ksh1 = bounds.ksh_offsets[ksh_block_id+1];
    int nksh = ksh1 - ksh0;
    int nshl_pair = shl_pair1 - shl_pair0;
    int bas_ij0 = bounds.bas_ij_idx[shl_pair0];
    int nbas = envs.nbas;
    int ish0 = bas_ij0 / nbas;
    int jsh0 = bas_ij0 % nbas;

    int nst_per_block = blockDim.x;
    int st_id = threadIdx.x;
    int *bas = envs.bas;
    // Primitive counts are read from the first shells of the tile and reused
    // for every task in it (assumes the tile is homogeneous -- TODO confirm
    // against the host-side batching code).
    int iprim = bas[ish0*BAS_SLOTS+NPRIM_OF];
    int jprim = bas[jsh0*BAS_SLOTS+NPRIM_OF];
    int kprim = bas[ksh0*BAS_SLOTS+NPRIM_OF];
    int ijprim = iprim * jprim;
    int ijkprim = ijprim * kprim;
    int nroots = 2;
    double *env = envs.env;
    double omega = env[PTR_RANGE_OMEGA];
    // Loop invariant: only consumed by the omega != 0 branches below.
    double omega2 = omega * omega;
    if (omega < 0) {
        // Negative omega combines two quadratures (see the last branch in the
        // primitive loop), so twice as many (root, weight) pairs are needed.
        nroots *= 2;
    }
    extern __shared__ double rw_buffer[];
    double *rw = rw_buffer + st_id;
    // Per-thread scratch for rj-ri and |rj-ri|^2, placed after the roots/weights.
    double *rjri = rw + nst_per_block * nroots*2;
    int naux = bounds.naux;
    // Widen to 64-bit before multiplying: the product of a pair offset and
    // naux can exceed INT_MAX for large output tensors.
    double *out_local = out + (long long)bounds.ao_pair_loc[sp_block_id] * naux;
    // Offset of the first k shell of this tile, relative to ao_loc[nbas]
    // (hoisted out of the task loop; it does not depend on the task).
    int *ao_loc = envs.ao_loc;
    int k0 = ao_loc[ksh0] - ao_loc[nbas];

    int nst = nshl_pair * nksh;
    for (int ijk_idx = st_id; ijk_idx < nst; ijk_idx += nst_per_block) {
        int shl_pair_in_block = ijk_idx / nksh;
        int ksh_in_block = ijk_idx % nksh;
        int ksh = ksh_in_block + ksh0;
        int bas_ij = bounds.bas_ij_idx[shl_pair_in_block + shl_pair0];
        int ish = bas_ij / nbas;
        int jsh = bas_ij % nbas;
        double *expi = env + bas[ish*BAS_SLOTS+PTR_EXP];
        double *expj = env + bas[jsh*BAS_SLOTS+PTR_EXP];
        double *expk = env + bas[ksh*BAS_SLOTS+PTR_EXP];
        double *ci = env + bas[ish*BAS_SLOTS+PTR_COEFF];
        double *cj = env + bas[jsh*BAS_SLOTS+PTR_COEFF];
        double *ck = env + bas[ksh*BAS_SLOTS+PTR_COEFF];
        double *ri = env + bas[ish*BAS_SLOTS+PTR_BAS_COORD];
        double *rj = env + bas[jsh*BAS_SLOTS+PTR_BAS_COORD];
        double *rk = env + bas[ksh*BAS_SLOTS+PTR_BAS_COORD];
        double xjxi = rj[0] - ri[0];
        double yjyi = rj[1] - ri[1];
        double zjzi = rj[2] - ri[2];
        double rr_ij = xjxi*xjxi + yjyi*yjyi + zjzi*zjzi;
        rjri[0*nst_per_block] = xjxi;
        rjri[1*nst_per_block] = yjyi;
        rjri[2*nst_per_block] = zjzi;
        rjri[3*nst_per_block] = rr_ij;
        double gout0 = 0;
        double gout1 = 0;
        double gout2 = 0;
        double gout3 = 0;
        double gout4 = 0;
        double gout5 = 0;
        double gout6 = 0;
        double gout7 = 0;
        double gout8 = 0;
        double gout9 = 0;
        double gout10 = 0;
        double gout11 = 0;
        double gout12 = 0;
        double gout13 = 0;
        double gout14 = 0;
        double gout15 = 0;
        double gout16 = 0;
        double gout17 = 0;
        double gout18 = 0;
        double gout19 = 0;
        double gout20 = 0;
        double gout21 = 0;
        double gout22 = 0;
        double gout23 = 0;
        double gout24 = 0;
        double gout25 = 0;
        double gout26 = 0;
        // Flattened loop over primitive triples; kp runs fastest, then jp, then ip.
        for (int ijkp = 0; ijkp < ijkprim; ++ijkp) {
            int ijp = ijkp / kprim;
            int kp = ijkp % kprim;
            int ip = ijp / jprim;
            int jp = ijp % jprim;
            double ai = expi[ip];
            double aj = expj[jp];
            double ak = expk[kp];
            double aij = ai + aj;
            double cijk = ci[ip] * cj[jp] * ck[kp];
            double fac = PI_FAC * cijk / (aij*ak*sqrt(aij+ak));
            double aj_aij = aj / aij;
            double theta_ij = ai * aj_aij;
            double Kab = theta_ij * rjri[3*nst_per_block];
            double fac1 = fac * exp(-Kab);
            // (xij, yij, zij): exponent-weighted centre of the ij pair.
            double xij = rjri[0*nst_per_block] * aj_aij + ri[0];
            double yij = rjri[1*nst_per_block] * aj_aij + ri[1];
            double zij = rjri[2*nst_per_block] * aj_aij + ri[2];
            double xpq = xij - rk[0];
            double ypq = yij - rk[1];
            double zpq = zij - rk[2];
            double rr = xpq * xpq + ypq * ypq + zpq * zpq;
            double theta = aij * ak / (aij + ak);
            double theta_rr = theta * rr;
            // rys_roots is defined elsewhere; judging from how rw is read back
            // below, it leaves (root, weight) pairs interleaved in rw with
            // stride nst_per_block.
            if (omega == 0) {
                rys_roots(2, theta_rr, rw, nst_per_block, 0, 1);
            } else if (omega > 0) {
                double theta_fac = omega2 / (omega2 + theta);
                rys_roots(2, theta_fac*theta_rr, rw, nst_per_block, 0, 1);
                double sqrt_theta_fac = sqrt(theta_fac);
                for (int irys = 0; irys < 2; ++irys) {
                    rw[ irys*2   *nst_per_block] *= theta_fac;
                    rw[(irys*2+1)*nst_per_block] *= sqrt_theta_fac;
                }
            } else {
                // omega < 0: the second half of rw holds the unscaled
                // quadrature, the first half the omega-scaled one with negated
                // weights, so the root loop below accumulates their difference.
                double theta_fac = omega2 / (omega2 + theta);
                double *rw1 = rw + 4*nst_per_block;
                rys_roots(2, theta_rr, rw1, nst_per_block, 0, 1);
                rys_roots(2, theta_fac*theta_rr, rw, nst_per_block, 0, 1);
                double sqrt_theta_fac = -sqrt(theta_fac);
                for (int irys = 0; irys < 2; ++irys) {
                    rw[ irys*2   *nst_per_block] *= theta_fac;
                    rw[(irys*2+1)*nst_per_block] *= sqrt_theta_fac;
                }
            }
            // Generated recurrences.  The order-zero terms are 1 (x), fac1 (y)
            // and wt (z); the literal `1` factors stand for the x term.
            // trr_* build powers on the i and k centres; hrr_* shift one power
            // from i to j using the rj-ri components.
            for (int irys = 0; irys < nroots; ++irys) {
                double wt = rw[(2*irys+1)*nst_per_block];
                double rt = rw[ 2*irys   *nst_per_block];
                double rt_aa = rt / (aij + ak);
                double b00 = .5 * rt_aa;
                double rt_ak = rt_aa * aij;
                double cpx = xpq*rt_ak;
                double rt_aij = rt_aa * ak;
                double b10 = .5/aij * (1 - rt_aij);
                double c0x = rjri[0*nst_per_block] * aj_aij - xpq*rt_aij;
                double trr_10x = c0x * 1;
                double trr_20x = c0x * trr_10x + 1*b10 * 1;
                double trr_21x = cpx * trr_20x + 2*b00 * trr_10x;
                double trr_11x = cpx * trr_10x + 1*b00 * 1;
                double hrr_111x = trr_21x - xjxi * trr_11x;
                gout0 += hrr_111x * fac1 * wt;
                double trr_01x = cpx * 1;
                double hrr_011x = trr_11x - xjxi * trr_01x;
                double c0y = rjri[1*nst_per_block] * aj_aij - ypq*rt_aij;
                double trr_10y = c0y * fac1;
                gout1 += hrr_011x * trr_10y * wt;
                double c0z = rjri[2*nst_per_block] * aj_aij - zpq*rt_aij;
                double trr_10z = c0z * wt;
                gout2 += hrr_011x * fac1 * trr_10z;
                double hrr_010y = trr_10y - yjyi * fac1;
                gout3 += trr_11x * hrr_010y * wt;
                double trr_20y = c0y * trr_10y + 1*b10 * fac1;
                double hrr_110y = trr_20y - yjyi * trr_10y;
                gout4 += trr_01x * hrr_110y * wt;
                gout5 += trr_01x * hrr_010y * trr_10z;
                double hrr_010z = trr_10z - zjzi * wt;
                gout6 += trr_11x * fac1 * hrr_010z;
                gout7 += trr_01x * trr_10y * hrr_010z;
                double trr_20z = c0z * trr_10z + 1*b10 * wt;
                double hrr_110z = trr_20z - zjzi * trr_10z;
                gout8 += trr_01x * fac1 * hrr_110z;
                double hrr_110x = trr_20x - xjxi * trr_10x;
                double cpy = ypq*rt_ak;
                double trr_01y = cpy * fac1;
                gout9 += hrr_110x * trr_01y * wt;
                double hrr_010x = trr_10x - xjxi * 1;
                double trr_11y = cpy * trr_10y + 1*b00 * fac1;
                gout10 += hrr_010x * trr_11y * wt;
                gout11 += hrr_010x * trr_01y * trr_10z;
                double hrr_011y = trr_11y - yjyi * trr_01y;
                gout12 += trr_10x * hrr_011y * wt;
                double trr_21y = cpy * trr_20y + 2*b00 * trr_10y;
                double hrr_111y = trr_21y - yjyi * trr_11y;
                gout13 += 1 * hrr_111y * wt;
                gout14 += 1 * hrr_011y * trr_10z;
                gout15 += trr_10x * trr_01y * hrr_010z;
                gout16 += 1 * trr_11y * hrr_010z;
                gout17 += 1 * trr_01y * hrr_110z;
                double cpz = zpq*rt_ak;
                double trr_01z = cpz * wt;
                gout18 += hrr_110x * fac1 * trr_01z;
                gout19 += hrr_010x * trr_10y * trr_01z;
                double trr_11z = cpz * trr_10z + 1*b00 * wt;
                gout20 += hrr_010x * fac1 * trr_11z;
                gout21 += trr_10x * hrr_010y * trr_01z;
                gout22 += 1 * hrr_110y * trr_01z;
                gout23 += 1 * hrr_010y * trr_11z;
                double hrr_011z = trr_11z - zjzi * trr_01z;
                gout24 += trr_10x * fac1 * hrr_011z;
                gout25 += 1 * trr_10y * hrr_011z;
                double trr_21z = cpz * trr_20z + 2*b00 * trr_10z;
                double hrr_111z = trr_21z - zjzi * trr_11z;
                gout26 += 1 * fac1 * hrr_111z;
            }
        }
        // Row stride of the output is naux; 64-bit arithmetic for the row offset.
        // gout is accumulated k-major (gout[9*k + ij]) and stored as [ij][k].
        double *eri_tensor = out_local + k0 + (long long)shl_pair_in_block * 9 * naux + ksh_in_block * 3;
        eri_tensor[0*naux + 0] = gout0;
        eri_tensor[0*naux + 1] = gout9;
        eri_tensor[0*naux + 2] = gout18;
        eri_tensor[1*naux + 0] = gout1;
        eri_tensor[1*naux + 1] = gout10;
        eri_tensor[1*naux + 2] = gout19;
        eri_tensor[2*naux + 0] = gout2;
        eri_tensor[2*naux + 1] = gout11;
        eri_tensor[2*naux + 2] = gout20;
        eri_tensor[3*naux + 0] = gout3;
        eri_tensor[3*naux + 1] = gout12;
        eri_tensor[3*naux + 2] = gout21;
        eri_tensor[4*naux + 0] = gout4;
        eri_tensor[4*naux + 1] = gout13;
        eri_tensor[4*naux + 2] = gout22;
        eri_tensor[5*naux + 0] = gout5;
        eri_tensor[5*naux + 1] = gout14;
        eri_tensor[5*naux + 2] = gout23;
        eri_tensor[6*naux + 0] = gout6;
        eri_tensor[6*naux + 1] = gout15;
        eri_tensor[6*naux + 2] = gout24;
        eri_tensor[7*naux + 0] = gout7;
        eri_tensor[7*naux + 1] = gout16;
        eri_tensor[7*naux + 2] = gout25;
        eri_tensor[8*naux + 0] = gout8;
        eri_tensor[8*naux + 1] = gout17;
        eri_tensor[8*naux + 2] = gout26;
    }
}
// Unrolled 3-center 2-electron integral kernel producing 6 (ij) x 3 (k)
// components per (shell pair, auxiliary shell) task.  The "201" suffix appears
// to encode the angular momenta (li, lj, lk) = (2, 0, 1) -- consistent with the
// 6x3 output written below.
//
// out    : output buffer; for each task this kernel writes a 6x3 tile whose
//          rows are naux doubles apart, starting at offset
//          (ao_pair_loc[block] + shl_pair_in_block*6) * naux + k0 + ksh_in_block*3.
// envs   : basis description (bas, env, ao_loc, nbas).
// bounds : tiling information (shl_pair_offsets, ksh_offsets, bas_ij_idx,
//          ao_pair_loc, naux).
//
// One thread block handles one (shell-pair block, k-shell block) tile; the
// threads of the block stride over the nshl_pair*nksh tasks of that tile.
// Dynamic shared memory (rw_buffer) is laid out with stride nst_per_block so
// that thread st_id owns elements rw_buffer[st_id + k*nst_per_block].
__device__
void int3c2e_bdiv_201(double *out, Int3c2eEnvVars envs, BDiv3c2eBounds bounds)
{
    // For better load balance, consume blocks in the reversed order
    int sp_block_id = gridDim.x - blockIdx.x - 1;
    int ksh_block_id = gridDim.y - blockIdx.y - 1;
    int shl_pair0 = bounds.shl_pair_offsets[sp_block_id];
    int shl_pair1 = bounds.shl_pair_offsets[sp_block_id+1];
    int ksh0 = bounds.ksh_offsets[ksh_block_id];
    int ksh1 = bounds.ksh_offsets[ksh_block_id+1];
    int nksh = ksh1 - ksh0;
    int nshl_pair = shl_pair1 - shl_pair0;
    int bas_ij0 = bounds.bas_ij_idx[shl_pair0];
    int nbas = envs.nbas;
    int ish0 = bas_ij0 / nbas;
    int jsh0 = bas_ij0 % nbas;

    int nst_per_block = blockDim.x;
    int st_id = threadIdx.x;
    int *bas = envs.bas;
    // Primitive counts are read from the first shells of the tile and reused
    // for every task in it (assumes the tile is homogeneous -- TODO confirm
    // against the host-side batching code).
    int iprim = bas[ish0*BAS_SLOTS+NPRIM_OF];
    int jprim = bas[jsh0*BAS_SLOTS+NPRIM_OF];
    int kprim = bas[ksh0*BAS_SLOTS+NPRIM_OF];
    int ijprim = iprim * jprim;
    int ijkprim = ijprim * kprim;
    int nroots = 2;
    double *env = envs.env;
    double omega = env[PTR_RANGE_OMEGA];
    // Loop invariant: only consumed by the omega != 0 branches below.
    double omega2 = omega * omega;
    if (omega < 0) {
        // Negative omega combines two quadratures (see the last branch in the
        // primitive loop), so twice as many (root, weight) pairs are needed.
        nroots *= 2;
    }
    extern __shared__ double rw_buffer[];
    double *rw = rw_buffer + st_id;
    // Per-thread scratch for rj-ri and |rj-ri|^2, placed after the roots/weights.
    double *rjri = rw + nst_per_block * nroots*2;
    int naux = bounds.naux;
    // Widen to 64-bit before multiplying: the product of a pair offset and
    // naux can exceed INT_MAX for large output tensors.
    double *out_local = out + (long long)bounds.ao_pair_loc[sp_block_id] * naux;
    // Offset of the first k shell of this tile, relative to ao_loc[nbas]
    // (hoisted out of the task loop; it does not depend on the task).
    int *ao_loc = envs.ao_loc;
    int k0 = ao_loc[ksh0] - ao_loc[nbas];

    int nst = nshl_pair * nksh;
    for (int ijk_idx = st_id; ijk_idx < nst; ijk_idx += nst_per_block) {
        int shl_pair_in_block = ijk_idx / nksh;
        int ksh_in_block = ijk_idx % nksh;
        int ksh = ksh_in_block + ksh0;
        int bas_ij = bounds.bas_ij_idx[shl_pair_in_block + shl_pair0];
        int ish = bas_ij / nbas;
        int jsh = bas_ij % nbas;
        double *expi = env + bas[ish*BAS_SLOTS+PTR_EXP];
        double *expj = env + bas[jsh*BAS_SLOTS+PTR_EXP];
        double *expk = env + bas[ksh*BAS_SLOTS+PTR_EXP];
        double *ci = env + bas[ish*BAS_SLOTS+PTR_COEFF];
        double *cj = env + bas[jsh*BAS_SLOTS+PTR_COEFF];
        double *ck = env + bas[ksh*BAS_SLOTS+PTR_COEFF];
        double *ri = env + bas[ish*BAS_SLOTS+PTR_BAS_COORD];
        double *rj = env + bas[jsh*BAS_SLOTS+PTR_BAS_COORD];
        double *rk = env + bas[ksh*BAS_SLOTS+PTR_BAS_COORD];
        double xjxi = rj[0] - ri[0];
        double yjyi = rj[1] - ri[1];
        double zjzi = rj[2] - ri[2];
        double rr_ij = xjxi*xjxi + yjyi*yjyi + zjzi*zjzi;
        rjri[0*nst_per_block] = xjxi;
        rjri[1*nst_per_block] = yjyi;
        rjri[2*nst_per_block] = zjzi;
        rjri[3*nst_per_block] = rr_ij;
        double gout0 = 0;
        double gout1 = 0;
        double gout2 = 0;
        double gout3 = 0;
        double gout4 = 0;
        double gout5 = 0;
        double gout6 = 0;
        double gout7 = 0;
        double gout8 = 0;
        double gout9 = 0;
        double gout10 = 0;
        double gout11 = 0;
        double gout12 = 0;
        double gout13 = 0;
        double gout14 = 0;
        double gout15 = 0;
        double gout16 = 0;
        double gout17 = 0;
        // Flattened loop over primitive triples; kp runs fastest, then jp, then ip.
        for (int ijkp = 0; ijkp < ijkprim; ++ijkp) {
            int ijp = ijkp / kprim;
            int kp = ijkp % kprim;
            int ip = ijp / jprim;
            int jp = ijp % jprim;
            double ai = expi[ip];
            double aj = expj[jp];
            double ak = expk[kp];
            double aij = ai + aj;
            double cijk = ci[ip] * cj[jp] * ck[kp];
            double fac = PI_FAC * cijk / (aij*ak*sqrt(aij+ak));
            double aj_aij = aj / aij;
            double theta_ij = ai * aj_aij;
            double Kab = theta_ij * rjri[3*nst_per_block];
            double fac1 = fac * exp(-Kab);
            // (xij, yij, zij): exponent-weighted centre of the ij pair.
            double xij = rjri[0*nst_per_block] * aj_aij + ri[0];
            double yij = rjri[1*nst_per_block] * aj_aij + ri[1];
            double zij = rjri[2*nst_per_block] * aj_aij + ri[2];
            double xpq = xij - rk[0];
            double ypq = yij - rk[1];
            double zpq = zij - rk[2];
            double rr = xpq * xpq + ypq * ypq + zpq * zpq;
            double theta = aij * ak / (aij + ak);
            double theta_rr = theta * rr;
            // rys_roots is defined elsewhere; judging from how rw is read back
            // below, it leaves (root, weight) pairs interleaved in rw with
            // stride nst_per_block.
            if (omega == 0) {
                rys_roots(2, theta_rr, rw, nst_per_block, 0, 1);
            } else if (omega > 0) {
                double theta_fac = omega2 / (omega2 + theta);
                rys_roots(2, theta_fac*theta_rr, rw, nst_per_block, 0, 1);
                double sqrt_theta_fac = sqrt(theta_fac);
                for (int irys = 0; irys < 2; ++irys) {
                    rw[ irys*2   *nst_per_block] *= theta_fac;
                    rw[(irys*2+1)*nst_per_block] *= sqrt_theta_fac;
                }
            } else {
                // omega < 0: the second half of rw holds the unscaled
                // quadrature, the first half the omega-scaled one with negated
                // weights, so the root loop below accumulates their difference.
                double theta_fac = omega2 / (omega2 + theta);
                double *rw1 = rw + 4*nst_per_block;
                rys_roots(2, theta_rr, rw1, nst_per_block, 0, 1);
                rys_roots(2, theta_fac*theta_rr, rw, nst_per_block, 0, 1);
                double sqrt_theta_fac = -sqrt(theta_fac);
                for (int irys = 0; irys < 2; ++irys) {
                    rw[ irys*2   *nst_per_block] *= theta_fac;
                    rw[(irys*2+1)*nst_per_block] *= sqrt_theta_fac;
                }
            }
            // Generated recurrences.  The order-zero terms are 1 (x), fac1 (y)
            // and wt (z); the literal `1` factors stand for the x term.
            for (int irys = 0; irys < nroots; ++irys) {
                double wt = rw[(2*irys+1)*nst_per_block];
                double rt = rw[ 2*irys   *nst_per_block];
                double rt_aa = rt / (aij + ak);
                double b00 = .5 * rt_aa;
                double rt_ak = rt_aa * aij;
                double cpx = xpq*rt_ak;
                double rt_aij = rt_aa * ak;
                double b10 = .5/aij * (1 - rt_aij);
                double c0x = rjri[0*nst_per_block] * aj_aij - xpq*rt_aij;
                double trr_10x = c0x * 1;
                double trr_20x = c0x * trr_10x + 1*b10 * 1;
                double trr_21x = cpx * trr_20x + 2*b00 * trr_10x;
                gout0 += trr_21x * fac1 * wt;
                double trr_11x = cpx * trr_10x + 1*b00 * 1;
                double c0y = rjri[1*nst_per_block] * aj_aij - ypq*rt_aij;
                double trr_10y = c0y * fac1;
                gout1 += trr_11x * trr_10y * wt;
                double c0z = rjri[2*nst_per_block] * aj_aij - zpq*rt_aij;
                double trr_10z = c0z * wt;
                gout2 += trr_11x * fac1 * trr_10z;
                double trr_01x = cpx * 1;
                double trr_20y = c0y * trr_10y + 1*b10 * fac1;
                gout3 += trr_01x * trr_20y * wt;
                gout4 += trr_01x * trr_10y * trr_10z;
                double trr_20z = c0z * trr_10z + 1*b10 * wt;
                gout5 += trr_01x * fac1 * trr_20z;
                double cpy = ypq*rt_ak;
                double trr_01y = cpy * fac1;
                gout6 += trr_20x * trr_01y * wt;
                double trr_11y = cpy * trr_10y + 1*b00 * fac1;
                gout7 += trr_10x * trr_11y * wt;
                gout8 += trr_10x * trr_01y * trr_10z;
                double trr_21y = cpy * trr_20y + 2*b00 * trr_10y;
                gout9 += 1 * trr_21y * wt;
                gout10 += 1 * trr_11y * trr_10z;
                gout11 += 1 * trr_01y * trr_20z;
                double cpz = zpq*rt_ak;
                double trr_01z = cpz * wt;
                gout12 += trr_20x * fac1 * trr_01z;
                gout13 += trr_10x * trr_10y * trr_01z;
                double trr_11z = cpz * trr_10z + 1*b00 * wt;
                gout14 += trr_10x * fac1 * trr_11z;
                gout15 += 1 * trr_20y * trr_01z;
                gout16 += 1 * trr_10y * trr_11z;
                double trr_21z = cpz * trr_20z + 2*b00 * trr_10z;
                gout17 += 1 * fac1 * trr_21z;
            }
        }
        // Row stride of the output is naux; 64-bit arithmetic for the row offset.
        // gout is accumulated k-major (gout[6*k + ij]) and stored as [ij][k].
        double *eri_tensor = out_local + k0 + (long long)shl_pair_in_block * 6 * naux + ksh_in_block * 3;
        eri_tensor[0*naux + 0] = gout0;
        eri_tensor[0*naux + 1] = gout6;
        eri_tensor[0*naux + 2] = gout12;
        eri_tensor[1*naux + 0] = gout1;
        eri_tensor[1*naux + 1] = gout7;
        eri_tensor[1*naux + 2] = gout13;
        eri_tensor[2*naux + 0] = gout2;
        eri_tensor[2*naux + 1] = gout8;
        eri_tensor[2*naux + 2] = gout14;
        eri_tensor[3*naux + 0] = gout3;
        eri_tensor[3*naux + 1] = gout9;
        eri_tensor[3*naux + 2] = gout15;
        eri_tensor[4*naux + 0] = gout4;
        eri_tensor[4*naux + 1] = gout10;
        eri_tensor[4*naux + 2] = gout16;
        eri_tensor[5*naux + 0] = gout5;
        eri_tensor[5*naux + 1] = gout11;
        eri_tensor[5*naux + 2] = gout17;
    }
}
ksh_block_id = gridDim.y - blockIdx.y - 1; + int shl_pair0 = bounds.shl_pair_offsets[sp_block_id]; + int shl_pair1 = bounds.shl_pair_offsets[sp_block_id+1]; + int ksh0 = bounds.ksh_offsets[ksh_block_id]; + int ksh1 = bounds.ksh_offsets[ksh_block_id+1]; + int nksh = ksh1 - ksh0; + int nshl_pair = shl_pair1 - shl_pair0; + int bas_ij0 = bounds.bas_ij_idx[shl_pair0]; + int nbas = envs.nbas; + int ish0 = bas_ij0 / nbas; + int jsh0 = bas_ij0 % nbas; + + int nst_per_block = blockDim.x; + int st_id = threadIdx.x; + int *bas = envs.bas; + int iprim = bas[ish0*BAS_SLOTS+NPRIM_OF]; + int jprim = bas[jsh0*BAS_SLOTS+NPRIM_OF]; + int kprim = bas[ksh0*BAS_SLOTS+NPRIM_OF]; + int ijprim = iprim * jprim; + int ijkprim = ijprim * kprim; + int nroots = 3; + double *env = envs.env; + double omega = env[PTR_RANGE_OMEGA]; + if (omega < 0) { + nroots *= 2; + } + extern __shared__ double rw_buffer[]; + double *rw = rw_buffer + st_id; + double *rjri = rw + nst_per_block * nroots*2; + int naux = bounds.naux; + double *out_local = out + bounds.ao_pair_loc[sp_block_id] * naux; + + int nst = nshl_pair * nksh; + for (int ijk_idx = st_id; ijk_idx < nst; ijk_idx += nst_per_block) { + int shl_pair_in_block = ijk_idx / nksh; + int ksh_in_block = ijk_idx % nksh; + int ksh = ksh_in_block + ksh0; + int bas_ij = bounds.bas_ij_idx[shl_pair_in_block + shl_pair0]; + int ish = bas_ij / nbas; + int jsh = bas_ij % nbas; + double *expi = env + bas[ish*BAS_SLOTS+PTR_EXP]; + double *expj = env + bas[jsh*BAS_SLOTS+PTR_EXP]; + double *expk = env + bas[ksh*BAS_SLOTS+PTR_EXP]; + double *ci = env + bas[ish*BAS_SLOTS+PTR_COEFF]; + double *cj = env + bas[jsh*BAS_SLOTS+PTR_COEFF]; + double *ck = env + bas[ksh*BAS_SLOTS+PTR_COEFF]; + double *ri = env + bas[ish*BAS_SLOTS+PTR_BAS_COORD]; + double *rj = env + bas[jsh*BAS_SLOTS+PTR_BAS_COORD]; + double *rk = env + bas[ksh*BAS_SLOTS+PTR_BAS_COORD]; + double xjxi = rj[0] - ri[0]; + double yjyi = rj[1] - ri[1]; + double zjzi = rj[2] - ri[2]; + double rr_ij = xjxi*xjxi + 
yjyi*yjyi + zjzi*zjzi; + rjri[0*nst_per_block] = rj[0] - ri[0]; + rjri[1*nst_per_block] = rj[1] - ri[1]; + rjri[2*nst_per_block] = rj[2] - ri[2]; + rjri[3*nst_per_block] = rr_ij; + double gout0 = 0; + double gout1 = 0; + double gout2 = 0; + double gout3 = 0; + double gout4 = 0; + double gout5 = 0; + double gout6 = 0; + double gout7 = 0; + double gout8 = 0; + double gout9 = 0; + double gout10 = 0; + double gout11 = 0; + double gout12 = 0; + double gout13 = 0; + double gout14 = 0; + double gout15 = 0; + double gout16 = 0; + double gout17 = 0; + double gout18 = 0; + double gout19 = 0; + double gout20 = 0; + double gout21 = 0; + double gout22 = 0; + double gout23 = 0; + double gout24 = 0; + double gout25 = 0; + double gout26 = 0; + double gout27 = 0; + double gout28 = 0; + double gout29 = 0; + double gout30 = 0; + double gout31 = 0; + double gout32 = 0; + double gout33 = 0; + double gout34 = 0; + double gout35 = 0; + double gout36 = 0; + double gout37 = 0; + double gout38 = 0; + double gout39 = 0; + double gout40 = 0; + double gout41 = 0; + double gout42 = 0; + double gout43 = 0; + double gout44 = 0; + double gout45 = 0; + double gout46 = 0; + double gout47 = 0; + double gout48 = 0; + double gout49 = 0; + double gout50 = 0; + double gout51 = 0; + double gout52 = 0; + double gout53 = 0; + for (int ijkp = 0; ijkp < ijkprim; ++ijkp) { + int ijp = ijkp / kprim; + int kp = ijkp % kprim; + int ip = ijp / jprim; + int jp = ijp % jprim; + double ai = expi[ip]; + double aj = expj[jp]; + double ak = expk[kp]; + double aij = ai + aj; + double cijk = ci[ip] * cj[jp] * ck[kp]; + double fac = PI_FAC * cijk / (aij*ak*sqrt(aij+ak)); + double aj_aij = aj / aij; + double theta_ij = ai * aj_aij; + double Kab = theta_ij * rjri[3*nst_per_block]; + double fac1 = fac * exp(-Kab); + double xij = rjri[0*nst_per_block] * aj_aij + ri[0]; + double yij = rjri[1*nst_per_block] * aj_aij + ri[1]; + double zij = rjri[2*nst_per_block] * aj_aij + ri[2]; + double xpq = xij - rk[0]; + double ypq = yij - 
rk[1]; + double zpq = zij - rk[2]; + double rr = xpq * xpq + ypq * ypq + zpq * zpq; + double theta = aij * ak / (aij + ak); + double theta_rr = theta * rr; + if (omega == 0) { + rys_roots(3, theta_rr, rw, nst_per_block, 0, 1); + } else if (omega > 0) { + double omega2 = omega * omega; + double theta_fac = omega2 / (omega2 + theta); + rys_roots(3, theta_fac*theta_rr, rw, nst_per_block, 0, 1); + double sqrt_theta_fac = sqrt(theta_fac); + for (int irys = 0; irys < 3; ++irys) { + rw[ irys*2 *nst_per_block] *= theta_fac; + rw[(irys*2+1)*nst_per_block] *= sqrt_theta_fac; + } + } else { + double omega2 = omega * omega; + double theta_fac = omega2 / (omega2 + theta); + double *rw1 = rw + 6*nst_per_block; + rys_roots(3, theta_rr, rw1, nst_per_block, 0, 1); + rys_roots(3, theta_fac*theta_rr, rw, nst_per_block, 0, 1); + double sqrt_theta_fac = -sqrt(theta_fac); + for (int irys = 0; irys < 3; ++irys) { + rw[ irys*2 *nst_per_block] *= theta_fac; + rw[(irys*2+1)*nst_per_block] *= sqrt_theta_fac; + } + } + for (int irys = 0; irys < nroots; ++irys) { + double wt = rw[(2*irys+1)*nst_per_block]; + double rt = rw[ 2*irys *nst_per_block]; + double rt_aa = rt / (aij + ak); + double b00 = .5 * rt_aa; + double rt_ak = rt_aa * aij; + double cpx = xpq*rt_ak; + double rt_aij = rt_aa * ak; + double b10 = .5/aij * (1 - rt_aij); + double c0x = rjri[0*nst_per_block] * aj_aij - xpq*rt_aij; + double trr_10x = c0x * 1; + double trr_20x = c0x * trr_10x + 1*b10 * 1; + double trr_30x = c0x * trr_20x + 2*b10 * trr_10x; + double trr_31x = cpx * trr_30x + 3*b00 * trr_20x; + double trr_21x = cpx * trr_20x + 2*b00 * trr_10x; + double hrr_211x = trr_31x - xjxi * trr_21x; + gout0 += hrr_211x * fac1 * wt; + double trr_11x = cpx * trr_10x + 1*b00 * 1; + double hrr_111x = trr_21x - xjxi * trr_11x; + double c0y = rjri[1*nst_per_block] * aj_aij - ypq*rt_aij; + double trr_10y = c0y * fac1; + gout1 += hrr_111x * trr_10y * wt; + double c0z = rjri[2*nst_per_block] * aj_aij - zpq*rt_aij; + double trr_10z = c0z * wt; 
+ gout2 += hrr_111x * fac1 * trr_10z; + double trr_01x = cpx * 1; + double hrr_011x = trr_11x - xjxi * trr_01x; + double trr_20y = c0y * trr_10y + 1*b10 * fac1; + gout3 += hrr_011x * trr_20y * wt; + gout4 += hrr_011x * trr_10y * trr_10z; + double trr_20z = c0z * trr_10z + 1*b10 * wt; + gout5 += hrr_011x * fac1 * trr_20z; + double hrr_010y = trr_10y - yjyi * fac1; + gout6 += trr_21x * hrr_010y * wt; + double hrr_110y = trr_20y - yjyi * trr_10y; + gout7 += trr_11x * hrr_110y * wt; + gout8 += trr_11x * hrr_010y * trr_10z; + double trr_30y = c0y * trr_20y + 2*b10 * trr_10y; + double hrr_210y = trr_30y - yjyi * trr_20y; + gout9 += trr_01x * hrr_210y * wt; + gout10 += trr_01x * hrr_110y * trr_10z; + gout11 += trr_01x * hrr_010y * trr_20z; + double hrr_010z = trr_10z - zjzi * wt; + gout12 += trr_21x * fac1 * hrr_010z; + gout13 += trr_11x * trr_10y * hrr_010z; + double hrr_110z = trr_20z - zjzi * trr_10z; + gout14 += trr_11x * fac1 * hrr_110z; + gout15 += trr_01x * trr_20y * hrr_010z; + gout16 += trr_01x * trr_10y * hrr_110z; + double trr_30z = c0z * trr_20z + 2*b10 * trr_10z; + double hrr_210z = trr_30z - zjzi * trr_20z; + gout17 += trr_01x * fac1 * hrr_210z; + double hrr_210x = trr_30x - xjxi * trr_20x; + double cpy = ypq*rt_ak; + double trr_01y = cpy * fac1; + gout18 += hrr_210x * trr_01y * wt; + double hrr_110x = trr_20x - xjxi * trr_10x; + double trr_11y = cpy * trr_10y + 1*b00 * fac1; + gout19 += hrr_110x * trr_11y * wt; + gout20 += hrr_110x * trr_01y * trr_10z; + double hrr_010x = trr_10x - xjxi * 1; + double trr_21y = cpy * trr_20y + 2*b00 * trr_10y; + gout21 += hrr_010x * trr_21y * wt; + gout22 += hrr_010x * trr_11y * trr_10z; + gout23 += hrr_010x * trr_01y * trr_20z; + double hrr_011y = trr_11y - yjyi * trr_01y; + gout24 += trr_20x * hrr_011y * wt; + double hrr_111y = trr_21y - yjyi * trr_11y; + gout25 += trr_10x * hrr_111y * wt; + gout26 += trr_10x * hrr_011y * trr_10z; + double trr_31y = cpy * trr_30y + 3*b00 * trr_20y; + double hrr_211y = trr_31y - yjyi * 
trr_21y; + gout27 += 1 * hrr_211y * wt; + gout28 += 1 * hrr_111y * trr_10z; + gout29 += 1 * hrr_011y * trr_20z; + gout30 += trr_20x * trr_01y * hrr_010z; + gout31 += trr_10x * trr_11y * hrr_010z; + gout32 += trr_10x * trr_01y * hrr_110z; + gout33 += 1 * trr_21y * hrr_010z; + gout34 += 1 * trr_11y * hrr_110z; + gout35 += 1 * trr_01y * hrr_210z; + double cpz = zpq*rt_ak; + double trr_01z = cpz * wt; + gout36 += hrr_210x * fac1 * trr_01z; + gout37 += hrr_110x * trr_10y * trr_01z; + double trr_11z = cpz * trr_10z + 1*b00 * wt; + gout38 += hrr_110x * fac1 * trr_11z; + gout39 += hrr_010x * trr_20y * trr_01z; + gout40 += hrr_010x * trr_10y * trr_11z; + double trr_21z = cpz * trr_20z + 2*b00 * trr_10z; + gout41 += hrr_010x * fac1 * trr_21z; + gout42 += trr_20x * hrr_010y * trr_01z; + gout43 += trr_10x * hrr_110y * trr_01z; + gout44 += trr_10x * hrr_010y * trr_11z; + gout45 += 1 * hrr_210y * trr_01z; + gout46 += 1 * hrr_110y * trr_11z; + gout47 += 1 * hrr_010y * trr_21z; + double hrr_011z = trr_11z - zjzi * trr_01z; + gout48 += trr_20x * fac1 * hrr_011z; + gout49 += trr_10x * trr_10y * hrr_011z; + double hrr_111z = trr_21z - zjzi * trr_11z; + gout50 += trr_10x * fac1 * hrr_111z; + gout51 += 1 * trr_20y * hrr_011z; + gout52 += 1 * trr_10y * hrr_111z; + double trr_31z = cpz * trr_30z + 3*b00 * trr_20z; + double hrr_211z = trr_31z - zjzi * trr_21z; + gout53 += 1 * fac1 * hrr_211z; + } + } + int *ao_loc = envs.ao_loc; + int k0 = ao_loc[ksh0] - ao_loc[nbas]; + double *eri_tensor = out_local + k0 + shl_pair_in_block * 18 * naux + ksh_in_block * 3; + eri_tensor[0*naux + 0] = gout0; + eri_tensor[0*naux + 1] = gout18; + eri_tensor[0*naux + 2] = gout36; + eri_tensor[1*naux + 0] = gout1; + eri_tensor[1*naux + 1] = gout19; + eri_tensor[1*naux + 2] = gout37; + eri_tensor[2*naux + 0] = gout2; + eri_tensor[2*naux + 1] = gout20; + eri_tensor[2*naux + 2] = gout38; + eri_tensor[3*naux + 0] = gout3; + eri_tensor[3*naux + 1] = gout21; + eri_tensor[3*naux + 2] = gout39; + eri_tensor[4*naux + 0] 
__device__
void int3c2e_bdiv_221(double *out, Int3c2eEnvVars envs, BDiv3c2eBounds bounds)
{
    // Unrolled 3-center integral kernel producing a 36 x 3 tile per
    // (shell-pair, k-shell) task.
    // NOTE(review): the "221" suffix presumably encodes (li, lj, lk) =
    // (2, 2, 1) -- confirm against the dispatcher.
    //
    // Threading: threadIdx.x is split into st_id (0..63, selects the task)
    // and gout_id (threadIdx.x / 64). The code below is written for four
    // gout groups (rys_roots(..., gout_id, 4), switch cases 0..3), i.e. it
    // assumes blockDim.x == 256 -- TODO confirm at the launch site.
    // The four threads sharing one st_id cooperate through shared memory and
    // each accumulates 27 of the 108 outputs.
    int sp_block_id = gridDim.x - blockIdx.x - 1;
    int ksh_block_id = gridDim.y - blockIdx.y - 1;
    int shl_pair0 = bounds.shl_pair_offsets[sp_block_id];
    int shl_pair1 = bounds.shl_pair_offsets[sp_block_id+1];
    int ksh0 = bounds.ksh_offsets[ksh_block_id];
    int ksh1 = bounds.ksh_offsets[ksh_block_id+1];
    int nksh = ksh1 - ksh0;
    int nshl_pair = shl_pair1 - shl_pair0;
    int bas_ij0 = bounds.bas_ij_idx[shl_pair0];
    int nbas = envs.nbas;
    int ish0 = bas_ij0 / nbas;
    int jsh0 = bas_ij0 % nbas;

    int thread_id = threadIdx.x;
    int st_id = thread_id % 64;
    int gout_id = thread_id / 64;
    int *bas = envs.bas;
    // Contraction lengths come from the first pair / first k-shell of the
    // block and are reused for every task in the block.
    int iprim = bas[ish0*BAS_SLOTS+NPRIM_OF];
    int jprim = bas[jsh0*BAS_SLOTS+NPRIM_OF];
    int kprim = bas[ksh0*BAS_SLOTS+NPRIM_OF];
    int ijprim = iprim * jprim;
    int ijkprim = ijprim * kprim;
    int nroots = 3;
    double *env = envs.env;
    double omega = env[PTR_RANGE_OMEGA];
    // FIX: the omega < 0 branch below stores a second set of 3 roots at
    // rw + 384. Without doubling nroots here (as the sibling kernels do),
    // gx would start at rw + 384 too, so those roots would overwrite gx and
    // the quadrature loop would never visit them.
    // NOTE(review): the launch must size dynamic shared memory for the
    // doubled root count when omega < 0 -- confirm on the host side.
    if (omega < 0) {
        nroots *= 2;
    }
    // Shared-memory layout (per-task column st_id, element stride 64):
    //   rw   : nroots*2 rows of interleaved (root, weight)
    //   gx/gy/gz : 18 rows each (1152 = 18*64) of 1D recurrence values
    //   Rpq  : 3 rows, rjri : 4 rows (rj - ri and its squared norm)
    extern __shared__ double rw_cache[];
    double *rw = rw_cache + st_id;
    double *gx = rw + nroots * 128;
    double *gy = gx + 1152;
    double *gz = gy + 1152;
    double *Rpq = gz + 1152;
    double *rjri = Rpq + 192;
    int naux = bounds.naux;
    double *out_local = out + bounds.ao_pair_loc[sp_block_id] * naux;

    // gx[0] seeds the x recurrence: 1 for real tasks, 0 for padding tasks.
    if (gout_id == 0) {
        gx[0] = 1.;
    }

    int nst = nshl_pair * nksh;
    // The bound nst+st_id makes every thread run the same number of
    // iterations (ceil(nst/64)), which the __syncthreads() calls require.
    for (int ijk_idx = st_id; ijk_idx < nst+st_id; ijk_idx += 64) {
        int shl_pair_in_block = ijk_idx / nksh;
        int ksh_in_block = ijk_idx % nksh;
        if (ijk_idx >= nst) {
            // Padding task: keep indices valid and zero its contribution.
            shl_pair_in_block = 0;
            if (gout_id == 0) {
                gx[0] = 0.;
            }
        }
        int ksh = ksh_in_block + ksh0;
        int bas_ij = bounds.bas_ij_idx[shl_pair_in_block + shl_pair0];
        int ish = bas_ij / nbas;
        int jsh = bas_ij % nbas;
        double *expi = env + bas[ish*BAS_SLOTS+PTR_EXP];
        double *expj = env + bas[jsh*BAS_SLOTS+PTR_EXP];
        double *expk = env + bas[ksh*BAS_SLOTS+PTR_EXP];
        double *ci = env + bas[ish*BAS_SLOTS+PTR_COEFF];
        double *cj = env + bas[jsh*BAS_SLOTS+PTR_COEFF];
        double *ck = env + bas[ksh*BAS_SLOTS+PTR_COEFF];
        double *ri = env + bas[ish*BAS_SLOTS+PTR_BAS_COORD];
        double *rj = env + bas[jsh*BAS_SLOTS+PTR_BAS_COORD];
        double *rk = env + bas[ksh*BAS_SLOTS+PTR_BAS_COORD];
        if (gout_id == 0) {
            double xjxi = rj[0] - ri[0];
            double yjyi = rj[1] - ri[1];
            double zjzi = rj[2] - ri[2];
            double rr_ij = xjxi*xjxi + yjyi*yjyi + zjzi*zjzi;
            rjri[0] = xjxi;
            rjri[64] = yjyi;
            rjri[128] = zjzi;
            rjri[192] = rr_ij;
        }
        // This thread's 27 partial results; the mapping to output elements
        // is applied when storing (see the end of the task loop).
        double gout[27] = {0};
        double s0, s1, s2;
        for (int ijkp = 0; ijkp < ijkprim; ++ijkp) {
            int ijp = ijkp / kprim;
            int kp = ijkp % kprim;
            int ip = ijp / jprim;
            int jp = ijp % jprim;
            double ai = expi[ip];
            double aj = expj[jp];
            double ak = expk[kp];
            double aij = ai + aj;
            double aj_aij = aj / aij;
            // Makes rjri visible and protects gy[0]/Rpq from being
            // rewritten while other threads still read the previous values.
            __syncthreads();
            double xij = rjri[0] * aj_aij + ri[0];
            double yij = rjri[64] * aj_aij + ri[1];
            double zij = rjri[128] * aj_aij + ri[2];
            double xpq = xij - rk[0];
            double ypq = yij - rk[1];
            double zpq = zij - rk[2];
            if (gout_id == 0) {
                double cijk = ci[ip] * cj[jp] * ck[kp];
                double fac = PI_FAC * cijk / (aij*ak*sqrt(aij+ak));
                double theta_ij = ai * aj_aij;
                double Kab = theta_ij * rjri[192];
                // gy[0] seeds the y recurrence with the primitive prefactor.
                gy[0] = fac * exp(-Kab);
                Rpq[0] = xpq;
                Rpq[64] = ypq;
                Rpq[128] = zpq;
            }
            double rr = xpq * xpq + ypq * ypq + zpq * zpq;
            double theta = aij * ak / (aij + ak);
            double theta_rr = theta * rr;
            if (omega == 0) {
                rys_roots(3, theta_rr, rw, 64, gout_id, 4);
            } else if (omega > 0) {
                double omega2 = omega * omega;
                double theta_fac = omega2 / (omega2 + theta);
                rys_roots(3, theta_fac*theta_rr, rw, 64, gout_id, 4);
                __syncthreads();
                double sqrt_theta_fac = sqrt(theta_fac);
                for (int irys = gout_id; irys < 3; irys+=4) {
                    rw[ irys*2   *64] *= theta_fac;
                    rw[(irys*2+1)*64] *= sqrt_theta_fac;
                }
            } else {
                double omega2 = omega * omega;
                double theta_fac = omega2 / (omega2 + theta);
                // Second root set, read below as roots 3..5 (rw[irys*128]).
                double *rw1 = rw + 384;
                rys_roots(3, theta_rr, rw1, 64, gout_id, 4);
                rys_roots(3, theta_fac*theta_rr, rw, 64, gout_id, 4);
                __syncthreads();
                // Weights of the rescaled set are negated.
                double sqrt_theta_fac = -sqrt(theta_fac);
                for (int irys = gout_id; irys < 3; irys+=4) {
                    rw[ irys*2   *64] *= theta_fac;
                    rw[(irys*2+1)*64] *= sqrt_theta_fac;
                }
            }
            for (int irys = 0; irys < nroots; ++irys) {
                // Roots/weights (and, on later passes, the previous gout
                // reads of gx/gy/gz) must be complete before rebuilding.
                __syncthreads();
                double rt = rw[irys*128];
                double rt_aa = rt / (aij + ak);
                double rt_aij = rt_aa * ak;
                double b10 = .5/aij * (1 - rt_aij);
                double rt_ak = rt_aa * aij;
                double b00 = .5 * rt_aa;
                // gout_id 0, 1, 2 build the x, y, z tables respectively;
                // gout_id 3 skips this loop.
                for (int n = gout_id; n < 3; n += 4) {
                    if (n == 2) {
                        // gz[0] seeds the z recurrence with the weight.
                        gz[0] = rw[irys*128+64];
                    }
                    double *_gx = gx + n * 1152;
                    double xjxi = rjri[n * 64];
                    double Rpa = xjxi * aj_aij;
                    double c0x = Rpa - rt_aij * Rpq[n * 64];
                    // Rows 0..4 (offsets 0..256): build-up with c0x / b10.
                    s0 = _gx[0];
                    s1 = c0x * s0;
                    _gx[64] = s1;
                    s2 = c0x * s1 + 1 * b10 * s0;
                    _gx[128] = s2;
                    s0 = s1;
                    s1 = s2;
                    s2 = c0x * s1 + 2 * b10 * s0;
                    _gx[192] = s2;
                    s0 = s1;
                    s1 = s2;
                    s2 = c0x * s1 + 3 * b10 * s0;
                    _gx[256] = s2;
                    // Offsets 576..832: one step with cpx / b00.
                    double cpx = rt_ak * Rpq[n * 64];
                    s0 = _gx[0];
                    s1 = cpx * s0;
                    _gx[576] = s1;
                    s0 = _gx[64];
                    s1 = cpx * s0;
                    s1 += 1 * b00 * _gx[0];
                    _gx[640] = s1;
                    s0 = _gx[128];
                    s1 = cpx * s0;
                    s1 += 2 * b00 * _gx[64];
                    _gx[704] = s1;
                    s0 = _gx[192];
                    s1 = cpx * s0;
                    s1 += 3 * b00 * _gx[128];
                    _gx[768] = s1;
                    s0 = _gx[256];
                    s1 = cpx * s0;
                    s1 += 4 * b00 * _gx[192];
                    _gx[832] = s1;
                    // Shift by xjxi, done in place from high to low offsets
                    // (statement order matters: slots are reused).
                    s1 = _gx[256];
                    s0 = _gx[192];
                    _gx[384] = s1 - xjxi * s0;
                    s1 = s0;
                    s0 = _gx[128];
                    _gx[320] = s1 - xjxi * s0;
                    s1 = s0;
                    s0 = _gx[64];
                    _gx[256] = s1 - xjxi * s0;
                    s1 = s0;
                    s0 = _gx[0];
                    _gx[192] = s1 - xjxi * s0;
                    s1 = _gx[384];
                    s0 = _gx[320];
                    _gx[512] = s1 - xjxi * s0;
                    s1 = s0;
                    s0 = _gx[256];
                    _gx[448] = s1 - xjxi * s0;
                    s1 = s0;
                    s0 = _gx[192];
                    _gx[384] = s1 - xjxi * s0;
                    s1 = _gx[832];
                    s0 = _gx[768];
                    _gx[960] = s1 - xjxi * s0;
                    s1 = s0;
                    s0 = _gx[704];
                    _gx[896] = s1 - xjxi * s0;
                    s1 = s0;
                    s0 = _gx[640];
                    _gx[832] = s1 - xjxi * s0;
                    s1 = s0;
                    s0 = _gx[576];
                    _gx[768] = s1 - xjxi * s0;
                    s1 = _gx[960];
                    s0 = _gx[896];
                    _gx[1088] = s1 - xjxi * s0;
                    s1 = s0;
                    s0 = _gx[832];
                    _gx[1024] = s1 - xjxi * s0;
                    s1 = s0;
                    s0 = _gx[768];
                    _gx[960] = s1 - xjxi * s0;
                }
                __syncthreads();
                // Each gout group contracts its own 27 (x, y, z) triples.
                switch (gout_id) {
                case 0:
                    gout[0] += gx[1088] * gy[0] * gz[0];
                    gout[1] += gx[448] * gy[640] * gz[0];
                    gout[2] += gx[448] * gy[0] * gz[640];
                    gout[3] += gx[960] * gy[64] * gz[64];
                    gout[4] += gx[384] * gy[576] * gz[128];
                    gout[5] += gx[320] * gy[192] * gz[576];
                    gout[6] += gx[832] * gy[192] * gz[64];
                    gout[7] += gx[192] * gy[896] * gz[0];
                    gout[8] += gx[192] * gy[256] * gz[640];
                    gout[9] += gx[896] * gy[0] * gz[192];
                    gout[10] += gx[256] * gy[640] * gz[192];
                    gout[11] += gx[256] * gy[0] * gz[832];
                    gout[12] += gx[768] * gy[64] * gz[256];
                    gout[13] += gx[192] * gy[576] * gz[320];
                    gout[14] += gx[128] * gy[384] * gz[576];
                    gout[15] += gx[640] * gy[384] * gz[64];
                    gout[16] += gx[0] * gy[1088] * gz[0];
                    gout[17] += gx[0] * gy[448] * gz[640];
                    gout[18] += gx[704] * gy[192] * gz[192];
                    gout[19] += gx[64] * gy[832] * gz[192];
                    gout[20] += gx[64] * gy[192] * gz[832];
                    gout[21] += gx[576] * gy[256] * gz[256];
                    gout[22] += gx[0] * gy[768] * gz[320];
                    gout[23] += gx[128] * gy[0] * gz[960];
                    gout[24] += gx[640] * gy[0] * gz[448];
                    gout[25] += gx[0] * gy[704] * gz[384];
                    gout[26] += gx[0] * gy[64] * gz[1024];
                    break;
                case 1:
                    gout[0] += gx[512] * gy[576] * gz[0];
                    gout[1] += gx[448] * gy[64] * gz[576];
                    gout[2] += gx[960] * gy[128] * gz[0];
                    gout[3] += gx[384] * gy[640] * gz[64];
                    gout[4] += gx[384] * gy[0] * gz[704];
                    gout[5] += gx[832] * gy[256] * gz[0];
                    gout[6] += gx[256] * gy[768] * gz[64];
                    gout[7] += gx[192] * gy[320] * gz[576];
                    gout[8] += gx[768] * gy[192] * gz[128];
                    gout[9] += gx[320] * gy[576] * gz[192];
                    gout[10] += gx[256] * gy[64] * gz[768];
                    gout[11] += gx[768] * gy[128] * gz[192];
                    gout[12] += gx[192] * gy[640] * gz[256];
                    gout[13] += gx[192] * gy[0] * gz[896];
                    gout[14] += gx[640] * gy[448] * gz[0];
                    gout[15] += gx[64] * gy[960] * gz[64];
                    gout[16] += gx[0] * gy[512] * gz[576];
                    gout[17] += gx[576] * gy[384] * gz[128];
                    gout[18] += gx[128] * gy[768] * gz[192];
                    gout[19] += gx[64] * gy[256] * gz[768];
                    gout[20] += gx[576] * gy[320] * gz[192];
                    gout[21] += gx[0] * gy[832] * gz[256];
                    gout[22] += gx[0] * gy[192] * gz[896];
                    gout[23] += gx[640] * gy[64] * gz[384];
                    gout[24] += gx[64] * gy[576] * gz[448];
                    gout[25] += gx[0] * gy[128] * gz[960];
                    gout[26] += gx[576] * gy[0] * gz[512];
                    break;
                case 2:
                    gout[0] += gx[512] * gy[0] * gz[576];
                    gout[1] += gx[1024] * gy[0] * gz[64];
                    gout[2] += gx[384] * gy[704] * gz[0];
                    gout[3] += gx[384] * gy[64] * gz[640];
                    gout[4] += gx[896] * gy[192] * gz[0];
                    gout[5] += gx[256] * gy[832] * gz[0];
                    gout[6] += gx[256] * gy[192] * gz[640];
                    gout[7] += gx[768] * gy[256] * gz[64];
                    gout[8] += gx[192] * gy[768] * gz[128];
                    gout[9] += gx[320] * gy[0] * gz[768];
                    gout[10] += gx[832] * gy[0] * gz[256];
                    gout[11] += gx[192] * gy[704] * gz[192];
                    gout[12] += gx[192] * gy[64] * gz[832];
                    gout[13] += gx[704] * gy[384] * gz[0];
                    gout[14] += gx[64] * gy[1024] * gz[0];
                    gout[15] += gx[64] * gy[384] * gz[640];
                    gout[16] += gx[576] * gy[448] * gz[64];
                    gout[17] += gx[0] * gy[960] * gz[128];
                    gout[18] += gx[128] * gy[192] * gz[768];
                    gout[19] += gx[640] * gy[192] * gz[256];
                    gout[20] += gx[0] * gy[896] * gz[192];
                    gout[21] += gx[0] * gy[256] * gz[832];
                    gout[22] += gx[704] * gy[0] * gz[384];
                    gout[23] += gx[64] * gy[640] * gz[384];
                    gout[24] += gx[64] * gy[0] * gz[1024];
                    gout[25] += gx[576] * gy[64] * gz[448];
                    gout[26] += gx[0] * gy[576] * gz[512];
                    break;
                case 3:
                    gout[0] += gx[1024] * gy[64] * gz[0];
                    gout[1] += gx[448] * gy[576] * gz[64];
                    gout[2] += gx[384] * gy[128] * gz[576];
                    gout[3] += gx[960] * gy[0] * gz[128];
                    gout[4] += gx[320] * gy[768] * gz[0];
                    gout[5] += gx[256] * gy[256] * gz[576];
                    gout[6] += gx[768] * gy[320] * gz[0];
                    gout[7] += gx[192] * gy[832] * gz[64];
                    gout[8] += gx[192] * gy[192] * gz[704];
                    gout[9] += gx[832] * gy[64] * gz[192];
                    gout[10] += gx[256] * gy[576] * gz[256];
                    gout[11] += gx[192] * gy[128] * gz[768];
                    gout[12] += gx[768] * gy[0] * gz[320];
                    gout[13] += gx[128] * gy[960] * gz[0];
                    gout[14] += gx[64] * gy[448] * gz[576];
                    gout[15] += gx[576] * gy[512] * gz[0];
                    gout[16] += gx[0] * gy[1024] * gz[64];
                    gout[17] += gx[0] * gy[384] * gz[704];
                    gout[18] += gx[640] * gy[256] * gz[192];
                    gout[19] += gx[64] * gy[768] * gz[256];
                    gout[20] += gx[0] * gy[320] * gz[768];
                    gout[21] += gx[576] * gy[192] * gz[320];
                    gout[22] += gx[128] * gy[576] * gz[384];
                    gout[23] += gx[64] * gy[64] * gz[960];
                    gout[24] += gx[576] * gy[128] * gz[384];
                    gout[25] += gx[0] * gy[640] * gz[448];
                    gout[26] += gx[0] * gy[0] * gz[1088];
                    break;
                }
            }
        }
        if (ijk_idx < nst) {
            int *ao_loc = envs.ao_loc;
            int k0 = ao_loc[ksh0] - ao_loc[nbas];
            double *eri_tensor = out_local + shl_pair_in_block * 36 * naux + k0 + ksh_in_block * 3;
            // The 108 outputs, numbered flat = row*3 + col over the 36 x 3
            // tile, are dealt round-robin to the four gout groups:
            // accumulator n of group gout_id holds element 4*n + gout_id.
#pragma unroll
            for (int n = 0; n < 27; ++n) {
                int flat = 4*n + gout_id;
                eri_tensor[(flat / 3)*naux + flat % 3] = gout[n];
            }
        }
    }
}
__device__
void int3c2e_bdiv_002(double *out, Int3c2eEnvVars envs, BDiv3c2eBounds bounds)
{
    // Unrolled 3-center integral kernel. Each thread processes one
    // (shell-pair, k-shell) task at a time and writes 6 consecutive values
    // of a single output row.
    // NOTE(review): the "002" suffix presumably encodes the angular momenta
    // (li, lj, lk) = (0, 0, 2) -- confirm against the dispatcher.

    // For better load balance, consume blocks in the reversed order
    const int pair_blk = gridDim.x - blockIdx.x - 1;
    const int aux_blk = gridDim.y - blockIdx.y - 1;
    const int pair_begin = bounds.shl_pair_offsets[pair_blk];
    const int pair_end = bounds.shl_pair_offsets[pair_blk+1];
    const int ksh0 = bounds.ksh_offsets[aux_blk];
    const int ksh_end = bounds.ksh_offsets[aux_blk+1];
    const int nksh = ksh_end - ksh0;
    const int npairs = pair_end - pair_begin;
    const int nbas = envs.nbas;
    int *bas = envs.bas;
    double *env = envs.env;

    // Contraction lengths are read once, from the first shell pair and the
    // first k-shell of the block, and reused for every task in the block.
    const int first_ij = bounds.bas_ij_idx[pair_begin];
    const int iprim = bas[(first_ij / nbas)*BAS_SLOTS+NPRIM_OF];
    const int jprim = bas[(first_ij % nbas)*BAS_SLOTS+NPRIM_OF];
    const int kprim = bas[ksh0*BAS_SLOTS+NPRIM_OF];
    const int nprim_ijk = iprim * jprim * kprim;

    const int nthreads = blockDim.x;
    const int tid = threadIdx.x;
    const double omega = env[PTR_RANGE_OMEGA];
    // A negative omega needs two sets of 2 roots (see the branch below).
    const int nroots = (omega < 0) ? 4 : 2;

    // Shared-memory layout: per-thread column `tid`, element stride
    // `nthreads`. First nroots*2 rows hold (root, weight) pairs, followed by
    // 4 rows for rj - ri and |rj - ri|^2.
    extern __shared__ double rw_buffer[];
    double *rw = rw_buffer + tid;
    double *rjri = rw + nthreads * nroots*2;
    const int naux = bounds.naux;
    double *out_local = out + bounds.ao_pair_loc[pair_blk] * naux;

    const int ntasks = npairs * nksh;
    for (int task = tid; task < ntasks; task += nthreads) {
        const int pair_in_blk = task / nksh;
        const int ksh_in_blk = task % nksh;
        const int ksh = ksh0 + ksh_in_blk;
        const int bas_ij = bounds.bas_ij_idx[pair_begin + pair_in_blk];
        const int ish = bas_ij / nbas;
        const int jsh = bas_ij % nbas;
        double *expi = env + bas[ish*BAS_SLOTS+PTR_EXP];
        double *expj = env + bas[jsh*BAS_SLOTS+PTR_EXP];
        double *expk = env + bas[ksh*BAS_SLOTS+PTR_EXP];
        double *ci = env + bas[ish*BAS_SLOTS+PTR_COEFF];
        double *cj = env + bas[jsh*BAS_SLOTS+PTR_COEFF];
        double *ck = env + bas[ksh*BAS_SLOTS+PTR_COEFF];
        double *ri = env + bas[ish*BAS_SLOTS+PTR_BAS_COORD];
        double *rj = env + bas[jsh*BAS_SLOTS+PTR_BAS_COORD];
        double *rk = env + bas[ksh*BAS_SLOTS+PTR_BAS_COORD];

        const double xjxi = rj[0] - ri[0];
        const double yjyi = rj[1] - ri[1];
        const double zjzi = rj[2] - ri[2];
        rjri[0*nthreads] = xjxi;
        rjri[1*nthreads] = yjyi;
        rjri[2*nthreads] = zjzi;
        rjri[3*nthreads] = xjxi*xjxi + yjyi*yjyi + zjzi*zjzi;

        // Six accumulators, stored contiguously at the end of the task.
        double gout[6] = {0};

        for (int prim = 0; prim < nprim_ijk; ++prim) {
            // Decode the flattened primitive index, kp running fastest.
            const int ij_prim = prim / kprim;
            const int kp = prim % kprim;
            const int ip = ij_prim / jprim;
            const int jp = ij_prim % jprim;
            const double ai = expi[ip];
            const double aj = expj[jp];
            const double ak = expk[kp];
            const double aij = ai + aj;
            const double cijk = ci[ip] * cj[jp] * ck[kp];
            const double fac = PI_FAC * cijk / (aij*ak*sqrt(aij+ak));
            const double aj_aij = aj / aij;
            const double theta_ij = ai * aj_aij;
            const double Kab = theta_ij * rjri[3*nthreads];
            const double fac1 = fac * exp(-Kab);
            const double xij = rjri[0*nthreads] * aj_aij + ri[0];
            const double yij = rjri[1*nthreads] * aj_aij + ri[1];
            const double zij = rjri[2*nthreads] * aj_aij + ri[2];
            const double xpq = xij - rk[0];
            const double ypq = yij - rk[1];
            const double zpq = zij - rk[2];
            const double rr = xpq * xpq + ypq * ypq + zpq * zpq;
            const double theta = aij * ak / (aij + ak);
            const double theta_rr = theta * rr;

            if (omega == 0) {
                rys_roots(2, theta_rr, rw, nthreads, 0, 1);
            } else {
                const bool positive_omega = omega > 0;
                const double omega2 = omega * omega;
                const double theta_fac = omega2 / (omega2 + theta);
                if (!positive_omega) {
                    // Unscaled roots go to the second half of rw
                    // (rows 4..7); they are consumed as roots 2..3 below.
                    rys_roots(2, theta_rr, rw + 4*nthreads, nthreads, 0, 1);
                }
                rys_roots(2, theta_fac*theta_rr, rw, nthreads, 0, 1);
                // Attenuated roots are rescaled; their weights are negated
                // in the omega < 0 case.
                const double wt_scale = positive_omega ? sqrt(theta_fac)
                                                       : -sqrt(theta_fac);
                for (int irys = 0; irys < 2; ++irys) {
                    rw[ irys*2   *nthreads] *= theta_fac;
                    rw[(irys*2+1)*nthreads] *= wt_scale;
                }
            }

            for (int irys = 0; irys < nroots; ++irys) {
                const double wt = rw[(2*irys+1)*nthreads];
                const double rt = rw[ 2*irys   *nthreads];
                const double rt_aa = rt / (aij + ak);
                const double rt_ak = rt_aa * aij;
                const double b01 = .5/ak * (1 - rt_ak);

                // x carries no prefactor, y carries fac1, z carries wt.
                const double cpx = xpq*rt_ak;
                const double trr_01x = cpx;
                const double trr_02x = cpx * trr_01x + b01;
                const double cpy = ypq*rt_ak;
                const double trr_01y = cpy * fac1;
                const double trr_02y = cpy * trr_01y + b01 * fac1;
                const double cpz = zpq*rt_ak;
                const double trr_01z = cpz * wt;
                const double trr_02z = cpz * trr_01z + b01 * wt;

                gout[0] += trr_02x * fac1 * wt;
                gout[1] += trr_01x * trr_01y * wt;
                gout[2] += trr_01x * fac1 * trr_01z;
                gout[3] += trr_02y * wt;
                gout[4] += trr_01y * trr_01z;
                gout[5] += fac1 * trr_02z;
            }
        }

        // One output row per shell pair; column offset is relative to
        // ao_loc[nbas].
        int *ao_loc = envs.ao_loc;
        const int k0 = ao_loc[ksh0] - ao_loc[nbas];
        double *tile = out_local + k0 + pair_in_blk * 1 * naux + ksh_in_blk * 6;
#pragma unroll
        for (int k = 0; k < 6; ++k) {
            tile[0*naux + k] = gout[k];
        }
    }
}
bas[ish*BAS_SLOTS+PTR_BAS_COORD]; + double *rj = env + bas[jsh*BAS_SLOTS+PTR_BAS_COORD]; + double *rk = env + bas[ksh*BAS_SLOTS+PTR_BAS_COORD]; + double xjxi = rj[0] - ri[0]; + double yjyi = rj[1] - ri[1]; + double zjzi = rj[2] - ri[2]; + double rr_ij = xjxi*xjxi + yjyi*yjyi + zjzi*zjzi; + rjri[0*nst_per_block] = rj[0] - ri[0]; + rjri[1*nst_per_block] = rj[1] - ri[1]; + rjri[2*nst_per_block] = rj[2] - ri[2]; + rjri[3*nst_per_block] = rr_ij; + double gout0 = 0; + double gout1 = 0; + double gout2 = 0; + double gout3 = 0; + double gout4 = 0; + double gout5 = 0; + double gout6 = 0; + double gout7 = 0; + double gout8 = 0; + double gout9 = 0; + double gout10 = 0; + double gout11 = 0; + double gout12 = 0; + double gout13 = 0; + double gout14 = 0; + double gout15 = 0; + double gout16 = 0; + double gout17 = 0; + for (int ijkp = 0; ijkp < ijkprim; ++ijkp) { + int ijp = ijkp / kprim; + int kp = ijkp % kprim; + int ip = ijp / jprim; + int jp = ijp % jprim; + double ai = expi[ip]; + double aj = expj[jp]; + double ak = expk[kp]; + double aij = ai + aj; + double cijk = ci[ip] * cj[jp] * ck[kp]; + double fac = PI_FAC * cijk / (aij*ak*sqrt(aij+ak)); + double aj_aij = aj / aij; + double theta_ij = ai * aj_aij; + double Kab = theta_ij * rjri[3*nst_per_block]; + double fac1 = fac * exp(-Kab); + double xij = rjri[0*nst_per_block] * aj_aij + ri[0]; + double yij = rjri[1*nst_per_block] * aj_aij + ri[1]; + double zij = rjri[2*nst_per_block] * aj_aij + ri[2]; + double xpq = xij - rk[0]; + double ypq = yij - rk[1]; + double zpq = zij - rk[2]; + double rr = xpq * xpq + ypq * ypq + zpq * zpq; + double theta = aij * ak / (aij + ak); + double theta_rr = theta * rr; + if (omega == 0) { + rys_roots(2, theta_rr, rw, nst_per_block, 0, 1); + } else if (omega > 0) { + double omega2 = omega * omega; + double theta_fac = omega2 / (omega2 + theta); + rys_roots(2, theta_fac*theta_rr, rw, nst_per_block, 0, 1); + double sqrt_theta_fac = sqrt(theta_fac); + for (int irys = 0; irys < 2; ++irys) { + rw[ irys*2 
*nst_per_block] *= theta_fac; + rw[(irys*2+1)*nst_per_block] *= sqrt_theta_fac; + } + } else { + double omega2 = omega * omega; + double theta_fac = omega2 / (omega2 + theta); + double *rw1 = rw + 4*nst_per_block; + rys_roots(2, theta_rr, rw1, nst_per_block, 0, 1); + rys_roots(2, theta_fac*theta_rr, rw, nst_per_block, 0, 1); + double sqrt_theta_fac = -sqrt(theta_fac); + for (int irys = 0; irys < 2; ++irys) { + rw[ irys*2 *nst_per_block] *= theta_fac; + rw[(irys*2+1)*nst_per_block] *= sqrt_theta_fac; + } + } + for (int irys = 0; irys < nroots; ++irys) { + double wt = rw[(2*irys+1)*nst_per_block]; + double rt = rw[ 2*irys *nst_per_block]; + double rt_aa = rt / (aij + ak); + double b00 = .5 * rt_aa; + double rt_ak = rt_aa * aij; + double b01 = .5/ak * (1 - rt_ak); + double cpx = xpq*rt_ak; + double rt_aij = rt_aa * ak; + double c0x = rjri[0*nst_per_block] * aj_aij - xpq*rt_aij; + double trr_10x = c0x * 1; + double trr_11x = cpx * trr_10x + 1*b00 * 1; + double trr_01x = cpx * 1; + double trr_12x = cpx * trr_11x + 1*b01 * trr_10x + 1*b00 * trr_01x; + gout0 += trr_12x * fac1 * wt; + double trr_02x = cpx * trr_01x + 1*b01 * 1; + double c0y = rjri[1*nst_per_block] * aj_aij - ypq*rt_aij; + double trr_10y = c0y * fac1; + gout1 += trr_02x * trr_10y * wt; + double c0z = rjri[2*nst_per_block] * aj_aij - zpq*rt_aij; + double trr_10z = c0z * wt; + gout2 += trr_02x * fac1 * trr_10z; + double cpy = ypq*rt_ak; + double trr_01y = cpy * fac1; + gout3 += trr_11x * trr_01y * wt; + double trr_11y = cpy * trr_10y + 1*b00 * fac1; + gout4 += trr_01x * trr_11y * wt; + gout5 += trr_01x * trr_01y * trr_10z; + double cpz = zpq*rt_ak; + double trr_01z = cpz * wt; + gout6 += trr_11x * fac1 * trr_01z; + gout7 += trr_01x * trr_10y * trr_01z; + double trr_11z = cpz * trr_10z + 1*b00 * wt; + gout8 += trr_01x * fac1 * trr_11z; + double trr_02y = cpy * trr_01y + 1*b01 * fac1; + gout9 += trr_10x * trr_02y * wt; + double trr_12y = cpy * trr_11y + 1*b01 * trr_10y + 1*b00 * trr_01y; + gout10 += 1 * trr_12y 
* wt; + gout11 += 1 * trr_02y * trr_10z; + gout12 += trr_10x * trr_01y * trr_01z; + gout13 += 1 * trr_11y * trr_01z; + gout14 += 1 * trr_01y * trr_11z; + double trr_02z = cpz * trr_01z + 1*b01 * wt; + gout15 += trr_10x * fac1 * trr_02z; + gout16 += 1 * trr_10y * trr_02z; + double trr_12z = cpz * trr_11z + 1*b01 * trr_10z + 1*b00 * trr_01z; + gout17 += 1 * fac1 * trr_12z; + } + } + int *ao_loc = envs.ao_loc; + int k0 = ao_loc[ksh0] - ao_loc[nbas]; + double *eri_tensor = out_local + k0 + shl_pair_in_block * 3 * naux + ksh_in_block * 6; + eri_tensor[0*naux + 0] = gout0; + eri_tensor[0*naux + 1] = gout3; + eri_tensor[0*naux + 2] = gout6; + eri_tensor[0*naux + 3] = gout9; + eri_tensor[0*naux + 4] = gout12; + eri_tensor[0*naux + 5] = gout15; + eri_tensor[1*naux + 0] = gout1; + eri_tensor[1*naux + 1] = gout4; + eri_tensor[1*naux + 2] = gout7; + eri_tensor[1*naux + 3] = gout10; + eri_tensor[1*naux + 4] = gout13; + eri_tensor[1*naux + 5] = gout16; + eri_tensor[2*naux + 0] = gout2; + eri_tensor[2*naux + 1] = gout5; + eri_tensor[2*naux + 2] = gout8; + eri_tensor[2*naux + 3] = gout11; + eri_tensor[2*naux + 4] = gout14; + eri_tensor[2*naux + 5] = gout17; + } +} + +__device__ +void int3c2e_bdiv_112(double *out, Int3c2eEnvVars envs, BDiv3c2eBounds bounds) +{ + // For better load balance, consume blocks in the reversed order + int sp_block_id = gridDim.x - blockIdx.x - 1; + int ksh_block_id = gridDim.y - blockIdx.y - 1; + int shl_pair0 = bounds.shl_pair_offsets[sp_block_id]; + int shl_pair1 = bounds.shl_pair_offsets[sp_block_id+1]; + int ksh0 = bounds.ksh_offsets[ksh_block_id]; + int ksh1 = bounds.ksh_offsets[ksh_block_id+1]; + int nksh = ksh1 - ksh0; + int nshl_pair = shl_pair1 - shl_pair0; + int bas_ij0 = bounds.bas_ij_idx[shl_pair0]; + int nbas = envs.nbas; + int ish0 = bas_ij0 / nbas; + int jsh0 = bas_ij0 % nbas; + + int nst_per_block = blockDim.x; + int st_id = threadIdx.x; + int *bas = envs.bas; + int iprim = bas[ish0*BAS_SLOTS+NPRIM_OF]; + int jprim = 
bas[jsh0*BAS_SLOTS+NPRIM_OF]; + int kprim = bas[ksh0*BAS_SLOTS+NPRIM_OF]; + int ijprim = iprim * jprim; + int ijkprim = ijprim * kprim; + int nroots = 3; + double *env = envs.env; + double omega = env[PTR_RANGE_OMEGA]; + if (omega < 0) { + nroots *= 2; + } + extern __shared__ double rw_buffer[]; + double *rw = rw_buffer + st_id; + double *rjri = rw + nst_per_block * nroots*2; + int naux = bounds.naux; + double *out_local = out + bounds.ao_pair_loc[sp_block_id] * naux; + + int nst = nshl_pair * nksh; + for (int ijk_idx = st_id; ijk_idx < nst; ijk_idx += nst_per_block) { + int shl_pair_in_block = ijk_idx / nksh; + int ksh_in_block = ijk_idx % nksh; + int ksh = ksh_in_block + ksh0; + int bas_ij = bounds.bas_ij_idx[shl_pair_in_block + shl_pair0]; + int ish = bas_ij / nbas; + int jsh = bas_ij % nbas; + double *expi = env + bas[ish*BAS_SLOTS+PTR_EXP]; + double *expj = env + bas[jsh*BAS_SLOTS+PTR_EXP]; + double *expk = env + bas[ksh*BAS_SLOTS+PTR_EXP]; + double *ci = env + bas[ish*BAS_SLOTS+PTR_COEFF]; + double *cj = env + bas[jsh*BAS_SLOTS+PTR_COEFF]; + double *ck = env + bas[ksh*BAS_SLOTS+PTR_COEFF]; + double *ri = env + bas[ish*BAS_SLOTS+PTR_BAS_COORD]; + double *rj = env + bas[jsh*BAS_SLOTS+PTR_BAS_COORD]; + double *rk = env + bas[ksh*BAS_SLOTS+PTR_BAS_COORD]; + double xjxi = rj[0] - ri[0]; + double yjyi = rj[1] - ri[1]; + double zjzi = rj[2] - ri[2]; + double rr_ij = xjxi*xjxi + yjyi*yjyi + zjzi*zjzi; + rjri[0*nst_per_block] = rj[0] - ri[0]; + rjri[1*nst_per_block] = rj[1] - ri[1]; + rjri[2*nst_per_block] = rj[2] - ri[2]; + rjri[3*nst_per_block] = rr_ij; + double gout0 = 0; + double gout1 = 0; + double gout2 = 0; + double gout3 = 0; + double gout4 = 0; + double gout5 = 0; + double gout6 = 0; + double gout7 = 0; + double gout8 = 0; + double gout9 = 0; + double gout10 = 0; + double gout11 = 0; + double gout12 = 0; + double gout13 = 0; + double gout14 = 0; + double gout15 = 0; + double gout16 = 0; + double gout17 = 0; + double gout18 = 0; + double gout19 = 0; + double 
gout20 = 0; + double gout21 = 0; + double gout22 = 0; + double gout23 = 0; + double gout24 = 0; + double gout25 = 0; + double gout26 = 0; + double gout27 = 0; + double gout28 = 0; + double gout29 = 0; + double gout30 = 0; + double gout31 = 0; + double gout32 = 0; + double gout33 = 0; + double gout34 = 0; + double gout35 = 0; + double gout36 = 0; + double gout37 = 0; + double gout38 = 0; + double gout39 = 0; + double gout40 = 0; + double gout41 = 0; + double gout42 = 0; + double gout43 = 0; + double gout44 = 0; + double gout45 = 0; + double gout46 = 0; + double gout47 = 0; + double gout48 = 0; + double gout49 = 0; + double gout50 = 0; + double gout51 = 0; + double gout52 = 0; + double gout53 = 0; + for (int ijkp = 0; ijkp < ijkprim; ++ijkp) { + int ijp = ijkp / kprim; + int kp = ijkp % kprim; + int ip = ijp / jprim; + int jp = ijp % jprim; + double ai = expi[ip]; + double aj = expj[jp]; + double ak = expk[kp]; + double aij = ai + aj; + double cijk = ci[ip] * cj[jp] * ck[kp]; + double fac = PI_FAC * cijk / (aij*ak*sqrt(aij+ak)); + double aj_aij = aj / aij; + double theta_ij = ai * aj_aij; + double Kab = theta_ij * rjri[3*nst_per_block]; + double fac1 = fac * exp(-Kab); + double xij = rjri[0*nst_per_block] * aj_aij + ri[0]; + double yij = rjri[1*nst_per_block] * aj_aij + ri[1]; + double zij = rjri[2*nst_per_block] * aj_aij + ri[2]; + double xpq = xij - rk[0]; + double ypq = yij - rk[1]; + double zpq = zij - rk[2]; + double rr = xpq * xpq + ypq * ypq + zpq * zpq; + double theta = aij * ak / (aij + ak); + double theta_rr = theta * rr; + if (omega == 0) { + rys_roots(3, theta_rr, rw, nst_per_block, 0, 1); + } else if (omega > 0) { + double omega2 = omega * omega; + double theta_fac = omega2 / (omega2 + theta); + rys_roots(3, theta_fac*theta_rr, rw, nst_per_block, 0, 1); + double sqrt_theta_fac = sqrt(theta_fac); + for (int irys = 0; irys < 3; ++irys) { + rw[ irys*2 *nst_per_block] *= theta_fac; + rw[(irys*2+1)*nst_per_block] *= sqrt_theta_fac; + } + } else { + double 
omega2 = omega * omega; + double theta_fac = omega2 / (omega2 + theta); + double *rw1 = rw + 6*nst_per_block; + rys_roots(3, theta_rr, rw1, nst_per_block, 0, 1); + rys_roots(3, theta_fac*theta_rr, rw, nst_per_block, 0, 1); + double sqrt_theta_fac = -sqrt(theta_fac); + for (int irys = 0; irys < 3; ++irys) { + rw[ irys*2 *nst_per_block] *= theta_fac; + rw[(irys*2+1)*nst_per_block] *= sqrt_theta_fac; + } + } + for (int irys = 0; irys < nroots; ++irys) { + double wt = rw[(2*irys+1)*nst_per_block]; + double rt = rw[ 2*irys *nst_per_block]; + double rt_aa = rt / (aij + ak); + double b00 = .5 * rt_aa; + double rt_ak = rt_aa * aij; + double b01 = .5/ak * (1 - rt_ak); + double cpx = xpq*rt_ak; + double rt_aij = rt_aa * ak; + double b10 = .5/aij * (1 - rt_aij); + double c0x = rjri[0*nst_per_block] * aj_aij - xpq*rt_aij; + double trr_10x = c0x * 1; + double trr_20x = c0x * trr_10x + 1*b10 * 1; + double trr_21x = cpx * trr_20x + 2*b00 * trr_10x; + double trr_11x = cpx * trr_10x + 1*b00 * 1; + double trr_22x = cpx * trr_21x + 1*b01 * trr_20x + 2*b00 * trr_11x; + double trr_01x = cpx * 1; + double trr_12x = cpx * trr_11x + 1*b01 * trr_10x + 1*b00 * trr_01x; + double hrr_112x = trr_22x - xjxi * trr_12x; + gout0 += hrr_112x * fac1 * wt; + double trr_02x = cpx * trr_01x + 1*b01 * 1; + double hrr_012x = trr_12x - xjxi * trr_02x; + double c0y = rjri[1*nst_per_block] * aj_aij - ypq*rt_aij; + double trr_10y = c0y * fac1; + gout1 += hrr_012x * trr_10y * wt; + double c0z = rjri[2*nst_per_block] * aj_aij - zpq*rt_aij; + double trr_10z = c0z * wt; + gout2 += hrr_012x * fac1 * trr_10z; + double hrr_010y = trr_10y - yjyi * fac1; + gout3 += trr_12x * hrr_010y * wt; + double trr_20y = c0y * trr_10y + 1*b10 * fac1; + double hrr_110y = trr_20y - yjyi * trr_10y; + gout4 += trr_02x * hrr_110y * wt; + gout5 += trr_02x * hrr_010y * trr_10z; + double hrr_010z = trr_10z - zjzi * wt; + gout6 += trr_12x * fac1 * hrr_010z; + gout7 += trr_02x * trr_10y * hrr_010z; + double trr_20z = c0z * trr_10z + 1*b10 
* wt; + double hrr_110z = trr_20z - zjzi * trr_10z; + gout8 += trr_02x * fac1 * hrr_110z; + double hrr_111x = trr_21x - xjxi * trr_11x; + double cpy = ypq*rt_ak; + double trr_01y = cpy * fac1; + gout9 += hrr_111x * trr_01y * wt; + double hrr_011x = trr_11x - xjxi * trr_01x; + double trr_11y = cpy * trr_10y + 1*b00 * fac1; + gout10 += hrr_011x * trr_11y * wt; + gout11 += hrr_011x * trr_01y * trr_10z; + double hrr_011y = trr_11y - yjyi * trr_01y; + gout12 += trr_11x * hrr_011y * wt; + double trr_21y = cpy * trr_20y + 2*b00 * trr_10y; + double hrr_111y = trr_21y - yjyi * trr_11y; + gout13 += trr_01x * hrr_111y * wt; + gout14 += trr_01x * hrr_011y * trr_10z; + gout15 += trr_11x * trr_01y * hrr_010z; + gout16 += trr_01x * trr_11y * hrr_010z; + gout17 += trr_01x * trr_01y * hrr_110z; + double cpz = zpq*rt_ak; + double trr_01z = cpz * wt; + gout18 += hrr_111x * fac1 * trr_01z; + gout19 += hrr_011x * trr_10y * trr_01z; + double trr_11z = cpz * trr_10z + 1*b00 * wt; + gout20 += hrr_011x * fac1 * trr_11z; + gout21 += trr_11x * hrr_010y * trr_01z; + gout22 += trr_01x * hrr_110y * trr_01z; + gout23 += trr_01x * hrr_010y * trr_11z; + double hrr_011z = trr_11z - zjzi * trr_01z; + gout24 += trr_11x * fac1 * hrr_011z; + gout25 += trr_01x * trr_10y * hrr_011z; + double trr_21z = cpz * trr_20z + 2*b00 * trr_10z; + double hrr_111z = trr_21z - zjzi * trr_11z; + gout26 += trr_01x * fac1 * hrr_111z; + double hrr_110x = trr_20x - xjxi * trr_10x; + double trr_02y = cpy * trr_01y + 1*b01 * fac1; + gout27 += hrr_110x * trr_02y * wt; + double hrr_010x = trr_10x - xjxi * 1; + double trr_12y = cpy * trr_11y + 1*b01 * trr_10y + 1*b00 * trr_01y; + gout28 += hrr_010x * trr_12y * wt; + gout29 += hrr_010x * trr_02y * trr_10z; + double hrr_012y = trr_12y - yjyi * trr_02y; + gout30 += trr_10x * hrr_012y * wt; + double trr_22y = cpy * trr_21y + 1*b01 * trr_20y + 2*b00 * trr_11y; + double hrr_112y = trr_22y - yjyi * trr_12y; + gout31 += 1 * hrr_112y * wt; + gout32 += 1 * hrr_012y * trr_10z; + gout33 += 
trr_10x * trr_02y * hrr_010z; + gout34 += 1 * trr_12y * hrr_010z; + gout35 += 1 * trr_02y * hrr_110z; + gout36 += hrr_110x * trr_01y * trr_01z; + gout37 += hrr_010x * trr_11y * trr_01z; + gout38 += hrr_010x * trr_01y * trr_11z; + gout39 += trr_10x * hrr_011y * trr_01z; + gout40 += 1 * hrr_111y * trr_01z; + gout41 += 1 * hrr_011y * trr_11z; + gout42 += trr_10x * trr_01y * hrr_011z; + gout43 += 1 * trr_11y * hrr_011z; + gout44 += 1 * trr_01y * hrr_111z; + double trr_02z = cpz * trr_01z + 1*b01 * wt; + gout45 += hrr_110x * fac1 * trr_02z; + gout46 += hrr_010x * trr_10y * trr_02z; + double trr_12z = cpz * trr_11z + 1*b01 * trr_10z + 1*b00 * trr_01z; + gout47 += hrr_010x * fac1 * trr_12z; + gout48 += trr_10x * hrr_010y * trr_02z; + gout49 += 1 * hrr_110y * trr_02z; + gout50 += 1 * hrr_010y * trr_12z; + double hrr_012z = trr_12z - zjzi * trr_02z; + gout51 += trr_10x * fac1 * hrr_012z; + gout52 += 1 * trr_10y * hrr_012z; + double trr_22z = cpz * trr_21z + 1*b01 * trr_20z + 2*b00 * trr_11z; + double hrr_112z = trr_22z - zjzi * trr_12z; + gout53 += 1 * fac1 * hrr_112z; + } + } + int *ao_loc = envs.ao_loc; + int k0 = ao_loc[ksh0] - ao_loc[nbas]; + double *eri_tensor = out_local + k0 + shl_pair_in_block * 9 * naux + ksh_in_block * 6; + eri_tensor[0*naux + 0] = gout0; + eri_tensor[0*naux + 1] = gout9; + eri_tensor[0*naux + 2] = gout18; + eri_tensor[0*naux + 3] = gout27; + eri_tensor[0*naux + 4] = gout36; + eri_tensor[0*naux + 5] = gout45; + eri_tensor[1*naux + 0] = gout1; + eri_tensor[1*naux + 1] = gout10; + eri_tensor[1*naux + 2] = gout19; + eri_tensor[1*naux + 3] = gout28; + eri_tensor[1*naux + 4] = gout37; + eri_tensor[1*naux + 5] = gout46; + eri_tensor[2*naux + 0] = gout2; + eri_tensor[2*naux + 1] = gout11; + eri_tensor[2*naux + 2] = gout20; + eri_tensor[2*naux + 3] = gout29; + eri_tensor[2*naux + 4] = gout38; + eri_tensor[2*naux + 5] = gout47; + eri_tensor[3*naux + 0] = gout3; + eri_tensor[3*naux + 1] = gout12; + eri_tensor[3*naux + 2] = gout21; + eri_tensor[3*naux + 3] = 
gout30; + eri_tensor[3*naux + 4] = gout39; + eri_tensor[3*naux + 5] = gout48; + eri_tensor[4*naux + 0] = gout4; + eri_tensor[4*naux + 1] = gout13; + eri_tensor[4*naux + 2] = gout22; + eri_tensor[4*naux + 3] = gout31; + eri_tensor[4*naux + 4] = gout40; + eri_tensor[4*naux + 5] = gout49; + eri_tensor[5*naux + 0] = gout5; + eri_tensor[5*naux + 1] = gout14; + eri_tensor[5*naux + 2] = gout23; + eri_tensor[5*naux + 3] = gout32; + eri_tensor[5*naux + 4] = gout41; + eri_tensor[5*naux + 5] = gout50; + eri_tensor[6*naux + 0] = gout6; + eri_tensor[6*naux + 1] = gout15; + eri_tensor[6*naux + 2] = gout24; + eri_tensor[6*naux + 3] = gout33; + eri_tensor[6*naux + 4] = gout42; + eri_tensor[6*naux + 5] = gout51; + eri_tensor[7*naux + 0] = gout7; + eri_tensor[7*naux + 1] = gout16; + eri_tensor[7*naux + 2] = gout25; + eri_tensor[7*naux + 3] = gout34; + eri_tensor[7*naux + 4] = gout43; + eri_tensor[7*naux + 5] = gout52; + eri_tensor[8*naux + 0] = gout8; + eri_tensor[8*naux + 1] = gout17; + eri_tensor[8*naux + 2] = gout26; + eri_tensor[8*naux + 3] = gout35; + eri_tensor[8*naux + 4] = gout44; + eri_tensor[8*naux + 5] = gout53; + } +} + +__device__ +void int3c2e_bdiv_202(double *out, Int3c2eEnvVars envs, BDiv3c2eBounds bounds) +{ + // For better load balance, consume blocks in the reversed order + int sp_block_id = gridDim.x - blockIdx.x - 1; + int ksh_block_id = gridDim.y - blockIdx.y - 1; + int shl_pair0 = bounds.shl_pair_offsets[sp_block_id]; + int shl_pair1 = bounds.shl_pair_offsets[sp_block_id+1]; + int ksh0 = bounds.ksh_offsets[ksh_block_id]; + int ksh1 = bounds.ksh_offsets[ksh_block_id+1]; + int nksh = ksh1 - ksh0; + int nshl_pair = shl_pair1 - shl_pair0; + int bas_ij0 = bounds.bas_ij_idx[shl_pair0]; + int nbas = envs.nbas; + int ish0 = bas_ij0 / nbas; + int jsh0 = bas_ij0 % nbas; + + int nst_per_block = blockDim.x; + int st_id = threadIdx.x; + int *bas = envs.bas; + int iprim = bas[ish0*BAS_SLOTS+NPRIM_OF]; + int jprim = bas[jsh0*BAS_SLOTS+NPRIM_OF]; + int kprim = 
bas[ksh0*BAS_SLOTS+NPRIM_OF]; + int ijprim = iprim * jprim; + int ijkprim = ijprim * kprim; + int nroots = 3; + double *env = envs.env; + double omega = env[PTR_RANGE_OMEGA]; + if (omega < 0) { + nroots *= 2; + } + extern __shared__ double rw_buffer[]; + double *rw = rw_buffer + st_id; + double *rjri = rw + nst_per_block * nroots*2; + int naux = bounds.naux; + double *out_local = out + bounds.ao_pair_loc[sp_block_id] * naux; + + int nst = nshl_pair * nksh; + for (int ijk_idx = st_id; ijk_idx < nst; ijk_idx += nst_per_block) { + int shl_pair_in_block = ijk_idx / nksh; + int ksh_in_block = ijk_idx % nksh; + int ksh = ksh_in_block + ksh0; + int bas_ij = bounds.bas_ij_idx[shl_pair_in_block + shl_pair0]; + int ish = bas_ij / nbas; + int jsh = bas_ij % nbas; + double *expi = env + bas[ish*BAS_SLOTS+PTR_EXP]; + double *expj = env + bas[jsh*BAS_SLOTS+PTR_EXP]; + double *expk = env + bas[ksh*BAS_SLOTS+PTR_EXP]; + double *ci = env + bas[ish*BAS_SLOTS+PTR_COEFF]; + double *cj = env + bas[jsh*BAS_SLOTS+PTR_COEFF]; + double *ck = env + bas[ksh*BAS_SLOTS+PTR_COEFF]; + double *ri = env + bas[ish*BAS_SLOTS+PTR_BAS_COORD]; + double *rj = env + bas[jsh*BAS_SLOTS+PTR_BAS_COORD]; + double *rk = env + bas[ksh*BAS_SLOTS+PTR_BAS_COORD]; + double xjxi = rj[0] - ri[0]; + double yjyi = rj[1] - ri[1]; + double zjzi = rj[2] - ri[2]; + double rr_ij = xjxi*xjxi + yjyi*yjyi + zjzi*zjzi; + rjri[0*nst_per_block] = rj[0] - ri[0]; + rjri[1*nst_per_block] = rj[1] - ri[1]; + rjri[2*nst_per_block] = rj[2] - ri[2]; + rjri[3*nst_per_block] = rr_ij; + double gout0 = 0; + double gout1 = 0; + double gout2 = 0; + double gout3 = 0; + double gout4 = 0; + double gout5 = 0; + double gout6 = 0; + double gout7 = 0; + double gout8 = 0; + double gout9 = 0; + double gout10 = 0; + double gout11 = 0; + double gout12 = 0; + double gout13 = 0; + double gout14 = 0; + double gout15 = 0; + double gout16 = 0; + double gout17 = 0; + double gout18 = 0; + double gout19 = 0; + double gout20 = 0; + double gout21 = 0; + double 
gout22 = 0; + double gout23 = 0; + double gout24 = 0; + double gout25 = 0; + double gout26 = 0; + double gout27 = 0; + double gout28 = 0; + double gout29 = 0; + double gout30 = 0; + double gout31 = 0; + double gout32 = 0; + double gout33 = 0; + double gout34 = 0; + double gout35 = 0; + for (int ijkp = 0; ijkp < ijkprim; ++ijkp) { + int ijp = ijkp / kprim; + int kp = ijkp % kprim; + int ip = ijp / jprim; + int jp = ijp % jprim; + double ai = expi[ip]; + double aj = expj[jp]; + double ak = expk[kp]; + double aij = ai + aj; + double cijk = ci[ip] * cj[jp] * ck[kp]; + double fac = PI_FAC * cijk / (aij*ak*sqrt(aij+ak)); + double aj_aij = aj / aij; + double theta_ij = ai * aj_aij; + double Kab = theta_ij * rjri[3*nst_per_block]; + double fac1 = fac * exp(-Kab); + double xij = rjri[0*nst_per_block] * aj_aij + ri[0]; + double yij = rjri[1*nst_per_block] * aj_aij + ri[1]; + double zij = rjri[2*nst_per_block] * aj_aij + ri[2]; + double xpq = xij - rk[0]; + double ypq = yij - rk[1]; + double zpq = zij - rk[2]; + double rr = xpq * xpq + ypq * ypq + zpq * zpq; + double theta = aij * ak / (aij + ak); + double theta_rr = theta * rr; + if (omega == 0) { + rys_roots(3, theta_rr, rw, nst_per_block, 0, 1); + } else if (omega > 0) { + double omega2 = omega * omega; + double theta_fac = omega2 / (omega2 + theta); + rys_roots(3, theta_fac*theta_rr, rw, nst_per_block, 0, 1); + double sqrt_theta_fac = sqrt(theta_fac); + for (int irys = 0; irys < 3; ++irys) { + rw[ irys*2 *nst_per_block] *= theta_fac; + rw[(irys*2+1)*nst_per_block] *= sqrt_theta_fac; + } + } else { + double omega2 = omega * omega; + double theta_fac = omega2 / (omega2 + theta); + double *rw1 = rw + 6*nst_per_block; + rys_roots(3, theta_rr, rw1, nst_per_block, 0, 1); + rys_roots(3, theta_fac*theta_rr, rw, nst_per_block, 0, 1); + double sqrt_theta_fac = -sqrt(theta_fac); + for (int irys = 0; irys < 3; ++irys) { + rw[ irys*2 *nst_per_block] *= theta_fac; + rw[(irys*2+1)*nst_per_block] *= sqrt_theta_fac; + } + } + for (int 
irys = 0; irys < nroots; ++irys) { + double wt = rw[(2*irys+1)*nst_per_block]; + double rt = rw[ 2*irys *nst_per_block]; + double rt_aa = rt / (aij + ak); + double b00 = .5 * rt_aa; + double rt_ak = rt_aa * aij; + double b01 = .5/ak * (1 - rt_ak); + double cpx = xpq*rt_ak; + double rt_aij = rt_aa * ak; + double b10 = .5/aij * (1 - rt_aij); + double c0x = rjri[0*nst_per_block] * aj_aij - xpq*rt_aij; + double trr_10x = c0x * 1; + double trr_20x = c0x * trr_10x + 1*b10 * 1; + double trr_21x = cpx * trr_20x + 2*b00 * trr_10x; + double trr_11x = cpx * trr_10x + 1*b00 * 1; + double trr_22x = cpx * trr_21x + 1*b01 * trr_20x + 2*b00 * trr_11x; + gout0 += trr_22x * fac1 * wt; + double trr_01x = cpx * 1; + double trr_12x = cpx * trr_11x + 1*b01 * trr_10x + 1*b00 * trr_01x; + double c0y = rjri[1*nst_per_block] * aj_aij - ypq*rt_aij; + double trr_10y = c0y * fac1; + gout1 += trr_12x * trr_10y * wt; + double c0z = rjri[2*nst_per_block] * aj_aij - zpq*rt_aij; + double trr_10z = c0z * wt; + gout2 += trr_12x * fac1 * trr_10z; + double trr_02x = cpx * trr_01x + 1*b01 * 1; + double trr_20y = c0y * trr_10y + 1*b10 * fac1; + gout3 += trr_02x * trr_20y * wt; + gout4 += trr_02x * trr_10y * trr_10z; + double trr_20z = c0z * trr_10z + 1*b10 * wt; + gout5 += trr_02x * fac1 * trr_20z; + double cpy = ypq*rt_ak; + double trr_01y = cpy * fac1; + gout6 += trr_21x * trr_01y * wt; + double trr_11y = cpy * trr_10y + 1*b00 * fac1; + gout7 += trr_11x * trr_11y * wt; + gout8 += trr_11x * trr_01y * trr_10z; + double trr_21y = cpy * trr_20y + 2*b00 * trr_10y; + gout9 += trr_01x * trr_21y * wt; + gout10 += trr_01x * trr_11y * trr_10z; + gout11 += trr_01x * trr_01y * trr_20z; + double cpz = zpq*rt_ak; + double trr_01z = cpz * wt; + gout12 += trr_21x * fac1 * trr_01z; + gout13 += trr_11x * trr_10y * trr_01z; + double trr_11z = cpz * trr_10z + 1*b00 * wt; + gout14 += trr_11x * fac1 * trr_11z; + gout15 += trr_01x * trr_20y * trr_01z; + gout16 += trr_01x * trr_10y * trr_11z; + double trr_21z = cpz * trr_20z 
+ 2*b00 * trr_10z; + gout17 += trr_01x * fac1 * trr_21z; + double trr_02y = cpy * trr_01y + 1*b01 * fac1; + gout18 += trr_20x * trr_02y * wt; + double trr_12y = cpy * trr_11y + 1*b01 * trr_10y + 1*b00 * trr_01y; + gout19 += trr_10x * trr_12y * wt; + gout20 += trr_10x * trr_02y * trr_10z; + double trr_22y = cpy * trr_21y + 1*b01 * trr_20y + 2*b00 * trr_11y; + gout21 += 1 * trr_22y * wt; + gout22 += 1 * trr_12y * trr_10z; + gout23 += 1 * trr_02y * trr_20z; + gout24 += trr_20x * trr_01y * trr_01z; + gout25 += trr_10x * trr_11y * trr_01z; + gout26 += trr_10x * trr_01y * trr_11z; + gout27 += 1 * trr_21y * trr_01z; + gout28 += 1 * trr_11y * trr_11z; + gout29 += 1 * trr_01y * trr_21z; + double trr_02z = cpz * trr_01z + 1*b01 * wt; + gout30 += trr_20x * fac1 * trr_02z; + gout31 += trr_10x * trr_10y * trr_02z; + double trr_12z = cpz * trr_11z + 1*b01 * trr_10z + 1*b00 * trr_01z; + gout32 += trr_10x * fac1 * trr_12z; + gout33 += 1 * trr_20y * trr_02z; + gout34 += 1 * trr_10y * trr_12z; + double trr_22z = cpz * trr_21z + 1*b01 * trr_20z + 2*b00 * trr_11z; + gout35 += 1 * fac1 * trr_22z; + } + } + int *ao_loc = envs.ao_loc; + int k0 = ao_loc[ksh0] - ao_loc[nbas]; + double *eri_tensor = out_local + k0 + shl_pair_in_block * 6 * naux + ksh_in_block * 6; + eri_tensor[0*naux + 0] = gout0; + eri_tensor[0*naux + 1] = gout6; + eri_tensor[0*naux + 2] = gout12; + eri_tensor[0*naux + 3] = gout18; + eri_tensor[0*naux + 4] = gout24; + eri_tensor[0*naux + 5] = gout30; + eri_tensor[1*naux + 0] = gout1; + eri_tensor[1*naux + 1] = gout7; + eri_tensor[1*naux + 2] = gout13; + eri_tensor[1*naux + 3] = gout19; + eri_tensor[1*naux + 4] = gout25; + eri_tensor[1*naux + 5] = gout31; + eri_tensor[2*naux + 0] = gout2; + eri_tensor[2*naux + 1] = gout8; + eri_tensor[2*naux + 2] = gout14; + eri_tensor[2*naux + 3] = gout20; + eri_tensor[2*naux + 4] = gout26; + eri_tensor[2*naux + 5] = gout32; + eri_tensor[3*naux + 0] = gout3; + eri_tensor[3*naux + 1] = gout9; + eri_tensor[3*naux + 2] = gout15; + 
eri_tensor[3*naux + 3] = gout21; + eri_tensor[3*naux + 4] = gout27; + eri_tensor[3*naux + 5] = gout33; + eri_tensor[4*naux + 0] = gout4; + eri_tensor[4*naux + 1] = gout10; + eri_tensor[4*naux + 2] = gout16; + eri_tensor[4*naux + 3] = gout22; + eri_tensor[4*naux + 4] = gout28; + eri_tensor[4*naux + 5] = gout34; + eri_tensor[5*naux + 0] = gout5; + eri_tensor[5*naux + 1] = gout11; + eri_tensor[5*naux + 2] = gout17; + eri_tensor[5*naux + 3] = gout23; + eri_tensor[5*naux + 4] = gout29; + eri_tensor[5*naux + 5] = gout35; + } +} + +__device__ +void int3c2e_bdiv_212(double *out, Int3c2eEnvVars envs, BDiv3c2eBounds bounds) +{ + int sp_block_id = gridDim.x - blockIdx.x - 1; + int ksh_block_id = gridDim.y - blockIdx.y - 1; + int shl_pair0 = bounds.shl_pair_offsets[sp_block_id]; + int shl_pair1 = bounds.shl_pair_offsets[sp_block_id+1]; + int ksh0 = bounds.ksh_offsets[ksh_block_id]; + int ksh1 = bounds.ksh_offsets[ksh_block_id+1]; + int nksh = ksh1 - ksh0; + int nshl_pair = shl_pair1 - shl_pair0; + int bas_ij0 = bounds.bas_ij_idx[shl_pair0]; + int nbas = envs.nbas; + int ish0 = bas_ij0 / nbas; + int jsh0 = bas_ij0 % nbas; + + int thread_id = threadIdx.x; + int st_id = thread_id % 64; + int gout_id = thread_id / 64; + int *bas = envs.bas; + int iprim = bas[ish0*BAS_SLOTS+NPRIM_OF]; + int jprim = bas[jsh0*BAS_SLOTS+NPRIM_OF]; + int kprim = bas[ksh0*BAS_SLOTS+NPRIM_OF]; + int ijprim = iprim * jprim; + int ijkprim = ijprim * kprim; + int nroots = 3; + double *env = envs.env; + double omega = env[PTR_RANGE_OMEGA]; + extern __shared__ double rw_cache[]; + double *rw = rw_cache + st_id; + double *gx = rw + nroots * 128; + double *gy = gx + 1152; + double *gz = gy + 1152; + double *Rpq = gz + 1152; + double *rjri = Rpq + 192; + int naux = bounds.naux; + double *out_local = out + bounds.ao_pair_loc[sp_block_id] * naux; + + if (gout_id == 0) { + gx[0] = 1.; + } + + int nst = nshl_pair * nksh; + for (int ijk_idx = st_id; ijk_idx < nst+st_id; ijk_idx += 64) { + int shl_pair_in_block = 
ijk_idx / nksh; + int ksh_in_block = ijk_idx % nksh; + if (ijk_idx >= nst) { + shl_pair_in_block = 0; + if (gout_id == 0) { + gx[0] = 0.; + } + } + int ksh = ksh_in_block + ksh0; + int bas_ij = bounds.bas_ij_idx[shl_pair_in_block + shl_pair0]; + int ish = bas_ij / nbas; + int jsh = bas_ij % nbas; + double *expi = env + bas[ish*BAS_SLOTS+PTR_EXP]; + double *expj = env + bas[jsh*BAS_SLOTS+PTR_EXP]; + double *expk = env + bas[ksh*BAS_SLOTS+PTR_EXP]; + double *ci = env + bas[ish*BAS_SLOTS+PTR_COEFF]; + double *cj = env + bas[jsh*BAS_SLOTS+PTR_COEFF]; + double *ck = env + bas[ksh*BAS_SLOTS+PTR_COEFF]; + double *ri = env + bas[ish*BAS_SLOTS+PTR_BAS_COORD]; + double *rj = env + bas[jsh*BAS_SLOTS+PTR_BAS_COORD]; + double *rk = env + bas[ksh*BAS_SLOTS+PTR_BAS_COORD]; + if (gout_id == 0) { + double xjxi = rj[0] - ri[0]; + double yjyi = rj[1] - ri[1]; + double zjzi = rj[2] - ri[2]; + double rr_ij = xjxi*xjxi + yjyi*yjyi + zjzi*zjzi; + rjri[0] = xjxi; + rjri[64] = yjyi; + rjri[128] = zjzi; + rjri[192] = rr_ij; + } + double gout0 = 0; + double gout1 = 0; + double gout2 = 0; + double gout3 = 0; + double gout4 = 0; + double gout5 = 0; + double gout6 = 0; + double gout7 = 0; + double gout8 = 0; + double gout9 = 0; + double gout10 = 0; + double gout11 = 0; + double gout12 = 0; + double gout13 = 0; + double gout14 = 0; + double gout15 = 0; + double gout16 = 0; + double gout17 = 0; + double gout18 = 0; + double gout19 = 0; + double gout20 = 0; + double gout21 = 0; + double gout22 = 0; + double gout23 = 0; + double gout24 = 0; + double gout25 = 0; + double gout26 = 0; + double s0, s1, s2; + for (int ijkp = 0; ijkp < ijkprim; ++ijkp) { + int ijp = ijkp / kprim; + int kp = ijkp % kprim; + int ip = ijp / jprim; + int jp = ijp % jprim; + double ai = expi[ip]; + double aj = expj[jp]; + double ak = expk[kp]; + double aij = ai + aj; + double aj_aij = aj / aij; + __syncthreads(); + double xij = rjri[0] * aj_aij + ri[0]; + double yij = rjri[64] * aj_aij + ri[1]; + double zij = rjri[128] * 
aj_aij + ri[2]; + double xpq = xij - rk[0]; + double ypq = yij - rk[1]; + double zpq = zij - rk[2]; + if (gout_id == 0) { + double cijk = ci[ip] * cj[jp] * ck[kp]; + double fac = PI_FAC * cijk / (aij*ak*sqrt(aij+ak)); + double theta_ij = ai * aj_aij; + double Kab = theta_ij * rjri[192]; + gy[0] = fac * exp(-Kab); + Rpq[0] = xpq; + Rpq[64] = ypq; + Rpq[128] = zpq; + } + double rr = xpq * xpq + ypq * ypq + zpq * zpq; + double theta = aij * ak / (aij + ak); + double theta_rr = theta * rr; + if (omega == 0) { + rys_roots(3, theta_rr, rw, 64, gout_id, 4); + } else if (omega > 0) { + double omega2 = omega * omega; + double theta_fac = omega2 / (omega2 + theta); + rys_roots(3, theta_fac*theta_rr, rw, 64, gout_id, 4); + __syncthreads(); + double sqrt_theta_fac = sqrt(theta_fac); + for (int irys = gout_id; irys < 3; irys+=4) { + rw[ irys*2 *64] *= theta_fac; + rw[(irys*2+1)*64] *= sqrt_theta_fac; + } + } else { + double omega2 = omega * omega; + double theta_fac = omega2 / (omega2 + theta); + double *rw1 = rw + 384; + rys_roots(3, theta_rr, rw1, 64, gout_id, 4); + rys_roots(3, theta_fac*theta_rr, rw, 64, gout_id, 4); + __syncthreads(); + double sqrt_theta_fac = -sqrt(theta_fac); + for (int irys = gout_id; irys < 3; irys+=4) { + rw[ irys*2 *64] *= theta_fac; + rw[(irys*2+1)*64] *= sqrt_theta_fac; + } + } + for (int irys = 0; irys < nroots; ++irys) { + __syncthreads(); + double rt = rw[irys*128]; + double rt_aa = rt / (aij + ak); + double rt_aij = rt_aa * ak; + double b10 = .5/aij * (1 - rt_aij); + double rt_ak = rt_aa * aij; + double b00 = .5 * rt_aa; + double b01 = .5/ak * (1 - rt_ak); + for (int n = gout_id; n < 3; n += 4) { + if (n == 2) { + gz[0] = rw[irys*128+64]; + } + double *_gx = gx + n * 1152; + double xjxi = rjri[n * 64]; + double Rpa = xjxi * aj_aij; + double c0x = Rpa - rt_aij * Rpq[n * 64]; + s0 = _gx[0]; + s1 = c0x * s0; + _gx[64] = s1; + s2 = c0x * s1 + 1 * b10 * s0; + _gx[128] = s2; + s0 = s1; + s1 = s2; + s2 = c0x * s1 + 2 * b10 * s0; + _gx[192] = s2; + 
double cpx = rt_ak * Rpq[n * 64]; + s0 = _gx[0]; + s1 = cpx * s0; + _gx[384] = s1; + s2 = cpx*s1 + 1 * b01 *s0; + _gx[768] = s2; + s0 = _gx[64]; + s1 = cpx * s0; + s1 += 1 * b00 * _gx[0]; + _gx[448] = s1; + s2 = cpx*s1 + 1 * b01 *s0; + s2 += 1 * b00 * _gx[384]; + _gx[832] = s2; + s0 = _gx[128]; + s1 = cpx * s0; + s1 += 2 * b00 * _gx[64]; + _gx[512] = s1; + s2 = cpx*s1 + 1 * b01 *s0; + s2 += 2 * b00 * _gx[448]; + _gx[896] = s2; + s0 = _gx[192]; + s1 = cpx * s0; + s1 += 3 * b00 * _gx[128]; + _gx[576] = s1; + s2 = cpx*s1 + 1 * b01 *s0; + s2 += 3 * b00 * _gx[512]; + _gx[960] = s2; + s1 = _gx[192]; + s0 = _gx[128]; + _gx[320] = s1 - xjxi * s0; + s1 = s0; + s0 = _gx[64]; + _gx[256] = s1 - xjxi * s0; + s1 = s0; + s0 = _gx[0]; + _gx[192] = s1 - xjxi * s0; + s1 = _gx[576]; + s0 = _gx[512]; + _gx[704] = s1 - xjxi * s0; + s1 = s0; + s0 = _gx[448]; + _gx[640] = s1 - xjxi * s0; + s1 = s0; + s0 = _gx[384]; + _gx[576] = s1 - xjxi * s0; + s1 = _gx[960]; + s0 = _gx[896]; + _gx[1088] = s1 - xjxi * s0; + s1 = s0; + s0 = _gx[832]; + _gx[1024] = s1 - xjxi * s0; + s1 = s0; + s0 = _gx[768]; + _gx[960] = s1 - xjxi * s0; + } + __syncthreads(); + switch (gout_id) { + case 0: + gout0 += gx[1088] * gy[0] * gz[0]; + gout1 += gx[320] * gy[384] * gz[384]; + gout2 += gx[640] * gy[64] * gz[384]; + gout3 += gx[1024] * gy[0] * gz[64]; + gout4 += gx[256] * gy[384] * gz[448]; + gout5 += gx[576] * gy[128] * gz[384]; + gout6 += gx[960] * gy[64] * gz[64]; + gout7 += gx[192] * gy[448] * gz[448]; + gout8 += gx[576] * gy[0] * gz[512]; + gout9 += gx[896] * gy[192] * gz[0]; + gout10 += gx[128] * gy[576] * gz[384]; + gout11 += gx[448] * gy[256] * gz[384]; + gout12 += gx[832] * gy[192] * gz[64]; + gout13 += gx[64] * gy[576] * gz[448]; + gout14 += gx[384] * gy[320] * gz[384]; + gout15 += gx[768] * gy[256] * gz[64]; + gout16 += gx[0] * gy[640] * gz[448]; + gout17 += gx[384] * gy[192] * gz[512]; + gout18 += gx[896] * gy[0] * gz[192]; + gout19 += gx[128] * gy[384] * gz[576]; + gout20 += gx[448] * gy[64] * gz[576]; 
+ gout21 += gx[832] * gy[0] * gz[256]; + gout22 += gx[64] * gy[384] * gz[640]; + gout23 += gx[384] * gy[128] * gz[576]; + gout24 += gx[768] * gy[64] * gz[256]; + gout25 += gx[0] * gy[448] * gz[640]; + gout26 += gx[384] * gy[0] * gz[704]; + break; + case 1: + gout0 += gx[704] * gy[384] * gz[0]; + gout1 += gx[320] * gy[0] * gz[768]; + gout2 += gx[256] * gy[832] * gz[0]; + gout3 += gx[640] * gy[384] * gz[64]; + gout4 += gx[256] * gy[0] * gz[832]; + gout5 += gx[192] * gy[896] * gz[0]; + gout6 += gx[576] * gy[448] * gz[64]; + gout7 += gx[192] * gy[64] * gz[832]; + gout8 += gx[192] * gy[768] * gz[128]; + gout9 += gx[512] * gy[576] * gz[0]; + gout10 += gx[128] * gy[192] * gz[768]; + gout11 += gx[64] * gy[1024] * gz[0]; + gout12 += gx[448] * gy[576] * gz[64]; + gout13 += gx[64] * gy[192] * gz[832]; + gout14 += gx[0] * gy[1088] * gz[0]; + gout15 += gx[384] * gy[640] * gz[64]; + gout16 += gx[0] * gy[256] * gz[832]; + gout17 += gx[0] * gy[960] * gz[128]; + gout18 += gx[512] * gy[384] * gz[192]; + gout19 += gx[128] * gy[0] * gz[960]; + gout20 += gx[64] * gy[832] * gz[192]; + gout21 += gx[448] * gy[384] * gz[256]; + gout22 += gx[64] * gy[0] * gz[1024]; + gout23 += gx[0] * gy[896] * gz[192]; + gout24 += gx[384] * gy[448] * gz[256]; + gout25 += gx[0] * gy[64] * gz[1024]; + gout26 += gx[0] * gy[768] * gz[320]; + break; + case 2: + gout0 += gx[704] * gy[0] * gz[384]; + gout1 += gx[1024] * gy[64] * gz[0]; + gout2 += gx[256] * gy[448] * gz[384]; + gout3 += gx[640] * gy[0] * gz[448]; + gout4 += gx[960] * gy[128] * gz[0]; + gout5 += gx[192] * gy[512] * gz[384]; + gout6 += gx[576] * gy[64] * gz[448]; + gout7 += gx[960] * gy[0] * gz[128]; + gout8 += gx[192] * gy[384] * gz[512]; + gout9 += gx[512] * gy[192] * gz[384]; + gout10 += gx[832] * gy[256] * gz[0]; + gout11 += gx[64] * gy[640] * gz[384]; + gout12 += gx[448] * gy[192] * gz[448]; + gout13 += gx[768] * gy[320] * gz[0]; + gout14 += gx[0] * gy[704] * gz[384]; + gout15 += gx[384] * gy[256] * gz[448]; + gout16 += gx[768] * gy[192] * 
gz[128]; + gout17 += gx[0] * gy[576] * gz[512]; + gout18 += gx[512] * gy[0] * gz[576]; + gout19 += gx[832] * gy[64] * gz[192]; + gout20 += gx[64] * gy[448] * gz[576]; + gout21 += gx[448] * gy[0] * gz[640]; + gout22 += gx[768] * gy[128] * gz[192]; + gout23 += gx[0] * gy[512] * gz[576]; + gout24 += gx[384] * gy[64] * gz[640]; + gout25 += gx[768] * gy[0] * gz[320]; + gout26 += gx[0] * gy[384] * gz[704]; + break; + case 3: + gout0 += gx[320] * gy[768] * gz[0]; + gout1 += gx[640] * gy[448] * gz[0]; + gout2 += gx[256] * gy[64] * gz[768]; + gout3 += gx[256] * gy[768] * gz[64]; + gout4 += gx[576] * gy[512] * gz[0]; + gout5 += gx[192] * gy[128] * gz[768]; + gout6 += gx[192] * gy[832] * gz[64]; + gout7 += gx[576] * gy[384] * gz[128]; + gout8 += gx[192] * gy[0] * gz[896]; + gout9 += gx[128] * gy[960] * gz[0]; + gout10 += gx[448] * gy[640] * gz[0]; + gout11 += gx[64] * gy[256] * gz[768]; + gout12 += gx[64] * gy[960] * gz[64]; + gout13 += gx[384] * gy[704] * gz[0]; + gout14 += gx[0] * gy[320] * gz[768]; + gout15 += gx[0] * gy[1024] * gz[64]; + gout16 += gx[384] * gy[576] * gz[128]; + gout17 += gx[0] * gy[192] * gz[896]; + gout18 += gx[128] * gy[768] * gz[192]; + gout19 += gx[448] * gy[448] * gz[192]; + gout20 += gx[64] * gy[64] * gz[960]; + gout21 += gx[64] * gy[768] * gz[256]; + gout22 += gx[384] * gy[512] * gz[192]; + gout23 += gx[0] * gy[128] * gz[960]; + gout24 += gx[0] * gy[832] * gz[256]; + gout25 += gx[384] * gy[384] * gz[320]; + gout26 += gx[0] * gy[0] * gz[1088]; + break; + } + } + } + if (ijk_idx < nst) { + int *ao_loc = envs.ao_loc; + int k0 = ao_loc[ksh0] - ao_loc[nbas]; + double *eri_tensor = out_local + shl_pair_in_block * 18 * naux + k0 + ksh_in_block * 6; + switch (gout_id) { + case 0: + eri_tensor[0*naux + 0] = gout0; + eri_tensor[0*naux + 4] = gout1; + eri_tensor[1*naux + 2] = gout2; + eri_tensor[2*naux + 0] = gout3; + eri_tensor[2*naux + 4] = gout4; + eri_tensor[3*naux + 2] = gout5; + eri_tensor[4*naux + 0] = gout6; + eri_tensor[4*naux + 4] = gout7; + 
eri_tensor[5*naux + 2] = gout8; + eri_tensor[6*naux + 0] = gout9; + eri_tensor[6*naux + 4] = gout10; + eri_tensor[7*naux + 2] = gout11; + eri_tensor[8*naux + 0] = gout12; + eri_tensor[8*naux + 4] = gout13; + eri_tensor[9*naux + 2] = gout14; + eri_tensor[10*naux + 0] = gout15; + eri_tensor[10*naux + 4] = gout16; + eri_tensor[11*naux + 2] = gout17; + eri_tensor[12*naux + 0] = gout18; + eri_tensor[12*naux + 4] = gout19; + eri_tensor[13*naux + 2] = gout20; + eri_tensor[14*naux + 0] = gout21; + eri_tensor[14*naux + 4] = gout22; + eri_tensor[15*naux + 2] = gout23; + eri_tensor[16*naux + 0] = gout24; + eri_tensor[16*naux + 4] = gout25; + eri_tensor[17*naux + 2] = gout26; + break; + case 1: + eri_tensor[0*naux + 1] = gout0; + eri_tensor[0*naux + 5] = gout1; + eri_tensor[1*naux + 3] = gout2; + eri_tensor[2*naux + 1] = gout3; + eri_tensor[2*naux + 5] = gout4; + eri_tensor[3*naux + 3] = gout5; + eri_tensor[4*naux + 1] = gout6; + eri_tensor[4*naux + 5] = gout7; + eri_tensor[5*naux + 3] = gout8; + eri_tensor[6*naux + 1] = gout9; + eri_tensor[6*naux + 5] = gout10; + eri_tensor[7*naux + 3] = gout11; + eri_tensor[8*naux + 1] = gout12; + eri_tensor[8*naux + 5] = gout13; + eri_tensor[9*naux + 3] = gout14; + eri_tensor[10*naux + 1] = gout15; + eri_tensor[10*naux + 5] = gout16; + eri_tensor[11*naux + 3] = gout17; + eri_tensor[12*naux + 1] = gout18; + eri_tensor[12*naux + 5] = gout19; + eri_tensor[13*naux + 3] = gout20; + eri_tensor[14*naux + 1] = gout21; + eri_tensor[14*naux + 5] = gout22; + eri_tensor[15*naux + 3] = gout23; + eri_tensor[16*naux + 1] = gout24; + eri_tensor[16*naux + 5] = gout25; + eri_tensor[17*naux + 3] = gout26; + break; + case 2: + eri_tensor[0*naux + 2] = gout0; + eri_tensor[1*naux + 0] = gout1; + eri_tensor[1*naux + 4] = gout2; + eri_tensor[2*naux + 2] = gout3; + eri_tensor[3*naux + 0] = gout4; + eri_tensor[3*naux + 4] = gout5; + eri_tensor[4*naux + 2] = gout6; + eri_tensor[5*naux + 0] = gout7; + eri_tensor[5*naux + 4] = gout8; + eri_tensor[6*naux + 2] = gout9; + 
eri_tensor[7*naux + 0] = gout10; + eri_tensor[7*naux + 4] = gout11; + eri_tensor[8*naux + 2] = gout12; + eri_tensor[9*naux + 0] = gout13; + eri_tensor[9*naux + 4] = gout14; + eri_tensor[10*naux + 2] = gout15; + eri_tensor[11*naux + 0] = gout16; + eri_tensor[11*naux + 4] = gout17; + eri_tensor[12*naux + 2] = gout18; + eri_tensor[13*naux + 0] = gout19; + eri_tensor[13*naux + 4] = gout20; + eri_tensor[14*naux + 2] = gout21; + eri_tensor[15*naux + 0] = gout22; + eri_tensor[15*naux + 4] = gout23; + eri_tensor[16*naux + 2] = gout24; + eri_tensor[17*naux + 0] = gout25; + eri_tensor[17*naux + 4] = gout26; + break; + case 3: + eri_tensor[0*naux + 3] = gout0; + eri_tensor[1*naux + 1] = gout1; + eri_tensor[1*naux + 5] = gout2; + eri_tensor[2*naux + 3] = gout3; + eri_tensor[3*naux + 1] = gout4; + eri_tensor[3*naux + 5] = gout5; + eri_tensor[4*naux + 3] = gout6; + eri_tensor[5*naux + 1] = gout7; + eri_tensor[5*naux + 5] = gout8; + eri_tensor[6*naux + 3] = gout9; + eri_tensor[7*naux + 1] = gout10; + eri_tensor[7*naux + 5] = gout11; + eri_tensor[8*naux + 3] = gout12; + eri_tensor[9*naux + 1] = gout13; + eri_tensor[9*naux + 5] = gout14; + eri_tensor[10*naux + 3] = gout15; + eri_tensor[11*naux + 1] = gout16; + eri_tensor[11*naux + 5] = gout17; + eri_tensor[12*naux + 3] = gout18; + eri_tensor[13*naux + 1] = gout19; + eri_tensor[13*naux + 5] = gout20; + eri_tensor[14*naux + 3] = gout21; + eri_tensor[15*naux + 1] = gout22; + eri_tensor[15*naux + 5] = gout23; + eri_tensor[16*naux + 3] = gout24; + eri_tensor[17*naux + 1] = gout25; + eri_tensor[17*naux + 5] = gout26; + break; + } + } + } +} + +/* Dispatch to the hand-unrolled 3-center kernel int3c2e_bdiv_<li><lj><lk> selected by the angular momenta of this thread block's first shell pair and first k shell. Returns 1 when one of the specialised kernels listed in the switch was called, 0 when the (li, lj, lk) combination has no case and this function computed nothing. NOTE(review): the caller presumably falls back to a generic kernel on 0 - not visible here, confirm. */ +__device__ +int int3c2e_bdiv_unrolled(double *out, Int3c2eEnvVars envs, BDiv3c2eBounds bounds) +{ + int sp_block_id = gridDim.x - blockIdx.x - 1; /* shell-pair block id, counted in reverse of blockIdx.x */ + int ksh_block_id = gridDim.y - blockIdx.y - 1; /* k-shell block id, counted in reverse of blockIdx.y */ + int shl_pair0 = bounds.shl_pair_offsets[sp_block_id]; + int ksh0 = bounds.ksh_offsets[ksh_block_id]; + int bas_ij0 = bounds.bas_ij_idx[shl_pair0]; /* first pair of the block, packed as ish0*nbas + jsh0 (see the divide and modulo that follow) */ + int nbas = envs.nbas; + int ish0 = bas_ij0 / nbas; + 
int jsh0 = bas_ij0 % nbas; + int *bas = envs.bas; + int li = bas[ish0*BAS_SLOTS+ANG_OF]; + int lj = bas[jsh0*BAS_SLOTS+ANG_OF]; + int lk = bas[ksh0*BAS_SLOTS+ANG_OF]; /* only the first shells of the block are inspected; presumably every shell in a block shares these l values - TODO confirm against the code that builds bounds */ + int kij_type = lk*25 + li*5 + lj; /* single switch key; kernel suffix digits are li, lj, lk, e.g. key 5 (li=1) calls _100 and key 25 (lk=1) calls _001 */ + switch (kij_type) { + case 0: int3c2e_bdiv_000(out, envs, bounds); break; + case 5: int3c2e_bdiv_100(out, envs, bounds); break; + case 6: int3c2e_bdiv_110(out, envs, bounds); break; + case 10: int3c2e_bdiv_200(out, envs, bounds); break; + case 11: int3c2e_bdiv_210(out, envs, bounds); break; + case 12: int3c2e_bdiv_220(out, envs, bounds); break; + case 25: int3c2e_bdiv_001(out, envs, bounds); break; + case 30: int3c2e_bdiv_101(out, envs, bounds); break; + case 31: int3c2e_bdiv_111(out, envs, bounds); break; + case 35: int3c2e_bdiv_201(out, envs, bounds); break; + case 36: int3c2e_bdiv_211(out, envs, bounds); break; + case 37: int3c2e_bdiv_221(out, envs, bounds); break; + case 50: int3c2e_bdiv_002(out, envs, bounds); break; + case 55: int3c2e_bdiv_102(out, envs, bounds); break; + case 56: int3c2e_bdiv_112(out, envs, bounds); break; + case 60: int3c2e_bdiv_202(out, envs, bounds); break; + case 61: int3c2e_bdiv_212(out, envs, bounds); break; + default: return 0; /* no unrolled kernel for this key */ + } + return 1; +} diff --git a/gpu4pyscf/pbc/df/int3c2e.py b/gpu4pyscf/pbc/df/int3c2e.py index f92b6ef6..8e53289f 100644 --- a/gpu4pyscf/pbc/df/int3c2e.py +++ b/gpu4pyscf/pbc/df/int3c2e.py @@ -287,7 +287,8 @@ def int3c2e_kernel(self, cutoff=None, verbose=None): gen_img_idx = create_img_idx(cell, bvkcell, auxcell, Ls, int3c2e_envs) uniq_l = uniq_l_ctr[:,0] - n_groups = np.count_nonzero(uniq_l <= LMAX) + assert uniq_l.max() <= LMAX + n_groups = len(uniq_l) init_constant(cell) kern = libpbc.fill_int3c2e cp.cuda.Stream.null.synchronize() diff --git a/gpu4pyscf/scf/int4c2e.py b/gpu4pyscf/scf/int4c2e.py index c0874ca1..b40377cc 100644 --- a/gpu4pyscf/scf/int4c2e.py +++ b/gpu4pyscf/scf/int4c2e.py @@ -20,7 +20,7 @@ import cupy from pyscf import gto from pyscf.scf import _vhf -from gpu4pyscf.lib.cupy_helper import 
block_c2s_diag, cart2sph, block_diag, contract, load_library, c2s_l +from gpu4pyscf.lib.cupy_helper import block_c2s_diag, cart2sph, block_diag, contract, load_library from gpu4pyscf.lib import logger from gpu4pyscf.gto.mole import basis_seg_contraction From c65406b865a4a1f4469ad51697e0347c437ef0c9 Mon Sep 17 00:00:00 2001 From: "xiaojie.wu" Date: Thu, 27 Feb 2025 03:14:07 +0800 Subject: [PATCH 5/6] disable unused build --- builder/build_libxc.sh | 2 +- gpu4pyscf/lib/CMakeLists.txt | 17 ++++++++++++++--- 2 files changed, 15 insertions(+), 4 deletions(-) diff --git a/builder/build_libxc.sh b/builder/build_libxc.sh index 486b935f..dece8b43 100644 --- a/builder/build_libxc.sh +++ b/builder/build_libxc.sh @@ -23,7 +23,7 @@ rm -rf /gpu4pyscf/put4pyscf/lib/*.so setup_dir=$(dirname $0) -cmake -S /gpu4pyscf/gpu4pyscf/lib -B build/temp.gpu4pyscf-libxc -DBUILD_DFTD3=OFF -DBUILD_DFTD4=OFF -DBUILD_GINT=OFF -DBUILD_GVHF=OFF -DBUILD_GDFT=OFF -DBUILD_CUPY_HELPER=OFF -DBUILD_SOLVENT=OFF +cmake -S /gpu4pyscf/gpu4pyscf/lib -B build/temp.gpu4pyscf-libxc -DBUILD_GINT=OFF -DBUILD_GVHF=OFF -DBUILD_GDFT=OFF -DBUILD_CUPY_HELPER=OFF -DBUILD_SOLVENT=OFF -DBUILD_GVHF_RYS=OFF -DBUILD_GVHF_MD=OFF -DBUILD_PBC=OFF -DCUDA_ARCHITECTURES="70-real" cmake --build build/temp.gpu4pyscf-libxc -j 4 mkdir -p build/lib.gpu4pyscf-libxc/gpu4pyscf/lib/deps/lib diff --git a/gpu4pyscf/lib/CMakeLists.txt b/gpu4pyscf/lib/CMakeLists.txt index 521c2361..bb275d18 100644 --- a/gpu4pyscf/lib/CMakeLists.txt +++ b/gpu4pyscf/lib/CMakeLists.txt @@ -144,9 +144,20 @@ if(BUILD_SOLVENT) add_subdirectory(solvent) endif() -add_subdirectory(gvhf-rys) -add_subdirectory(gvhf-md) -add_subdirectory(pbc) +option(BUILD_GVHF_RYS "Using gvhf-rys" ON) +if(BUILD_GVHF_RYS) + add_subdirectory(gvhf-rys) +endif() + +option(BUILD_GVHF_MD "Using gvhf-md" ON) +if(BUILD_GVHF_MD) + add_subdirectory(gvhf-md) +endif() + +option(BUILD_PBC "Using PBC" ON) +if(BUILD_PBC) + add_subdirectory(pbc) +endif() option(BUILD_LIBXC "Using libxc for DFT" ON) 
if(BUILD_LIBXC) From 1d805bd9cf7d2af99e707d4c5d364c94dc0fc226 Mon Sep 17 00:00:00 2001 From: "xiaojie.wu" Date: Thu, 27 Feb 2025 03:28:13 +0800 Subject: [PATCH 6/6] add gint-rys back --- gpu4pyscf/lib/CMakeLists.txt | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/gpu4pyscf/lib/CMakeLists.txt b/gpu4pyscf/lib/CMakeLists.txt index bb275d18..286060be 100644 --- a/gpu4pyscf/lib/CMakeLists.txt +++ b/gpu4pyscf/lib/CMakeLists.txt @@ -144,6 +144,11 @@ if(BUILD_SOLVENT) add_subdirectory(solvent) endif() +option(BUILD_GINT_RYS "Using gint-rys" ON) +if(BUILD_GINT_RYS) + add_subdirectory(gint-rys) +endif() + option(BUILD_GVHF_RYS "Using gvhf-rys" ON) if(BUILD_GVHF_RYS) add_subdirectory(gvhf-rys) @@ -154,7 +159,7 @@ if(BUILD_GVHF_MD) add_subdirectory(gvhf-md) endif() -option(BUILD_PBC "Using PBC" ON) +option(BUILD_PBC "Using pbc" ON) if(BUILD_PBC) add_subdirectory(pbc) endif()