
Merge pull request #139 from denghuilu/devel-up
Fix prod_force GPU kernels producing wrong output
amcadmus authored Dec 16, 2019
2 parents 324c527 + bf9ba83 commit ab355d0
Showing 4 changed files with 30 additions and 11 deletions.
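Why the kernels produced wrong output: the old launchers passed ndescrpt directly as the thread-block size, as in deriv_wrt_center_atom_se_a<<<dim3(nloc, 3), ndescrpt>>>(...). CUDA caps a block at 1024 threads, so once the descriptor dimension grows past that (for se_a it scales with the neighbor count), the launch is invalid; with no error check after the launch, the force buffer presumably keeps its cudaMemset value and the result is silently wrong. The fix tiles ndescrpt over blocks of 256 x 3 threads (768 per block, within the limit) and guards the partial last block with an idy >= ndescrpt check. A runnable sketch of the fixed pattern follows the prod_force_se_a.cu diff below.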
5 changes: 5 additions & 0 deletions source/CMakeLists.txt
@@ -170,6 +170,11 @@ include_directories(${TensorFlow_INCLUDE_DIRS})
 if (BUILD_CPP_IF)
   set (LIB_DEEPMD "deepmd")
   set (LIB_DEEPMD_OP "deepmd_op")
+  if (USE_CUDA_TOOLKIT)
+    set (LIB_DEEPMD_OP_CUDA "deepmd_op_cuda")
+  else()
+    set (LIB_DEEPMD_OP_CUDA "")
+  endif()
   if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 4.9)
     set (LIB_DEEPMD_NATIVE "deepmd_native_md")
     set (LIB_DEEPMD_IPI "deepmd_ipi")
2 changes: 1 addition & 1 deletion source/lmp/env.sh.in
@@ -8,4 +8,4 @@ TF_RPATH=`echo $TENSORFLOW_LIBRARY_PATH | sed "s/;/ -Wl,-rpath=/g"`

 NNP_INC=" -std=c++11 @PREC_DEF@ @TTM_DEF@ -I$TF_INCLUDE_DIRS -I$DEEPMD_ROOT/include/deepmd "
 NNP_PATH=" -L$TF_LIBRARY_PATH -L$DEEPMD_ROOT/lib"
-NNP_LIB=" -Wl,--no-as-needed -l@LIB_DEEPMD_OP@ -l@LIB_DEEPMD@ -ldeepmd_op_cuda -ltensorflow_cc -ltensorflow_framework -Wl,-rpath=$TF_RPATH -Wl,-rpath=$DEEPMD_ROOT/lib"
+NNP_LIB=" -Wl,--no-as-needed -l@LIB_DEEPMD_OP@ -l@LIB_DEEPMD_OP_CUDA@ -l@LIB_DEEPMD@ -ltensorflow_cc -ltensorflow_framework -Wl,-rpath=$TF_RPATH -Wl,-rpath=$DEEPMD_ROOT/lib"
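Together with the CMakeLists.txt change above, the link line is now configurable: @LIB_DEEPMD_OP_CUDA@ expands to deepmd_op_cuda only when USE_CUDA_TOOLKIT is set and to the empty string otherwise, so CPU-only builds no longer hard-code -ldeepmd_op_cuda into the LAMMPS environment.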
17 changes: 12 additions & 5 deletions source/op/cuda/prod_force_se_a.cu
@@ -37,9 +37,13 @@ __global__ void deriv_wrt_center_atom_se_a(VALUETYPE * force,
                                            const VALUETYPE * in_deriv,
                                            const int ndescrpt)
 {
-    const unsigned int idx = blockIdx.x;
-    const unsigned int idy = threadIdx.x;
-    const unsigned int idz = blockIdx.y;
+    const unsigned int idx = blockIdx.y;
+    const unsigned int idy = blockIdx.x * blockDim.x + threadIdx.x;
+    const unsigned int idz = threadIdx.y;
+
+    if (idy >= ndescrpt) {
+        return;
+    }
 
     atomicAdd(force + idx * 3 + idz, -1.0 * net_deriv[idx * ndescrpt + idy] * in_deriv[idx * ndescrpt * 3 + idy * 3 + idz]);
 }
@@ -84,8 +88,11 @@ void ProdForceSeALauncher(VALUETYPE * force,
 {
     // std::cout << "I'm here!" << std::endl;
     cudaErrcheck(cudaMemset(force, 0.0, sizeof(VALUETYPE) * nall * 3));
-    dim3 grid(nloc, 3);
-    deriv_wrt_center_atom_se_a<<<grid, ndescrpt>>>(force, net_deriv, in_deriv, ndescrpt);
+    const int LEN1 = 256;
+    const int nblock1 = (ndescrpt + LEN1 -1) / LEN1;
+    dim3 grid(nblock1, nloc);
+    dim3 thread(LEN1, 3);
+    deriv_wrt_center_atom_se_a<<<grid, thread>>>(force, net_deriv, in_deriv, ndescrpt);
 
     const int LEN = 64;
     int nblock = (nloc + LEN -1) / LEN;
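Below is a minimal, self-contained sketch of the fixed launch pattern, not DeePMD-kit source: single precision, made-up sizes and data, and a generic kernel name, with the GPU reduction checked against a plain CPU loop. It deliberately uses ndescrpt = 1500 > 1024, the case the old one-block-per-atom launch could not cover.

// prod_force_check.cu -- hedged sketch, not repository code.
// Build: nvcc -o prod_force_check prod_force_check.cu
#include <cstdio>
#include <cmath>
#include <vector>
#include <cuda_runtime.h>

__global__ void deriv_wrt_center_atom(float * force,
                                      const float * net_deriv,
                                      const float * in_deriv,
                                      const int ndescrpt)
{
    const unsigned int idx = blockIdx.y;                             // atom index
    const unsigned int idy = blockIdx.x * blockDim.x + threadIdx.x;  // descriptor index
    const unsigned int idz = threadIdx.y;                            // force component

    if (idy >= ndescrpt) {   // guard the partial last block
        return;
    }

    atomicAdd(force + idx * 3 + idz,
              -1.0f * net_deriv[idx * ndescrpt + idy]
                    * in_deriv[idx * ndescrpt * 3 + idy * 3 + idz]);
}

int main()
{
    const int nloc = 4;
    const int ndescrpt = 1500;   // > 1024: too large for a single thread block
    const int n = nloc * ndescrpt;

    std::vector<float> net(n), in(n * 3), out(nloc * 3);
    for (int i = 0; i < n; ++i)     net[i] = 0.001f * (i % 97);
    for (int i = 0; i < n * 3; ++i) in[i]  = 0.002f * (i % 89);

    float *d_force, *d_net, *d_in;
    cudaMalloc(&d_force, sizeof(float) * nloc * 3);
    cudaMalloc(&d_net,   sizeof(float) * n);
    cudaMalloc(&d_in,    sizeof(float) * n * 3);
    cudaMemset(d_force, 0, sizeof(float) * nloc * 3);
    cudaMemcpy(d_net, net.data(), sizeof(float) * n,     cudaMemcpyHostToDevice);
    cudaMemcpy(d_in,  in.data(),  sizeof(float) * n * 3, cudaMemcpyHostToDevice);

    // The fixed configuration: tile ndescrpt over blocks of 256 threads,
    // with the three force components in threadIdx.y (256 * 3 = 768 <= 1024).
    const int LEN1 = 256;
    const int nblock1 = (ndescrpt + LEN1 - 1) / LEN1;   // ceiling division
    dim3 grid(nblock1, nloc);
    dim3 thread(LEN1, 3);
    deriv_wrt_center_atom<<<grid, thread>>>(d_force, d_net, d_in, ndescrpt);

    cudaMemcpy(out.data(), d_force, sizeof(float) * nloc * 3, cudaMemcpyDeviceToHost);

    // CPU reference: the same reduction in plain loops.
    double max_err = 0.0;
    for (int a = 0; a < nloc; ++a)
        for (int c = 0; c < 3; ++c) {
            double ref = 0.0;
            for (int d = 0; d < ndescrpt; ++d)
                ref -= (double)net[a * ndescrpt + d]
                     * (double)in[a * ndescrpt * 3 + d * 3 + c];
            if (std::fabs(ref - out[a * 3 + c]) > max_err)
                max_err = std::fabs(ref - out[a * 3 + c]);
        }
    printf("max |GPU - CPU| = %g (small float round-off expected)\n", max_err);

    cudaFree(d_force); cudaFree(d_net); cudaFree(d_in);
    return 0;
}

Switching the launch back to <<<dim3(nloc, 3), ndescrpt>>> with ndescrpt = 1500 should instead leave the forces at zero, and cudaGetLastError() after the launch should report an invalid-configuration error.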
17 changes: 12 additions & 5 deletions source/op/cuda/prod_force_se_r.cu
@@ -36,10 +36,14 @@ __global__ void deriv_wrt_center_atom_se_r(VALUETYPE * force,
                                            const VALUETYPE * in_deriv,
                                            const int ndescrpt)
 {
-    const unsigned int idx = blockIdx.x;
-    const unsigned int idy = threadIdx.x;
-    const unsigned int idz = blockIdx.y;
+    const unsigned int idx = blockIdx.y;
+    const unsigned int idy = blockIdx.x * blockDim.x + threadIdx.x;
+    const unsigned int idz = threadIdx.y;
+
+    if (idy >= ndescrpt) {
+        return;
+    }
 
     atomicAdd(force + idx * 3 + idz, -1.0 * net_deriv[idx * ndescrpt + idy] * in_deriv[idx * ndescrpt * 3 + idy * 3 + idz]);
 }
 
@@ -81,8 +85,11 @@ void ProdForceSeRLauncher(VALUETYPE * force,
                          const int n_a_shift)
 {
     cudaErrcheck(cudaMemset(force, 0.0, sizeof(VALUETYPE) * nall * 3));
-    dim3 grid(nloc, 3);
-    deriv_wrt_center_atom_se_r<<<grid, ndescrpt>>>(force, net_deriv, in_deriv, ndescrpt);
+    const int LEN1 = 256;
+    const int nblock1 = (ndescrpt + LEN1 -1) / LEN1;
+    dim3 grid(nblock1, nloc);
+    dim3 thread(LEN1, 3);
+    deriv_wrt_center_atom_se_r<<<grid, thread>>>(force, net_deriv, in_deriv, ndescrpt);
 
     const int LEN = 64;
     int nblock = (nloc + LEN -1) / LEN;
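The prod_force_se_r.cu change is the same fix applied to the se_r variant: identical tiled launch, identical bounds guard; only the kernel and launcher names differ, so the sketch above covers both.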
