diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 0000000000..c2225c5f76 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "source/op/cuda/cub"] + path = source/op/cuda/cub + url = git://github.com/NVlabs/cub.git diff --git a/README.md b/README.md index 59965dc881..a6187acc03 100644 --- a/README.md +++ b/README.md @@ -32,6 +32,7 @@ - [Include deepmd in the pair style](#include-deepmd-in-the-pair-style) - [Long-range interaction](#long-range-interaction) - [Run path-integral MD with i-PI](#run-path-integral-md-with-i-pi) + - [Use deep potential with ASE](#use-deep-potential-with-ase) - [Troubleshooting](#troubleshooting) # About DeePMD-kit @@ -150,7 +151,7 @@ One should remember to activate the virtual environment every time he/she uses d Clone the DeePMD-kit source code ```bash cd /some/workspace -git clone https://github.com/deepmodeling/deepmd-kit.git deepmd-kit +git clone --recursive https://github.com/deepmodeling/deepmd-kit.git deepmd-kit -b devel ``` If one downloads the .zip file from the github, then the default folder of source code would be `deepmd-kit-master` rather than `deepmd-kit`. For convenience, you may want to record the location of source to a variable, saying `deepmd_source_dir` by ```bash @@ -553,6 +554,30 @@ The option **`graph_file`** provides the file name of the frozen model. The `dp_ipi` gets the atom names from an [XYZ file](https://en.wikipedia.org/wiki/XYZ_file_format) provided by **`coord_file`** (meanwhile ignores all coordinates in it), and translates the names to atom types by rules provided by **`atom_type`**. +## Use deep potential with ASE + +Deep potential can be set up as a calculator with ASE to obtain potential energies and forces. +```python +from ase import Atoms +from deepmd.calculator import DP + +water = Atoms('H2O', + positions=[(0.7601, 1.9270, 1), + (1.9575, 1, 1), + (1., 1., 1.)], + cell=[100, 100, 100], + calculator=DP(model="frozen_model.pb")) +print(water.get_potential_energy()) +print(water.get_forces()) +``` + +Optimization is also available: +```python +from ase.optimize import BFGS +dyn = BFGS(water) +dyn.run(fmax=1e-6) +print(water.get_positions()) +``` # Troubleshooting In consequence of various differences of computers or systems, problems may occur. Some common circumstances are listed as follows. 
diff --git a/deepmd/__init__.py b/deepmd/__init__.py index b2711088e8..231145b989 100644 --- a/deepmd/__init__.py +++ b/deepmd/__init__.py @@ -3,6 +3,7 @@ from .DeepPot import DeepPot from .DeepDipole import DeepDipole from .DeepPolar import DeepPolar +from .DeepPolar import DeepGlobalPolar from .DeepWFC import DeepWFC set_mkl() diff --git a/source/CMakeLists.txt b/source/CMakeLists.txt index 7b35067444..d83ffe9e0e 100644 --- a/source/CMakeLists.txt +++ b/source/CMakeLists.txt @@ -170,6 +170,11 @@ include_directories(${TensorFlow_INCLUDE_DIRS}) if (BUILD_CPP_IF) set (LIB_DEEPMD "deepmd") set (LIB_DEEPMD_OP "deepmd_op") + if (USE_CUDA_TOOLKIT) + set (LIB_DEEPMD_OP_CUDA "deepmd_op_cuda") + else() + set (LIB_DEEPMD_OP_CUDA "") + endif() if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 4.9) set (LIB_DEEPMD_NATIVE "deepmd_native_md") set (LIB_DEEPMD_IPI "deepmd_ipi") diff --git a/source/lib/include/NNPAtomMap.h b/source/lib/include/NNPAtomMap.h index e09b3186b6..c7474981e7 100644 --- a/source/lib/include/NNPAtomMap.h +++ b/source/lib/include/NNPAtomMap.h @@ -8,6 +8,7 @@ template class NNPAtomMap { public: + NNPAtomMap(); NNPAtomMap(const vector::const_iterator in_begin, const vector::const_iterator in_end); void forward (typename vector::iterator out, diff --git a/source/lib/include/NNPInter.h b/source/lib/include/NNPInter.h index a06faaa3b9..20e9bd8aaa 100644 --- a/source/lib/include/NNPInter.h +++ b/source/lib/include/NNPInter.h @@ -5,10 +5,11 @@ #include "tensorflow/core/framework/op.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/shape_inference.h" - +#include "NNPAtomMap.h" #include #include "version.h" +typedef double compute_t; using namespace tensorflow; using namespace std; @@ -53,6 +54,7 @@ class NNPInter { public: NNPInter () ; + ~NNPInter() ; NNPInter (const string & model, const int & gpu_rank = 0); void init (const string & model, const int & gpu_rank = 0); void print_summary(const string &pre) const; @@ -74,6 +76,7 @@ class NNPInter const vector & box, const int nghost, const LammpsNeighborList & lmp_list, + const int & ago, const vector & fparam = vector(), const vector & aparam = vector()); void compute (ENERGYTYPE & ener, @@ -96,6 +99,7 @@ class NNPInter const vector & box, const int nghost, const LammpsNeighborList & lmp_list, + const int & ago, const vector & fparam = vector(), const vector & aparam = vector()); VALUETYPE cutoff () const {assert(inited); return rcut;}; @@ -118,12 +122,30 @@ class NNPInter void validate_fparam_aparam(const int & nloc, const vector &fparam, const vector &aparam)const ; + + // copy neighbor list info from host + bool init_nbor; + std::vector sec_a; + compute_t *array_double; + InternalNeighborList nlist; + NNPAtomMap nnpmap; + unsigned long long *array_longlong; + int *ilist, *jrange, *jlist, *array_int; + int ilist_size, jrange_size, jlist_size; + int arr_int_size, arr_ll_size, arr_dou_size; + + // function used for neighbor list copy + vector get_sel_a() const; +#ifdef USE_CUDA_TOOLKIT + void update_nbor(const InternalNeighborList & nlist, const int nloc); +#endif }; class NNPInterModelDevi { public: NNPInterModelDevi () ; + ~NNPInterModelDevi() ; NNPInterModelDevi (const vector & models, const int & gpu_rank = 0); void init (const vector & models, const int & gpu_rank = 0); public: @@ -144,6 +166,7 @@ class NNPInterModelDevi const vector & box, const int nghost, const LammpsNeighborList & lmp_list, + const int & ago, const vector & fparam = vector(), const vector & aparam = vector()); void compute 
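// ago: the value of LAMMPS neighbor->ago forwarded by the pair style;
// ago == 0 means the LAMMPS neighbor list was rebuilt this step, so the
// cached internal neighbor list and atom map are refreshed before the
// input tensors are built, otherwise they are reused from the previous call.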
(vector & all_ener, @@ -156,6 +179,7 @@ class NNPInterModelDevi const vector & box, const int nghost, const LammpsNeighborList & lmp_list, + const int & ago, const vector & fparam = vector(), const vector & aparam = vector()); VALUETYPE cutoff () const {assert(inited); return rcut;}; @@ -176,6 +200,9 @@ class NNPInterModelDevi void compute_std_f (vector & std, const vector & avg, const vector >& xx); + void compute_relative_std_f (vector & std, + const vector & avg, + const VALUETYPE eps); private: unsigned numb_models; vector sessions; @@ -193,6 +220,25 @@ class NNPInterModelDevi void validate_fparam_aparam(const int & nloc, const vector &fparam, const vector &aparam)const ; + + // copy neighbor list info from host + bool init_nbor; + compute_t *array_double; + vector > sec; + InternalNeighborList nlist; + NNPAtomMap nnpmap; + unsigned long long *array_longlong; + int max_sec_size = 0, max_sec_back = 0; + int *ilist, *jrange, *jlist, *array_int; + int ilist_size, jrange_size, jlist_size, arr_int_size, arr_ll_size, arr_dou_size; + + // function used for nborlist copy + void get_max_sec(); + vector > get_sel() const; + void cum_sum(const std::vector > n_sel); +#ifdef USE_CUDA_TOOLKIT + void update_nbor(const InternalNeighborList & nlist, const int nloc); +#endif }; diff --git a/source/lib/src/NNPAtomMap.cc b/source/lib/src/NNPAtomMap.cc index edb410a98b..dd4fd73a00 100644 --- a/source/lib/src/NNPAtomMap.cc +++ b/source/lib/src/NNPAtomMap.cc @@ -3,6 +3,10 @@ #include #include +template +NNPAtomMap:: +NNPAtomMap() {} + template NNPAtomMap:: NNPAtomMap(const vector::const_iterator in_begin, diff --git a/source/lib/src/NNPInter.cc b/source/lib/src/NNPInter.cc index fa1f989b49..47a66893c3 100644 --- a/source/lib/src/NNPInter.cc +++ b/source/lib/src/NNPInter.cc @@ -2,11 +2,25 @@ #include "NNPAtomMap.h" #include "SimulationRegion.h" #include + +#define MAGIC_NUMBER 256 +typedef double compute_t; + #ifdef USE_CUDA_TOOLKIT #include "cuda_runtime.h" #include #include #include + +#define cudaErrcheck(res) { cudaAssert((res), __FILE__, __LINE__); } +inline void cudaAssert(cudaError_t code, const char *file, int line, bool abort=true) +{ + if (code != cudaSuccess) + { + fprintf(stderr,"cuda assert: %s %s %d\n", cudaGetErrorString(code), file, line); + if (abort) exit(code); + } +} #endif static @@ -18,6 +32,17 @@ checkStatus(const tensorflow::Status& status) { } } +static +std::vector cum_sum (const std::vector & n_sel) { + std::vector sec; + sec.resize (n_sel.size() + 1); + sec[0] = 0; + for (int ii = 1; ii < sec.size(); ++ii) { + sec[ii] = sec[ii-1] + n_sel[ii-1]; + } + return sec; +} + static void convert_nlist_lmp_internal (InternalNeighborList & list, const LammpsNeighborList & lmp_list) @@ -333,6 +358,137 @@ make_input_tensors (std::vector> & input_tensors, return nloc; } +static int make_input_tensors ( + vector> & input_tensors, + const vector & dcoord_, + const int & ntypes, + const vector & datype_, + const vector & dbox, + const int * ilist, + const int * jrange, + const int * jlist, + int * array_int, + unsigned long long * array_longlong, + compute_t * array_double, + const vector & fparam_, + const vector & aparam_, + const NNPAtomMap & nnpmap, + const int & nghost) +{ + assert (dbox.size() == 9); + + int nframes = 1; + int nall = dcoord_.size() / 3; + int nloc = nall - nghost; + assert (nall == datype_.size()); + + vector datype = nnpmap.get_type(); + vector type_count (ntypes, 0); + for (unsigned ii = 0; ii < datype.size(); ++ii) { + type_count[datype[ii]] ++; + } + datype.insert 
(datype.end(), datype_.begin() + nloc, datype_.end()); + + TensorShape coord_shape ; + coord_shape.AddDim (nframes); + coord_shape.AddDim (nall * 3); + TensorShape type_shape ; + type_shape.AddDim (nframes); + type_shape.AddDim (nall); + TensorShape box_shape ; + box_shape.AddDim (nframes); + box_shape.AddDim (9); + TensorShape mesh_shape; + mesh_shape.AddDim (32); + TensorShape natoms_shape; + natoms_shape.AddDim (2 + ntypes); + TensorShape fparam_shape; + fparam_shape.AddDim (nframes); + fparam_shape.AddDim (fparam_.size()); + TensorShape aparam_shape ; + aparam_shape.AddDim (nframes); + aparam_shape.AddDim (aparam_.size()); + + #ifdef HIGH_PREC + Tensor coord_tensor (DT_DOUBLE, coord_shape); + Tensor box_tensor (DT_DOUBLE, box_shape); + Tensor fparam_tensor(DT_DOUBLE, fparam_shape); + Tensor aparam_tensor(DT_DOUBLE, fparam_shape); + #else + Tensor coord_tensor (DT_FLOAT, coord_shape); + Tensor box_tensor (DT_FLOAT, box_shape); + Tensor fparam_tensor(DT_FLOAT, fparam_shape); + Tensor aparam_tensor(DT_FLOAT, fparam_shape); + #endif + Tensor type_tensor (DT_INT32, type_shape); + Tensor mesh_tensor (DT_INT32, mesh_shape); + Tensor natoms_tensor(DT_INT32, natoms_shape); + + auto coord = coord_tensor.matrix (); + auto type = type_tensor.matrix (); + auto box = box_tensor.matrix (); + auto mesh = mesh_tensor.flat (); + auto natoms = natoms_tensor.flat (); + auto fparam = fparam_tensor.matrix (); + auto aparam = aparam_tensor.matrix (); + + vector dcoord (dcoord_); + nnpmap.forward (dcoord.begin(), dcoord_.begin(), 3); + + for (int ii = 0; ii < nframes; ++ii) { + for (int jj = 0; jj < nall * 3; ++jj) { + coord(ii, jj) = dcoord[jj]; + } + for (int jj = 0; jj < 9; ++jj) { + box(ii, jj) = dbox[jj]; + } + for (int jj = 0; jj < nall; ++jj) { + type(ii, jj) = datype[jj]; + } + for (int jj = 0; jj < fparam_.size(); ++jj) { + fparam(ii, jj) = fparam_[jj]; + } + for (int jj = 0; jj < aparam_.size(); ++jj) { + aparam(ii, jj) = aparam_[jj]; + } + } + + for (int ii = 0; ii < 32; ++ii) mesh(ii) = 0; + + mesh (0) = sizeof(int *) / sizeof(int); + assert (mesh(0) * sizeof(int) == sizeof(int *)); + const int & stride = mesh(0); + // mesh (1) = dlist.ilist.size(); + mesh (1) = nloc; + assert (mesh(1) == nloc); + assert (stride <= 4); + memcpy (&mesh(4), &(ilist), sizeof(int *)); + memcpy (&mesh(8), &(jrange), sizeof(int *)); + memcpy (&mesh(12), &(jlist), sizeof(int *)); + memcpy (&mesh(16), &(array_int), sizeof(int *)); + memcpy (&mesh(20), &(array_longlong), sizeof(unsigned long long *)); + memcpy (&mesh(24), &(array_double), sizeof(compute_t *)); + + natoms (0) = nloc; + natoms (1) = nall; + for (int ii = 0; ii < ntypes; ++ii) natoms(ii+2) = type_count[ii]; + + input_tensors = { + {"t_coord", coord_tensor}, + {"t_type", type_tensor}, + {"t_box", box_tensor}, + {"t_mesh", mesh_tensor}, + {"t_natoms", natoms_tensor}, + }; + if (fparam_.size() > 0) { + input_tensors.push_back({"t_fparam", fparam_tensor}); + } + if (aparam_.size() > 0) { + input_tensors.push_back({"t_aparam", aparam_tensor}); + } + return nloc; +} + static void run_model (ENERGYTYPE & dener, vector & dforce_, @@ -356,107 +512,197 @@ run_model (ENERGYTYPE & dener, return; } +#ifdef USE_CUDA_TOOLKIT std::vector output_tensors; - checkStatus (session->Run(input_tensors, - {"o_energy", "o_force", "o_virial"}, + {"o_energy", "o_force", "o_atom_virial"}, {}, &output_tensors)); Tensor output_e = output_tensors[0]; Tensor output_f = output_tensors[1]; - Tensor output_v = output_tensors[2]; + Tensor output_av = output_tensors[2]; auto oe = output_e.flat 
(); auto of = output_f.flat (); - auto ov = output_v.flat (); + auto oav = output_av.flat (); dener = oe(0); vector dforce (3 * nall); + vector datom_virial (9 * nall); dvirial.resize (9); for (unsigned ii = 0; ii < nall * 3; ++ii){ dforce[ii] = of(ii); } - for (unsigned ii = 0; ii < 9; ++ii){ - dvirial[ii] = ov(ii); + for (int ii = 0; ii < nall * 9; ++ii) { + datom_virial[ii] = oav(ii); } + for (int ii = 0; ii < nall; ++ii) { + dvirial[0] += 1.0 * datom_virial[9*ii+0]; + dvirial[1] += 1.0 * datom_virial[9*ii+1]; + dvirial[2] += 1.0 * datom_virial[9*ii+2]; + dvirial[3] += 1.0 * datom_virial[9*ii+3]; + dvirial[4] += 1.0 * datom_virial[9*ii+4]; + dvirial[5] += 1.0 * datom_virial[9*ii+5]; + dvirial[6] += 1.0 * datom_virial[9*ii+6]; + dvirial[7] += 1.0 * datom_virial[9*ii+7]; + dvirial[8] += 1.0 * datom_virial[9*ii+8]; + } + dforce_ = dforce; nnpmap.backward (dforce_.begin(), dforce.begin(), 3); -} - -static void -run_model (ENERGYTYPE & dener, - vector & dforce_, - vector & dvirial, - vector & datom_energy_, - vector & datom_virial_, - Session * session, - const std::vector> & input_tensors, - const NNPAtomMap &nnpmap, - const int nghost = 0) -{ - unsigned nloc = nnpmap.get_type().size(); - unsigned nall = nloc + nghost; - if (nloc == 0) { - dener = 0; - // no backward map needed - // dforce of size nall * 3 - dforce_.resize(nall * 3); - fill(dforce_.begin(), dforce_.end(), 0.0); - // dvirial of size 9 - dvirial.resize(9); - fill(dvirial.begin(), dvirial.end(), 0.0); - // datom_energy_ of size nall - datom_energy_.resize(nall); - fill(datom_energy_.begin(), datom_energy_.end(), 0.0); - // datom_virial_ of size nall * 9 - datom_virial_.resize(nall * 9); - fill(datom_virial_.begin(), datom_virial_.end(), 0.0); - return; - } - +#else std::vector output_tensors; checkStatus (session->Run(input_tensors, - {"o_energy", "o_force", "o_virial", "o_atom_energy", "o_atom_virial"}, + {"o_energy", "o_force", "o_virial"}, {}, &output_tensors)); - + Tensor output_e = output_tensors[0]; Tensor output_f = output_tensors[1]; Tensor output_v = output_tensors[2]; - Tensor output_ae = output_tensors[3]; - Tensor output_av = output_tensors[4]; auto oe = output_e.flat (); auto of = output_f.flat (); auto ov = output_v.flat (); - auto oae = output_ae.flat (); - auto oav = output_av.flat (); dener = oe(0); vector dforce (3 * nall); - vector datom_energy (nall, 0); - vector datom_virial (9 * nall); dvirial.resize (9); - for (int ii = 0; ii < nall * 3; ++ii){ + for (unsigned ii = 0; ii < nall * 3; ++ii){ dforce[ii] = of(ii); } - for (int ii = 0; ii < nloc; ++ii){ - datom_energy[ii] = oae(ii); - } - for (int ii = 0; ii < nall * 9; ++ii){ - datom_virial[ii] = oav(ii); - } - for (int ii = 0; ii < 9; ++ii){ + for (unsigned ii = 0; ii < 9; ++ii){ dvirial[ii] = ov(ii); } dforce_ = dforce; - datom_energy_ = datom_energy; - datom_virial_ = datom_virial; nnpmap.backward (dforce_.begin(), dforce.begin(), 3); - nnpmap.backward (datom_energy_.begin(), datom_energy.begin(), 1); - nnpmap.backward (datom_virial_.begin(), datom_virial.begin(), 9); +#endif +} + +static void run_model (ENERGYTYPE & dener, + vector & dforce_, + vector & dvirial, + vector & datom_energy_, + vector & datom_virial_, + Session * session, + const std::vector> & input_tensors, + const NNPAtomMap & nnpmap, + const int & nghost = 0) +{ + unsigned nloc = nnpmap.get_type().size(); + unsigned nall = nloc + nghost; + if (nloc == 0) { + dener = 0; + // no backward map needed + // dforce of size nall * 3 + dforce_.resize(nall * 3); + fill(dforce_.begin(), 
dforce_.end(), 0.0); + // dvirial of size 9 + dvirial.resize(9); + fill(dvirial.begin(), dvirial.end(), 0.0); + // datom_energy_ of size nall + datom_energy_.resize(nall); + fill(datom_energy_.begin(), datom_energy_.end(), 0.0); + // datom_virial_ of size nall * 9 + datom_virial_.resize(nall * 9); + fill(datom_virial_.begin(), datom_virial_.end(), 0.0); + return; + } +#ifdef USE_CUDA_TOOLKIT + std::vector output_tensors; + + checkStatus (session->Run(input_tensors, + {"o_energy", "o_force", "o_atom_energy", "o_atom_virial"}, + {}, + &output_tensors)); + + Tensor output_e = output_tensors[0]; + Tensor output_f = output_tensors[1]; + Tensor output_ae = output_tensors[2]; + Tensor output_av = output_tensors[3]; + + auto oe = output_e.flat (); + auto of = output_f.flat (); + auto oae = output_ae.flat (); + auto oav = output_av.flat (); + + dener = oe(0); + vector dforce (3 * nall); + vector datom_energy (nall, 0); + vector datom_virial (9 * nall); + dvirial.resize (9); + for (int ii = 0; ii < nall * 3; ++ii) { + dforce[ii] = of(ii); + } + for (int ii = 0; ii < nloc; ++ii) { + datom_energy[ii] = oae(ii); + } + for (int ii = 0; ii < nall * 9; ++ii) { + datom_virial[ii] = oav(ii); + } + for (int ii = 0; ii < nall; ++ii) { + dvirial[0] += 1.0 * datom_virial[9*ii+0]; + dvirial[1] += 1.0 * datom_virial[9*ii+1]; + dvirial[2] += 1.0 * datom_virial[9*ii+2]; + dvirial[3] += 1.0 * datom_virial[9*ii+3]; + dvirial[4] += 1.0 * datom_virial[9*ii+4]; + dvirial[5] += 1.0 * datom_virial[9*ii+5]; + dvirial[6] += 1.0 * datom_virial[9*ii+6]; + dvirial[7] += 1.0 * datom_virial[9*ii+7]; + dvirial[8] += 1.0 * datom_virial[9*ii+8]; + } + dforce_ = dforce; + datom_energy_ = datom_energy; + datom_virial_ = datom_virial; + nnpmap.backward (dforce_.begin(), dforce.begin(), 3); + nnpmap.backward (datom_energy_.begin(), datom_energy.begin(), 1); + nnpmap.backward (datom_virial_.begin(), datom_virial.begin(), 9); +#else + std::vector output_tensors; + + checkStatus (session->Run(input_tensors, + {"o_energy", "o_force", "o_virial", "o_atom_energy", "o_atom_virial"}, + {}, + &output_tensors)); + + Tensor output_e = output_tensors[0]; + Tensor output_f = output_tensors[1]; + Tensor output_v = output_tensors[2]; + Tensor output_ae = output_tensors[3]; + Tensor output_av = output_tensors[4]; + + auto oe = output_e.flat (); + auto of = output_f.flat (); + auto ov = output_v.flat (); + auto oae = output_ae.flat (); + auto oav = output_av.flat (); + + dener = oe(0); + vector dforce (3 * nall); + vector datom_energy (nall, 0); + vector datom_virial (9 * nall); + dvirial.resize (9); + for (int ii = 0; ii < nall * 3; ++ii) { + dforce[ii] = of(ii); + } + for (int ii = 0; ii < nloc; ++ii) { + datom_energy[ii] = oae(ii); + } + for (int ii = 0; ii < nall * 9; ++ii) { + datom_virial[ii] = oav(ii); + } + for (int ii = 0; ii < 9; ++ii) { + dvirial[ii] = ov(ii); + } + dforce_ = dforce; + datom_energy_ = datom_energy; + datom_virial_ = datom_virial; + nnpmap.backward (dforce_.begin(), dforce.begin(), 3); + nnpmap.backward (datom_energy_.begin(), datom_energy.begin(), 1); + nnpmap.backward (datom_virial_.begin(), datom_virial.begin(), 9); +#endif } static void @@ -484,19 +730,94 @@ get_env_nthreads(int & num_intra_nthreads, NNPInter:: NNPInter () - : inited (false) + : inited (false), init_nbor (false) { get_env_nthreads(num_intra_nthreads, num_inter_nthreads); } NNPInter:: NNPInter (const string & model, const int & gpu_rank) - : inited (false) + : inited (false), init_nbor (false) { get_env_nthreads(num_intra_nthreads, num_inter_nthreads); 
init(model, gpu_rank); } +NNPInter::~NNPInter() { + #ifdef USE_CUDA_TOOLKIT + if (init_nbor) { + cudaErrcheck(cudaFree(ilist)); + cudaErrcheck(cudaFree(jrange)); + cudaErrcheck(cudaFree(jlist)); + cudaErrcheck(cudaFree(array_int)); + cudaErrcheck(cudaFree(array_longlong)); + cudaErrcheck(cudaFree(array_double)); + } + #endif +} + +#ifdef USE_CUDA_TOOLKIT +void NNPInter::update_nbor(const InternalNeighborList & nlist, const int nloc) { + if (!init_nbor) { + sec_a = cum_sum(get_sel_a()); + cudaErrcheck(cudaMalloc((void**)&ilist, sizeof(int) * nlist.ilist.size())); + cudaErrcheck(cudaMalloc((void**)&jrange, sizeof(int) * nlist.jrange.size())); + cudaErrcheck(cudaMalloc((void**)&jlist, sizeof(int) * nlist.jlist.size())); + cudaErrcheck(cudaMalloc((void**)&array_int, sizeof(int) * (sec_a.size() + nloc * sec_a.size() + nloc))); + cudaErrcheck(cudaMalloc((void**)&array_longlong, sizeof(unsigned long long) * nloc * MAGIC_NUMBER * 2)); + #ifdef HIGH_PREC + cudaErrcheck(cudaMalloc((void**)&array_double, sizeof(compute_t) * nloc * sec_a.back() * 3)); + #else + cudaErrcheck(cudaMalloc((void**)&array_double, sizeof(compute_t) * nloc * sec_a.back() * 3)); + #endif + ilist_size = nlist.ilist.size(); + jrange_size = nlist.jrange.size(); + jlist_size = nlist.jlist.size(); + arr_int_size = sec_a.size() + nloc * sec_a.size() + nloc; + arr_ll_size = nloc * MAGIC_NUMBER * 2; + arr_dou_size = nloc * sec_a.back() * 3; + init_nbor = true; + } + if (ilist_size < nlist.ilist.size()) { + cudaErrcheck(cudaFree(ilist)); + cudaErrcheck(cudaMalloc((void**)&ilist, sizeof(int) * nlist.ilist.size())); + ilist_size = nlist.ilist.size(); + } + if (jrange_size < nlist.jrange.size()) { + cudaErrcheck(cudaFree(jrange)); + cudaErrcheck(cudaMalloc((void**)&jrange, sizeof(int) * nlist.jrange.size())); + jrange_size = nlist.jrange.size(); + } + if (jlist_size < nlist.jlist.size()) { + cudaErrcheck(cudaFree(jlist)); + cudaErrcheck(cudaMalloc((void**)&jlist, sizeof(int) * nlist.jlist.size())); + jlist_size = nlist.jlist.size(); + } + if (arr_int_size < sec_a.size() + nloc * sec_a.size() + nloc) { + cudaErrcheck(cudaFree(array_int)); + cudaErrcheck(cudaMalloc((void**)&array_int, sizeof(int) * (sec_a.size() + nloc * sec_a.size() + nloc))); + arr_int_size = sec_a.size() + nloc * sec_a.size() + nloc; + } + if (arr_ll_size < nloc * MAGIC_NUMBER * 2) { + cudaErrcheck(cudaFree(array_longlong)); + cudaErrcheck(cudaMalloc((void**)&array_longlong, sizeof(unsigned long long) * nloc * MAGIC_NUMBER * 2)); + arr_ll_size = nloc * MAGIC_NUMBER * 2; + } + if (arr_dou_size < nloc * sec_a.back() * 3) { + cudaErrcheck(cudaFree(array_double)); + #ifdef HIGH_PREC + cudaErrcheck(cudaMalloc((void**)&array_double, sizeof(compute_t) * nloc * sec_a.back() * 3)); + #else + cudaErrcheck(cudaMalloc((void**)&array_double, sizeof(compute_t) * nloc * sec_a.back() * 3)); + #endif + arr_dou_size = nloc * sec_a.back() * 3; + } + cudaErrcheck(cudaMemcpy(ilist, &nlist.ilist[0], sizeof(int) * nlist.ilist.size(), cudaMemcpyHostToDevice)); + cudaErrcheck(cudaMemcpy(jrange, &nlist.jrange[0], sizeof(int) * nlist.jrange.size(), cudaMemcpyHostToDevice)); + cudaErrcheck(cudaMemcpy(jlist, &nlist.jlist[0], sizeof(int) * nlist.jlist.size(), cudaMemcpyHostToDevice)); +} +#endif // USE_CUDA_TOOLKIT + #ifdef USE_CUDA_TOOLKIT void NNPInter:: @@ -533,6 +854,14 @@ init (const string & model, const int & gpu_rank) if (dfparam < 0) dfparam = 0; if (daparam < 0) daparam = 0; inited = true; + + init_nbor = false; + array_int = NULL; + array_double = NULL; + array_longlong = NULL; + ilist = 
NULL; jrange = NULL; jlist = NULL; + ilist_size = 0; jrange_size = 0; jlist_size = 0; + arr_int_size = 0; arr_ll_size = 0; arr_dou_size = 0; } #else void @@ -560,6 +889,14 @@ init (const string & model, const int & gpu_rank) // ntypes = get_ntypes(); // dfparam = get_dfparam(); inited = true; + + init_nbor = false; + array_int = NULL; + array_double = NULL; + array_longlong = NULL; + ilist = NULL; jrange = NULL; jlist = NULL; + ilist_size = 0; jrange_size = 0; jlist_size = 0; + arr_int_size = 0; arr_ll_size = 0; arr_dou_size = 0; } #endif @@ -594,6 +931,50 @@ get_scalar (const string & name) const return orc(0); } +std::string graph_info(const GraphDef & graph_def) { + // std::stringstream buffer; + // std::streambuf * old = std::cout.rdbuf(buffer.rdbuf()); + std::string str = ""; + for (int ii = 0; ii < graph_def.node_size(); ii++) { + if (graph_def.node(ii).name() == "DescrptSeA") { + // str = graph_def.node(ii).PrintDebugString(); + str = graph_def.node(ii).DebugString(); + return str; + // std::cout << str << std::endl; + } + if (graph_def.node(ii).name() == "DescrptSeR") { + // str = graph_def.node(ii).PrintDebugString(); + str = graph_def.node(ii).DebugString(); + return str; + // std::cout << str << std::endl; + } + } +} + +// init the tmp array data +std::vector NNPInter::get_sel_a () const { + std::vector sel_a; + std::istringstream is(graph_info(graph_def)); + std::string line = ""; + while(is >> line) { + if (line.find("sel_a") != line.npos) { + while (std::getline(is, line) && line != "}") { + if (line.find("i:") != line.npos) { + sel_a.push_back(atoi((line.substr(line.find("i:") + 2)).c_str())); + } + } break; + } + if (line.find("sel") != line.npos) { + while (std::getline(is, line) && line != "}") { + if (line.find("i:") != line.npos) { + sel_a.push_back(atoi((line.substr(line.find("i:") + 2)).c_str())); + } + } break; + } + } + return sel_a; +} + void NNPInter:: validate_fparam_aparam(const int & nloc, @@ -622,7 +1003,7 @@ compute (ENERGYTYPE & dener, { int nall = dcoord_.size() / 3; int nloc = nall - nghost; - NNPAtomMap nnpmap (datype_.begin(), datype_.begin() + nloc); + nnpmap = NNPAtomMap (datype_.begin(), datype_.begin() + nloc); assert (nloc == nnpmap.get_type().size()); validate_fparam_aparam(nloc, fparam, aparam); @@ -643,24 +1024,34 @@ compute (ENERGYTYPE & dener, const vector & dbox, const int nghost, const LammpsNeighborList & lmp_list, + const int & ago, const vector & fparam, const vector & aparam) { int nall = dcoord_.size() / 3; int nloc = nall - nghost; - NNPAtomMap nnpmap (datype_.begin(), datype_.begin() + nloc); - assert (nloc == nnpmap.get_type().size()); - validate_fparam_aparam(nloc, fparam, aparam); - - InternalNeighborList nlist; - convert_nlist_lmp_internal (nlist, lmp_list); - shuffle_nlist (nlist, nnpmap); - - std::vector> input_tensors; - int ret = make_input_tensors (input_tensors, dcoord_, ntypes, datype_, dbox, nlist, fparam, aparam, nnpmap, nghost); - assert (nloc == ret); - - run_model (dener, dforce_, dvirial, session, input_tensors, nnpmap, nghost); + validate_fparam_aparam(nloc, fparam, aparam); + std::vector> input_tensors; + + // agp == 0 means that the LAMMPS nbor list has been updated + if (ago == 0) { + nnpmap = NNPAtomMap (datype_.begin(), datype_.begin() + nloc); + assert (nloc == nnpmap.get_type().size()); + + // InternalNeighborList nlist; + convert_nlist_lmp_internal (nlist, lmp_list); + shuffle_nlist (nlist, nnpmap); + #ifdef USE_CUDA_TOOLKIT + update_nbor(nlist, nloc); + #endif + } + #ifdef USE_CUDA_TOOLKIT + int ret = 
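// under USE_CUDA_TOOLKIT this overload of make_input_tensors receives the raw
// ilist/jrange/jlist device pointers and the scratch buffers allocated by
// update_nbor(), so the neighbor list already copied to the GPU is reused
// whenever ago != 0 instead of being transferred again.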
make_input_tensors (input_tensors, dcoord_, ntypes, datype_, dbox, ilist, jrange, jlist, array_int, array_longlong, array_double, fparam, aparam, nnpmap, nghost); + #else + int ret = make_input_tensors (input_tensors, dcoord_, ntypes, datype_, dbox, nlist, fparam, aparam, nnpmap, nghost); + #endif + assert (nloc == ret); + run_model (dener, dforce_, dvirial, session, input_tensors, nnpmap, nghost); } @@ -677,7 +1068,7 @@ compute (ENERGYTYPE & dener, const vector & fparam, const vector & aparam) { - NNPAtomMap nnpmap (datype_.begin(), datype_.end()); + nnpmap = NNPAtomMap (datype_.begin(), datype_.end()); validate_fparam_aparam(nnpmap.get_type().size(), fparam, aparam); std::vector> input_tensors; @@ -700,24 +1091,34 @@ compute (ENERGYTYPE & dener, const vector & dbox, const int nghost, const LammpsNeighborList & lmp_list, + const int & ago, const vector & fparam, const vector & aparam) { int nall = dcoord_.size() / 3; int nloc = nall - nghost; - NNPAtomMap nnpmap (datype_.begin(), datype_.begin() + nloc); - assert (nloc == nnpmap.get_type().size()); - validate_fparam_aparam(nloc, fparam, aparam); + validate_fparam_aparam(nloc, fparam, aparam); + std::vector> input_tensors; - InternalNeighborList nlist; - convert_nlist_lmp_internal (nlist, lmp_list); - shuffle_nlist (nlist, nnpmap); + if (ago == 0) { + nnpmap = NNPAtomMap (datype_.begin(), datype_.begin() + nloc); + assert (nloc == nnpmap.get_type().size()); - std::vector> input_tensors; - int ret = make_input_tensors (input_tensors, dcoord_, ntypes, datype_, dbox, nlist, fparam, aparam, nnpmap, nghost); - assert (nloc == ret); + // InternalNeighborList nlist; + convert_nlist_lmp_internal (nlist, lmp_list); + shuffle_nlist (nlist, nnpmap); + #ifdef USE_CUDA_TOOLKIT + update_nbor(nlist, nloc); + #endif - run_model (dener, dforce_, dvirial, datom_energy_, datom_virial_, session, input_tensors, nnpmap, nghost); + } + #ifdef USE_CUDA_TOOLKIT + int ret = make_input_tensors (input_tensors, dcoord_, ntypes, datype_, dbox, ilist, jrange, jlist, array_int, array_longlong, array_double, fparam, aparam, nnpmap, nghost); + #else + int ret = make_input_tensors (input_tensors, dcoord_, ntypes, datype_, dbox, nlist, fparam, aparam, nnpmap, nghost); + #endif + assert (nloc == ret); + run_model (dener, dforce_, dvirial, datom_energy_, datom_virial_, session, input_tensors, nnpmap, nghost); } @@ -726,6 +1127,7 @@ compute (ENERGYTYPE & dener, NNPInterModelDevi:: NNPInterModelDevi () : inited (false), + init_nbor (false), numb_models (0) { get_env_nthreads(num_intra_nthreads, num_inter_nthreads); @@ -734,12 +1136,26 @@ NNPInterModelDevi () NNPInterModelDevi:: NNPInterModelDevi (const vector & models, const int & gpu_rank) : inited (false), + init_nbor(false), numb_models (0) { get_env_nthreads(num_intra_nthreads, num_inter_nthreads); init(models, gpu_rank); } +NNPInterModelDevi::~NNPInterModelDevi() { +#ifdef USE_CUDA_TOOLKIT + if (init_nbor) { + cudaErrcheck(cudaFree(ilist)); + cudaErrcheck(cudaFree(jrange)); + cudaErrcheck(cudaFree(jlist)); + cudaErrcheck(cudaFree(array_int)); + cudaErrcheck(cudaFree(array_longlong)); + cudaErrcheck(cudaFree(array_double)); + } +#endif +} + #ifdef USE_CUDA_TOOLKIT void NNPInterModelDevi:: @@ -784,6 +1200,14 @@ init (const vector & models, const int & gpu_rank) // cell_size = rcut; // ntypes = get_ntypes(); inited = true; + + init_nbor = false; + array_int = NULL; + array_double = NULL; + array_longlong = NULL; + ilist = NULL; jrange = NULL; jlist = NULL; + ilist_size = 0; jrange_size = 0; jlist_size = 0; + arr_int_size = 0; 
arr_ll_size = 0; arr_dou_size = 0; } #else void @@ -813,6 +1237,14 @@ init (const vector & models, const int & gpu_rank) // cell_size = rcut; // ntypes = get_ntypes(); inited = true; + + init_nbor = false; + array_int = NULL; + array_double = NULL; + array_longlong = NULL; + ilist = NULL; jrange = NULL; jlist = NULL; + ilist_size = 0; jrange_size = 0; jlist_size = 0; + arr_int_size = 0; arr_ll_size = 0; arr_dou_size = 0; } #endif @@ -840,6 +1272,128 @@ get_scalar(const string name) const return myrcut; } +// init the tmp array data +std::vector > +NNPInterModelDevi:: +get_sel () const +{ + std::vector > sec; + for (int ii = 0; ii < numb_models; ii++) { + std::vector sel; + std::istringstream is(graph_info(graph_defs[ii])); + std::string line = ""; + while(is >> line) { + if (line.find("sel") != line.npos) { + while (std::getline(is, line) && line != "}") { + if (line.find("i:") != line.npos) { + sel.push_back(atoi((line.substr(line.find("i:") + 2)).c_str())); + } + } break; + } + if (line.find("sel_a") != line.npos) { + while (std::getline(is, line) && line != "}") { + if (line.find("i:") != line.npos) { + sel.push_back(atoi((line.substr(line.find("i:") + 2)).c_str())); + } + } break; + } + } + sec.push_back(sel); + } + return sec; +} + +void +NNPInterModelDevi:: +cum_sum (const std::vector > n_sel) +{ + for (int ii = 0; ii < numb_models; ++ii) { + std::vector _sec; + _sec.resize (n_sel[ii].size() + 1); + _sec[0] = 0; + for (int jj = 1; jj < _sec.size(); ++jj) { + _sec[jj] = _sec[jj-1] + n_sel[ii][jj-1]; + } + sec.push_back(_sec); + } +} + +void +NNPInterModelDevi:: +get_max_sec() +{ + for (int ii = 0; ii < numb_models; ii++) { + this->max_sec_size = max_sec_size < sec[ii].size() ? sec[ii].size() : max_sec_size; + this->max_sec_back = max_sec_back < sec[ii].back() ? 
sec[ii].back() : max_sec_back; + } +} + +#ifdef USE_CUDA_TOOLKIT +void +NNPInterModelDevi:: +update_nbor(const InternalNeighborList & nlist, const int nloc) +{ + if (!init_nbor) { + cum_sum(get_sel()); + get_max_sec(); + cudaErrcheck(cudaMalloc((void**)&ilist, sizeof(int) * nlist.ilist.size())); + cudaErrcheck(cudaMalloc((void**)&jrange, sizeof(int) * nlist.jrange.size())); + cudaErrcheck(cudaMalloc((void**)&jlist, sizeof(int) * nlist.jlist.size())); + cudaErrcheck(cudaMalloc((void**)&array_int, sizeof(int) * (max_sec_size + nloc * max_sec_size + nloc))); + cudaErrcheck(cudaMalloc((void**)&array_longlong, sizeof(unsigned long long) * nloc * MAGIC_NUMBER * 2)); + #ifdef HIGH_PREC + cudaErrcheck(cudaMalloc((void**)&array_double, sizeof(compute_t) * nloc * max_sec_back * 3)); + #else + cudaErrcheck(cudaMalloc((void**)&array_double, sizeof(compute_t) * nloc * max_sec_back * 3)); + #endif + ilist_size = nlist.ilist.size(); + jrange_size = nlist.jrange.size(); + jlist_size = nlist.jlist.size(); + arr_int_size = max_sec_size + nloc * max_sec_size + nloc; + arr_ll_size = nloc * MAGIC_NUMBER * 2; + arr_dou_size = nloc * max_sec_back * 3; + init_nbor = true; + } + if (ilist_size < nlist.ilist.size()) { + cudaErrcheck(cudaFree(ilist)); + cudaErrcheck(cudaMalloc((void**)&ilist, sizeof(int) * nlist.ilist.size())); + ilist_size = nlist.ilist.size(); + } + if (jrange_size < nlist.jrange.size()) { + cudaErrcheck(cudaFree(jrange)); + cudaErrcheck(cudaMalloc((void**)&jrange, sizeof(int) * nlist.jrange.size())); + jrange_size = nlist.jrange.size(); + } + if (jlist_size < nlist.jlist.size()) { + cudaErrcheck(cudaFree(jlist)); + cudaErrcheck(cudaMalloc((void**)&jlist, sizeof(int) * nlist.jlist.size())); + jlist_size = nlist.jlist.size(); + } + if (arr_int_size < max_sec_size + nloc * max_sec_size + nloc) { + cudaErrcheck(cudaFree(array_int)); + cudaErrcheck(cudaMalloc((void**)&array_int, sizeof(int) * (max_sec_size + nloc * max_sec_size + nloc))); + arr_int_size = max_sec_size + nloc * max_sec_size + nloc; + } + if (arr_ll_size < nloc * MAGIC_NUMBER * 2) { + cudaErrcheck(cudaFree(array_longlong)); + cudaErrcheck(cudaMalloc((void**)&array_longlong, sizeof(unsigned long long) * nloc * MAGIC_NUMBER * 2)); + arr_ll_size = nloc * MAGIC_NUMBER * 2; + } + if (arr_dou_size < nloc * max_sec_back * 3) { + cudaErrcheck(cudaFree(array_double)); + #ifdef HIGH_PREC + cudaErrcheck(cudaMalloc((void**)&array_double, sizeof(compute_t) * nloc * max_sec_back * 3)); + #else + cudaErrcheck(cudaMalloc((void**)&array_double, sizeof(compute_t) * nloc * max_sec_back * 3)); + #endif + arr_dou_size = nloc * max_sec_back * 3; + } + cudaErrcheck(cudaMemcpy(ilist, &nlist.ilist[0], sizeof(int) * nlist.ilist.size(), cudaMemcpyHostToDevice)); + cudaErrcheck(cudaMemcpy(jrange, &nlist.jrange[0], sizeof(int) * nlist.jrange.size(), cudaMemcpyHostToDevice)); + cudaErrcheck(cudaMemcpy(jlist, &nlist.jlist[0], sizeof(int) * nlist.jlist.size(), cudaMemcpyHostToDevice)); +} +#endif //USE_CUDA_TOOLKIT + void NNPInterModelDevi:: validate_fparam_aparam(const int & nloc, @@ -868,7 +1422,7 @@ compute (ENERGYTYPE & dener, { if (numb_models == 0) return; - NNPAtomMap nnpmap (datype_.begin(), datype_.end()); + nnpmap = NNPAtomMap (datype_.begin(), datype_.end()); validate_fparam_aparam(nnpmap.get_type().size(), fparam, aparam); std::vector> input_tensors; @@ -911,31 +1465,42 @@ compute (vector & all_energy, const vector & dbox, const int nghost, const LammpsNeighborList & lmp_list, + const int & ago, const vector & fparam, const vector & aparam) { if 
(numb_models == 0) return; int nall = dcoord_.size() / 3; int nloc = nall - nghost; - NNPAtomMap nnpmap (datype_.begin(), datype_.begin() + nloc); - assert (nloc == nnpmap.get_type().size()); validate_fparam_aparam(nloc, fparam, aparam); - - InternalNeighborList nlist; - convert_nlist_lmp_internal (nlist, lmp_list); - shuffle_nlist (nlist, nnpmap); - std::vector> input_tensors; - int ret = make_input_tensors (input_tensors, dcoord_, ntypes, datype_, dbox, nlist, fparam, aparam, nnpmap, nghost); - assert (nloc == ret); - all_energy.resize (numb_models); - all_force.resize (numb_models); - all_virial.resize (numb_models); + // agp == 0 means that the LAMMPS nbor list has been updated + if (ago == 0) { + nnpmap = NNPAtomMap (datype_.begin(), datype_.begin() + nloc); + assert (nloc == nnpmap.get_type().size()); - for (unsigned ii = 0; ii < numb_models; ++ii){ - run_model (all_energy[ii], all_force[ii], all_virial[ii], sessions[ii], input_tensors, nnpmap, nghost); - } + // InternalNeighborList nlist; + convert_nlist_lmp_internal (nlist, lmp_list); + shuffle_nlist (nlist, nnpmap); + #ifdef USE_CUDA_TOOLKIT + update_nbor(nlist, nloc); + #endif + + } + #ifdef USE_CUDA_TOOLKIT + int ret = make_input_tensors (input_tensors, dcoord_, ntypes, datype_, dbox, ilist, jrange, jlist, array_int, array_longlong, array_double, fparam, aparam, nnpmap, nghost); + #else + int ret = make_input_tensors (input_tensors, dcoord_, ntypes, datype_, dbox, nlist, fparam, aparam, nnpmap, nghost); + #endif + + all_energy.resize (numb_models); + all_force.resize (numb_models); + all_virial.resize (numb_models); + assert (nloc == ret); + for (unsigned ii = 0; ii < numb_models; ++ii) { + run_model (all_energy[ii], all_force[ii], all_virial[ii], sessions[ii], input_tensors, nnpmap, nghost); + } } void @@ -950,33 +1515,44 @@ compute (vector & all_energy, const vector & dbox, const int nghost, const LammpsNeighborList & lmp_list, + const int & ago, const vector & fparam, const vector & aparam) { if (numb_models == 0) return; int nall = dcoord_.size() / 3; int nloc = nall - nghost; - NNPAtomMap nnpmap (datype_.begin(), datype_.begin() + nloc); - assert (nloc == nnpmap.get_type().size()); validate_fparam_aparam(nloc, fparam, aparam); - - InternalNeighborList nlist; - convert_nlist_lmp_internal (nlist, lmp_list); - shuffle_nlist (nlist, nnpmap); - std::vector> input_tensors; - int ret = make_input_tensors (input_tensors, dcoord_, ntypes, datype_, dbox, nlist, fparam, aparam, nnpmap, nghost); - assert (nloc == ret); - - all_energy.resize (numb_models); - all_force .resize (numb_models); - all_virial.resize (numb_models); - all_atom_energy.resize (numb_models); - all_atom_virial.resize (numb_models); - for (unsigned ii = 0; ii < numb_models; ++ii){ - run_model (all_energy[ii], all_force[ii], all_virial[ii], all_atom_energy[ii], all_atom_virial[ii], sessions[ii], input_tensors, nnpmap, nghost); - } + // agp == 0 means that the LAMMPS nbor list has been updated + if (ago == 0) { + nnpmap = NNPAtomMap (datype_.begin(), datype_.begin() + nloc); + assert (nloc == nnpmap.get_type().size()); + + // InternalNeighborList nlist; + convert_nlist_lmp_internal (nlist, lmp_list); + shuffle_nlist (nlist, nnpmap); + #ifdef USE_CUDA_TOOLKIT + update_nbor(nlist, nloc); + #endif + + } + #ifdef USE_CUDA_TOOLKIT + int ret = make_input_tensors (input_tensors, dcoord_, ntypes, datype_, dbox, ilist, jrange, jlist, array_int, array_longlong, array_double, fparam, aparam, nnpmap, nghost); + #else + int ret = make_input_tensors (input_tensors, dcoord_, ntypes, 
datype_, dbox, nlist, fparam, aparam, nnpmap, nghost); + #endif + + all_energy.resize (numb_models); + all_force .resize (numb_models); + all_virial.resize (numb_models); + all_atom_energy.resize (numb_models); + all_atom_virial.resize (numb_models); + assert (nloc == ret); + for (unsigned ii = 0; ii < numb_models; ++ii) { + run_model (all_energy[ii], all_force[ii], all_virial[ii], all_atom_energy[ii], all_atom_virial[ii], sessions[ii], input_tensors, nnpmap, nghost); + } } void @@ -1114,3 +1690,22 @@ compute_std_f (vector & std, } } +void +NNPInterModelDevi:: +compute_relative_std_f (vector &std, + const vector &avg, + const VALUETYPE eps) +{ + unsigned nloc = std.size(); + for (unsigned ii = 0; ii < nloc; ++ii){ + const VALUETYPE * tmp_avg = &(avg[ii*3]); + VALUETYPE vdiff[3]; + vdiff[0] = tmp_avg[0]; + vdiff[1] = tmp_avg[1]; + vdiff[2] = tmp_avg[2]; + VALUETYPE f_norm = sqrt(MathUtilities::dot(vdiff, vdiff)); + // relative std = std/(abs(f)+eps) + std[ii] /= f_norm + eps; + } +} + diff --git a/source/lmp/env.sh.in b/source/lmp/env.sh.in index 9b7676fcd9..2a9253ba70 100644 --- a/source/lmp/env.sh.in +++ b/source/lmp/env.sh.in @@ -8,4 +8,4 @@ TF_RPATH=`echo $TENSORFLOW_LIBRARY_PATH | sed "s/;/ -Wl,-rpath=/g"` NNP_INC=" -std=c++11 @PREC_DEF@ @TTM_DEF@ -I$TF_INCLUDE_DIRS -I$DEEPMD_ROOT/include/deepmd " NNP_PATH=" -L$TF_LIBRARY_PATH -L$DEEPMD_ROOT/lib" -NNP_LIB=" -Wl,--no-as-needed -l@LIB_DEEPMD_OP@ -l@LIB_DEEPMD@ -ltensorflow_cc -ltensorflow_framework -Wl,-rpath=$TF_RPATH -Wl,-rpath=$DEEPMD_ROOT/lib" +NNP_LIB=" -Wl,--no-as-needed -l@LIB_DEEPMD_OP@ -l@LIB_DEEPMD_OP_CUDA@ -l@LIB_DEEPMD@ -ltensorflow_cc -ltensorflow_framework -Wl,-rpath=$TF_RPATH -Wl,-rpath=$DEEPMD_ROOT/lib" diff --git a/source/lmp/pair_nnp.cpp b/source/lmp/pair_nnp.cpp index 372adcba77..0b74dd38eb 100644 --- a/source/lmp/pair_nnp.cpp +++ b/source/lmp/pair_nnp.cpp @@ -1,4 +1,5 @@ #include +#include #include #include #include "atom.h" @@ -42,6 +43,7 @@ static int stringCmp(const void *a, const void* b) int PairNNP::get_node_rank() { char host_name[MPI_MAX_PROCESSOR_NAME]; + memset(host_name, '\0', sizeof(char) * MPI_MAX_PROCESSOR_NAME); char (*host_names)[MPI_MAX_PROCESSOR_NAME]; int n, namelen, color, rank, nprocs, myrank; size_t bytes; @@ -53,6 +55,10 @@ int PairNNP::get_node_rank() { bytes = nprocs * sizeof(char[MPI_MAX_PROCESSOR_NAME]); host_names = (char (*)[MPI_MAX_PROCESSOR_NAME]) malloc(bytes); + for (int ii = 0; ii < nprocs; ii++) { + memset(host_names[ii], '\0', sizeof(char) * MPI_MAX_PROCESSOR_NAME); + } + strcpy(host_names[rank], host_name); for (n=0; n 1 ? 0 : neighbor->ago; + int ago = neighbor->ago; + if (numb_models > 1) { + if (multi_models_no_mod_devi && (out_freq > 0 && update->ntimestep % out_freq == 0)) { + ago = 0; + } + else if (multi_models_mod_devi && (out_freq == 0 || update->ntimestep % out_freq != 0)) { + ago = 0; + } + } // compute - bool single_model = (numb_models == 1); - bool multi_models_no_mod_devi = (numb_models > 1 && (out_freq == 0 || update->ntimestep % out_freq != 0)); - bool multi_models_mod_devi = (numb_models > 1 && (out_freq > 0 && update->ntimestep % out_freq == 0)); + single_model = (numb_models == 1); + multi_models_no_mod_devi = (numb_models > 1 && (out_freq == 0 || update->ntimestep % out_freq != 0)); + multi_models_mod_devi = (numb_models > 1 && (out_freq > 0 && update->ntimestep % out_freq == 0)); if (do_ghost) { LammpsNeighborList lmp_list (list->inum, list->ilist, list->numneigh, list->firstneigh); if (single_model || multi_models_no_mod_devi) { if ( ! 
(eflag_atom || vflag_atom) ) { #ifdef HIGH_PREC - nnp_inter.compute (dener, dforce, dvirial, dcoord, dtype, dbox, nghost, lmp_list, fparam, daparam); + nnp_inter.compute (dener, dforce, dvirial, dcoord, dtype, dbox, nghost, lmp_list, ago, fparam, daparam); #else vector dcoord_(dcoord.size()); vector dbox_(dbox.size()); @@ -306,7 +324,7 @@ void PairNNP::compute(int eflag, int vflag) vector dforce_(dforce.size(), 0); vector dvirial_(dvirial.size(), 0); double dener_ = 0; - nnp_inter.compute (dener_, dforce_, dvirial_, dcoord_, dtype, dbox_, nghost, lmp_list, fparam, daparam); + nnp_inter.compute (dener_, dforce_, dvirial_, dcoord_, dtype, dbox_, nghost, lmp_list, ago, fparam, daparam); for (unsigned dd = 0; dd < dforce.size(); ++dd) dforce[dd] = dforce_[dd]; for (unsigned dd = 0; dd < dvirial.size(); ++dd) dvirial[dd] = dvirial_[dd]; dener = dener_; @@ -317,7 +335,7 @@ void PairNNP::compute(int eflag, int vflag) vector deatom (nall * 1, 0); vector dvatom (nall * 9, 0); #ifdef HIGH_PREC - nnp_inter.compute (dener, dforce, dvirial, deatom, dvatom, dcoord, dtype, dbox, nghost, lmp_list, fparam, daparam); + nnp_inter.compute (dener, dforce, dvirial, deatom, dvatom, dcoord, dtype, dbox, nghost, lmp_list, ago, fparam, daparam); #else vector dcoord_(dcoord.size()); vector dbox_(dbox.size()); @@ -328,7 +346,7 @@ void PairNNP::compute(int eflag, int vflag) vector deatom_(dforce.size(), 0); vector dvatom_(dforce.size(), 0); double dener_ = 0; - nnp_inter.compute (dener_, dforce_, dvirial_, deatom_, dvatom_, dcoord_, dtype, dbox_, nghost, lmp_list, fparam, daparam); + nnp_inter.compute (dener_, dforce_, dvirial_, deatom_, dvatom_, dcoord_, dtype, dbox_, nghost, lmp_list, ago, fparam, daparam); for (unsigned dd = 0; dd < dforce.size(); ++dd) dforce[dd] = dforce_[dd]; for (unsigned dd = 0; dd < dvirial.size(); ++dd) dvirial[dd] = dvirial_[dd]; for (unsigned dd = 0; dd < deatom.size(); ++dd) deatom[dd] = deatom_[dd]; @@ -358,7 +376,7 @@ void PairNNP::compute(int eflag, int vflag) vector> all_virial; vector> all_atom_energy; vector> all_atom_virial; - nnp_inter_model_devi.compute(all_energy, all_force, all_virial, all_atom_energy, all_atom_virial, dcoord, dtype, dbox, nghost, lmp_list, fparam, daparam); + nnp_inter_model_devi.compute(all_energy, all_force, all_virial, all_atom_energy, all_atom_virial, dcoord, dtype, dbox, nghost, lmp_list, ago, fparam, daparam); // nnp_inter_model_devi.compute_avg (dener, all_energy); // nnp_inter_model_devi.compute_avg (dforce, all_force); // nnp_inter_model_devi.compute_avg (dvirial, all_virial); @@ -384,7 +402,7 @@ void PairNNP::compute(int eflag, int vflag) vector> all_virial_; vector> all_atom_energy_; vector> all_atom_virial_; - nnp_inter_model_devi.compute(all_energy_, all_force_, all_virial_, all_atom_energy_, all_atom_virial_, dcoord_, dtype, dbox_, nghost, lmp_list, fparam, daparam); + nnp_inter_model_devi.compute(all_energy_, all_force_, all_virial_, all_atom_energy_, all_atom_virial_, dcoord_, dtype, dbox_, nghost, lmp_list, ago, fparam, daparam); // nnp_inter_model_devi.compute_avg (dener_, all_energy_); // nnp_inter_model_devi.compute_avg (dforce_, all_force_); // nnp_inter_model_devi.compute_avg (dvirial_, all_virial_); @@ -432,6 +450,9 @@ void PairNNP::compute(int eflag, int vflag) vector tmp_avg_f; nnp_inter_model_devi.compute_avg (tmp_avg_f, all_force); nnp_inter_model_devi.compute_std_f (std_f, tmp_avg_f, all_force); + if (out_rel == 1){ + nnp_inter_model_devi.compute_relative_std_f (std_f, tmp_avg_f, eps); + } #else vector tmp_avg_f_, std_f_; for 
(unsigned ii = 0; ii < all_force_.size(); ++ii){ @@ -443,6 +464,9 @@ void PairNNP::compute(int eflag, int vflag) nnp_inter_model_devi.compute_std_f (std_f_, tmp_avg_f_, all_force_); std_f.resize(std_f_.size()); for (int dd = 0; dd < std_f_.size(); ++dd) std_f[dd] = std_f_[dd]; + if (out_rel == 1){ + nnp_inter_model_devi.compute_relative_std_f (std_f_, tmp_avg_f_, eps); + } #endif double min = numeric_limits::max(), max = 0, avg = 0; ana_st(max, min, avg, std_f, nlocal); @@ -496,16 +520,7 @@ void PairNNP::compute(int eflag, int vflag) // 1. If the atom_style is not atomic (e.g. charge), the order of std_f is different from that of atom ids. // 2. std_f is not gathered by MPI. for (int dd = 0; dd < all_nlocal; ++dd) { - if (out_rel == 1){ - // relative std = std/(abs(f)+1) -#ifdef HIGH_PREC - fp << " " << setw(18) << std_f[dd] / (fabs(tmp_avg_f[dd]) + eps); -#else - fp << " " << setw(18) << std_f[dd] / (fabsf(tmp_avg_f_[dd]) + eps); -#endif - } else { fp << " " << setw(18) << std_f[dd]; - } } } fp << endl; diff --git a/source/lmp/pair_nnp.h.in b/source/lmp/pair_nnp.h.in index fbc71a4e6f..d41546438c 100644 --- a/source/lmp/pair_nnp.h.in +++ b/source/lmp/pair_nnp.h.in @@ -82,6 +82,9 @@ private: int dim_aparam; int out_each; int out_rel; + bool single_model; + bool multi_models_mod_devi; + bool multi_models_no_mod_devi; #ifdef HIGH_PREC vector fparam; vector aparam; diff --git a/source/op/CMakeLists.txt b/source/op/CMakeLists.txt index 58051ff45b..6fdc02cfd6 100644 --- a/source/op/CMakeLists.txt +++ b/source/op/CMakeLists.txt @@ -4,12 +4,24 @@ set(OP_LIB ${PROJECT_SOURCE_DIR}/lib/src/SimulationRegion.cpp ${PROJECT_SOURCE_D set (OP_CXX_FLAG -D_GLIBCXX_USE_CXX11_ABI=${OP_CXX_ABI} ) file(GLOB OP_SRC prod_force.cc prod_virial.cc descrpt.cc descrpt_se_a.cc descrpt_se_r.cc tab_inter.cc prod_force_se_a.cc prod_virial_se_a.cc prod_force_se_r.cc prod_virial_se_r.cc soft_min.cc soft_min_force.cc soft_min_virial.cc ) +file(GLOB OP_CUDA_SRC prod_force.cc prod_virial.cc descrpt.cc descrpt_se_a_gpu.cc descrpt_se_r_gpu.cc tab_inter.cc prod_force_se_a_gpu.cc prod_virial_se_a_gpu.cc prod_force_se_r_gpu.cc prod_virial_se_r_gpu.cc soft_min.cc soft_min_force.cc soft_min_virial.cc ) file(GLOB OP_GRADS_SRC prod_force_grad.cc prod_force_se_a_grad.cc prod_force_se_r_grad.cc prod_virial_grad.cc prod_virial_se_a_grad.cc prod_virial_se_r_grad.cc soft_min_force_grad.cc soft_min_virial_grad.cc ) file(GLOB OP_PY *.py) -if (BUILD_CPP_IF) - add_library(${LIB_DEEPMD_OP} SHARED ${OP_SRC}) +if (BUILD_CPP_IF) + if (USE_CUDA_TOOLKIT) + add_library(${LIB_DEEPMD_OP} SHARED ${OP_CUDA_SRC}) + add_subdirectory(cuda) + find_package(CUDA REQUIRED) + include_directories(${CUDA_INCLUDE_DIRS}) + set (EXTRA_LIBS ${EXTRA_LIBS} deepmd_op_cuda) + target_link_libraries (${LIB_DEEPMD_OP} ${EXTRA_LIBS}) + target_link_libraries (${LIB_DEEPMD_OP} ${CUDA_LIBRARIES}) + else (USE_CUDA_TOOLKIT) + add_library(${LIB_DEEPMD_OP} SHARED ${OP_SRC}) + endif (USE_CUDA_TOOLKIT) endif (BUILD_CPP_IF) + if (BUILD_PY_IF) add_library(op_abi SHARED ${OP_SRC} ${OP_LIB}) add_library(op_grads SHARED ${OP_GRADS_SRC}) diff --git a/source/op/cuda/CMakeLists.txt b/source/op/cuda/CMakeLists.txt new file mode 100644 index 0000000000..a9847abd3e --- /dev/null +++ b/source/op/cuda/CMakeLists.txt @@ -0,0 +1,32 @@ +# required cmake version +cmake_minimum_required(VERSION 3.0) +# project name +project(deepmd_op_cuda) + +# SET(CUDA_SEPARABLE_COMPILATION ON) +find_package(CUDA REQUIRED) +if (NOT CUDA_FOUND) + message(STATUS "CUDA not found. 
Project will not be built.") +endif(NOT CUDA_FOUND) + +# set c++ version c++11 +SET(CMAKE_CXX_STANDARD 11) +SET(CMAKE_CUDA_STANDARD 11) +# nvcc -o libdeepmd_op_cuda.so -I/usr/local/cub-1.8.0 -rdc=true -DHIGH_PREC=true -gencode arch=compute_61,code=sm_61 -shared -Xcompiler -fPIC deepmd_op.cu -L/usr/local/cuda/lib64 -lcudadevrt +# very important here! Include path to cub. +include_directories(cub) +# nvcc flags +set(CUDA_NVCC_FLAGS -gencode arch=compute_60,code=sm_60; # Pascal – GP100/Tesla P100 – DGX-1 (Generic Pascal) + -gencode arch=compute_61,code=sm_61; # Pascal - GTX 1080, GTX 1070, GTX 1060, GTX 1050, GTX 1030, Titan Xp, Tesla P40, Tesla P4, Discrete GPU on the NVIDIA Drive PX2 + -gencode arch=compute_70,code=sm_70; # Volta - GV100/Tesla V100, GTX 1180 (GV104) + -gencode arch=compute_75,code=sm_75; # Turing - RTX 2080, Titan RTX, Quadro R8000 + -O3; -Xcompiler -fPIC; + ) + +set (SOURCE_FILES + descrpt_se_a.cu descrpt_se_r.cu prod_force_se_a.cu prod_force_se_r.cu prod_virial_se_a.cu prod_virial_se_r.cu +) + +cuda_add_library(deepmd_op_cuda SHARED ${SOURCE_FILES}) + +install(TARGETS deepmd_op_cuda DESTINATION lib/) diff --git a/source/op/cuda/cub b/source/op/cuda/cub new file mode 160000 index 0000000000..c3cceac115 --- /dev/null +++ b/source/op/cuda/cub @@ -0,0 +1 @@ +Subproject commit c3cceac115c072fb63df1836ff46d8c60d9eb304 diff --git a/source/op/cuda/descrpt_se_a.cu b/source/op/cuda/descrpt_se_a.cu new file mode 100644 index 0000000000..8893e8a00a --- /dev/null +++ b/source/op/cuda/descrpt_se_a.cu @@ -0,0 +1,382 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#define EIGEN_USE_GPU +#include +#include +#include +#include +#include +#include +#include + +#define MAGIC_NUMBER 256 + +#ifdef HIGH_PREC + typedef double VALUETYPE; +#else + typedef float VALUETYPE; +#endif + +typedef double compute_t; + +typedef unsigned long long int_64; + +#define cudaErrcheck(res) { cudaAssert((res), __FILE__, __LINE__); } +inline void cudaAssert(cudaError_t code, const char *file, int line, bool abort=true) +{ + if (code != cudaSuccess) + { + fprintf(stderr,"cuda assert: %s %s %d\n", cudaGetErrorString(code), file, line); + if (abort) exit(code); + } +} + +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 600 +static __inline__ __device__ double atomicAdd(double* address, double val) { + unsigned long long int* address_as_ull = (unsigned long long int*)address; + unsigned long long int old = *address_as_ull, assumed; + do { + assumed = old; + old = atomicCAS(address_as_ull, assumed, + __double_as_longlong(val + __longlong_as_double(assumed))); + // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) } while (assumed != old); + } while (assumed != old); + return __longlong_as_double(old); +} +#endif + +template < + typename Key, + int BLOCK_THREADS, + int ITEMS_PER_THREAD> +__launch_bounds__ (BLOCK_THREADS) +__global__ void BlockSortKernel( + Key * d_in, + Key * d_out) // Tile of output +{ + enum { TILE_SIZE = BLOCK_THREADS * ITEMS_PER_THREAD }; + // Specialize BlockLoad type for our thread block (uses warp-striped loads for coalescing, then transposes in shared memory to a blocked arrangement) + typedef cub::BlockLoad BlockLoadT; + // Specialize BlockRadixSort type for our thread block + typedef cub::BlockRadixSort BlockRadixSortT; + // Shared memory + __shared__ union TempStorage + { + typename BlockLoadT::TempStorage load; + typename BlockRadixSortT::TempStorage sort; + } temp_storage; + // Per-thread tile items + Key items[ITEMS_PER_THREAD]; + // Our current block's offset + int block_offset = blockIdx.x * TILE_SIZE; + // Load items into a blocked arrangement + BlockLoadT(temp_storage.load).Load(d_in + block_offset, items); + // Barrier for smem reuse + __syncthreads(); + // Sort keys + BlockRadixSortT(temp_storage.sort).SortBlockedToStriped(items); + // Store output in striped fashion + cub::StoreDirectStriped(threadIdx.x, d_out + block_offset, items); +} + +template +__device__ inline T dev_dot(T * arr1, T * arr2) { + return arr1[0] * arr2[0] + arr1[1] * arr2[1] + arr1[2] * arr2[2]; +} + +__device__ inline void spline5_switch(compute_t & vv, + compute_t & dd, + compute_t & xx, + const compute_t & rmin, + const compute_t & rmax) +{ + if (xx < rmin) { + dd = 0; + vv = 1; + } + else if (xx < rmax) { + compute_t uu = (xx - rmin) / (rmax - rmin) ; + compute_t du = 1. 
/ (rmax - rmin) ; + vv = uu*uu*uu * (-6 * uu*uu + 15 * uu - 10) + 1; + dd = ( 3 * uu*uu * (-6 * uu*uu + 15 * uu - 10) + uu*uu*uu * (-12 * uu + 15) ) * du; + } + else { + dd = 0; + vv = 0; + } +} + +__global__ void get_i_idx_se_a(const int nloc, + const int * ilist, + int * i_idx) +{ + const unsigned int idy = blockIdx.x * blockDim.x + threadIdx.x; + if(idy >= nloc) { + return; + } + i_idx[ilist[idy]] = idy; +} + +__global__ void format_nlist_fill_a_se_a(const VALUETYPE * coord, + const int * type, + const int * jrange, + const int * jlist, + const float rcut, + int_64 * key, + int * i_idx) +{ + // <<>> + const unsigned int idx = blockIdx.x; + const unsigned int idy = threadIdx.x; + + const int nsize = jrange[i_idx[idx] + 1] - jrange[i_idx[idx]]; + if (idy >= nsize) { + return; + } + + const int * nei_idx = jlist + jrange[i_idx[idx]]; + // dev_copy(nei_idx, &jlist[jrange[i_idx]], nsize); + + int_64 * key_in = key + idx * MAGIC_NUMBER; + + compute_t diff[3]; + const int & j_idx = nei_idx[idy]; + for (int dd = 0; dd < 3; dd++) { + diff[dd] = coord[j_idx * 3 + dd] - coord[idx * 3 + dd]; + } + compute_t rr = sqrt(dev_dot(diff, diff)); + if (rr <= rcut) { + key_in[idy] = type[j_idx] * 1E15+ (int_64)(rr * 1.0E13) / 100000 * 100000 + j_idx; + } +} + + // bubble_sort(sel_nei, num_nei); +__global__ void format_nlist_fill_b_se_a(int * nlist, + const int nlist_size, + const int nloc, + const int * jrange, + const int * jlist, + int_64 * key, + const int * sec_a, + const int sec_a_size, + int * nei_iter_dev) +{ + + const unsigned int idy = blockIdx.x * blockDim.x + threadIdx.x; + + if(idy >= nloc) { + return; + } + + int * row_nlist = nlist + idy * nlist_size; + int * nei_iter = nei_iter_dev + idy * sec_a_size; + int_64 * key_out = key + nloc * MAGIC_NUMBER + idy * MAGIC_NUMBER; + + for (int ii = 0; ii < sec_a_size; ii++) { + nei_iter[ii] = sec_a[ii]; + } + + for (unsigned int kk = 0; key_out[kk] != key_out[MAGIC_NUMBER - 1]; kk++) { + const int & nei_type = key_out[kk] / 1E15; + if (nei_iter[nei_type] < sec_a[nei_type + 1]) { + row_nlist[nei_iter[nei_type]++] = key_out[kk] % 100000; + } + } +} +//it's ok! 
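The two kernels above communicate through the 64-bit keys held in `array_longlong`: `format_nlist_fill_a_se_a` packs the neighbor type, the truncated distance and the neighbor index into one sortable integer, and `format_nlist_fill_b_se_a` recovers the type and the index again after the block radix sort. Below is a minimal host-side sketch of that layout; it is not part of the patch, and the constants 1e15, 1e13 and 100000 are simply copied from the kernels.

```cpp
// Sketch of the assumed key layout used by the two kernels:
//   key = type * 1e15 + (int_64)(r * 1e13) / 100000 * 100000 + j_idx
// decimal digits 15+  : neighbor type
// decimal digits 5-14 : truncated distance (so keys sort by type, then distance)
// decimal digits 0-4  : neighbor index (requires j_idx < 100000)
#include <cstdio>

typedef unsigned long long int_64;

static int_64 encode_key(int type, double rr, int j_idx) {
  return (int_64)type * 1000000000000000ULL
       + (int_64)(rr * 1.0E13) / 100000 * 100000
       + (int_64)j_idx;
}

int main() {
  int_64 key = encode_key(/*type*/ 1, /*distance*/ 3.25, /*index*/ 42);
  int nei_type = key / 1000000000000000ULL;  // same decode as format_nlist_fill_b_se_a
  int j_idx    = key % 100000;
  std::printf("key = %llu -> type %d, neighbor index %d\n", key, nei_type, j_idx);
  return 0;
}
```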
+ +__global__ void compute_descriptor_se_a (VALUETYPE* descript, + const int ndescrpt, + VALUETYPE* descript_deriv, + const int descript_deriv_size, + VALUETYPE* rij, + const int rij_size, + const int* type, + const VALUETYPE* avg, + const VALUETYPE* std, + int* nlist, + const int nlist_size, + const VALUETYPE* coord, + const VALUETYPE rmin, + const VALUETYPE rmax, + compute_t* sel_a_diff_dev) +{ + // <<>> + const unsigned int idx = blockIdx.x; + const unsigned int idy = threadIdx.x; + const int idx_deriv = idy * 4 * 3; // 4 components time 3 directions + const int idx_value = idy * 4; // 4 components + + // else {return;} + VALUETYPE * row_descript = descript + idx * ndescrpt; + VALUETYPE * row_descript_deriv = descript_deriv + idx * descript_deriv_size; + VALUETYPE * row_rij = rij + idx * rij_size; + compute_t * sel_a_diff = sel_a_diff_dev + idx * nlist_size * 3; + int * row_nlist = nlist + idx * nlist_size; + + if (row_nlist[idy] >= 0) { + const int & j_idx = row_nlist[idy]; + for (int kk = 0; kk < 3; kk++) { + sel_a_diff[idy * 3 + kk] = coord[j_idx * 3 + kk] - coord[idx * 3 + kk]; + row_rij[idy * 3 + kk] = sel_a_diff[idy * 3 + kk]; + } + const compute_t * rr = &sel_a_diff[idy * 3 + 0]; + compute_t nr2 = dev_dot(rr, rr); + compute_t inr = 1./sqrt(nr2); + compute_t nr = nr2 * inr; + compute_t inr2 = inr * inr; + compute_t inr4 = inr2 * inr2; + compute_t inr3 = inr4 * nr; + compute_t sw, dsw; + spline5_switch(sw, dsw, nr, rmin, rmax); + row_descript[idx_value + 0] = (1./nr) ;//* sw; + row_descript[idx_value + 1] = (rr[0] / nr2) ;//* sw; + row_descript[idx_value + 2] = (rr[1] / nr2) ;//* sw; + row_descript[idx_value + 3] = (rr[2] / nr2) ;//* sw; + + row_descript_deriv[idx_deriv + 0] = (rr[0] * inr3 * sw - row_descript[idx_value + 0] * dsw * rr[0] * inr); // avg[type[(idx_deriv + 0) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 0) % (ndescrpt * 3)) / 3]; + row_descript_deriv[idx_deriv + 1] = (rr[1] * inr3 * sw - row_descript[idx_value + 0] * dsw * rr[1] * inr); // avg[type[(idx_deriv + 1) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 1) % (ndescrpt * 3)) / 3]; + row_descript_deriv[idx_deriv + 2] = (rr[2] * inr3 * sw - row_descript[idx_value + 0] * dsw * rr[2] * inr); // avg[type[(idx_deriv + 2) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 2) % (ndescrpt * 3)) / 3]; + // ****deriv of component x/r2 + row_descript_deriv[idx_deriv + 3] = ((2. * rr[0] * rr[0] * inr4 - inr2) * sw - row_descript[idx_value + 1] * dsw * rr[0] * inr); // avg[type[(idx_deriv + 3) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 3) % (ndescrpt * 3)) / 3]; + row_descript_deriv[idx_deriv + 4] = ((2. * rr[0] * rr[1] * inr4 ) * sw - row_descript[idx_value + 1] * dsw * rr[1] * inr); // avg[type[(idx_deriv + 4) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 4) % (ndescrpt * 3)) / 3]; + row_descript_deriv[idx_deriv + 5] = ((2. * rr[0] * rr[2] * inr4 ) * sw - row_descript[idx_value + 1] * dsw * rr[2] * inr); // avg[type[(idx_deriv + 5) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 5) % (ndescrpt * 3)) / 3]; + // ***deriv of component y/r2 + row_descript_deriv[idx_deriv + 6] = ((2. * rr[1] * rr[0] * inr4 ) * sw - row_descript[idx_value + 2] * dsw * rr[0] * inr); // avg[type[(idx_deriv + 6) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 6) % (ndescrpt * 3)) / 3]; + row_descript_deriv[idx_deriv + 7] = ((2. * rr[1] * rr[1] * inr4 - inr2) * sw - row_descript[idx_value + 2] * dsw * rr[1] * inr); // avg[type[(idx_deriv + 7) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 7) % (ndescrpt * 3)) / 3]; + row_descript_deriv[idx_deriv + 8] = ((2. 
* rr[1] * rr[2] * inr4 ) * sw - row_descript[idx_value + 2] * dsw * rr[2] * inr); // avg[type[(idx_deriv + 8) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 8) % (ndescrpt * 3)) / 3]; + // ***deriv of component z/r2 + row_descript_deriv[idx_deriv + 9] = ((2. * rr[2] * rr[0] * inr4 ) * sw - row_descript[idx_value + 3] * dsw * rr[0] * inr); // avg[type[(idx_deriv + 9) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 9) % (ndescrpt * 3)) / 3]; + row_descript_deriv[idx_deriv +10] = ((2. * rr[2] * rr[1] * inr4 ) * sw - row_descript[idx_value + 3] * dsw * rr[1] * inr); // avg[type[(idx_deriv + 10) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 10) % (ndescrpt * 3)) / 3]; + row_descript_deriv[idx_deriv +11] = ((2. * rr[2] * rr[2] * inr4 - inr2) * sw - row_descript[idx_value + 3] * dsw * rr[2] * inr); // avg[type[(idx_deriv + 11) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 11) % (ndescrpt * 3)) / 3]; + // 4 value components + row_descript[idx_value + 0] *= sw; // * descript[idx * ndescrpt + idx_value + 0]);// - avg[type[idx] * ndescrpt + idx_value + 0]) / std[type[idx] * ndescrpt + idx_value + 0]; + row_descript[idx_value + 1] *= sw; // * descript[idx * ndescrpt + idx_value + 1]);// - avg[type[idx] * ndescrpt + idx_value + 1]) / std[type[idx] * ndescrpt + idx_value + 1]; + row_descript[idx_value + 2] *= sw; // * descript[idx * ndescrpt + idx_value + 2]);// - avg[type[idx] * ndescrpt + idx_value + 2]) / std[type[idx] * ndescrpt + idx_value + 2]; + row_descript[idx_value + 3] *= sw; // * descript[idx * ndescrpt + idx_value + 3]);// - avg[type[idx] * ndescrpt + idx_value + 3]) / std[type[idx] * ndescrpt + idx_value + 3]; + } + + for (int ii = 0; ii < 4; ii++) { + row_descript[idx_value + ii] = (row_descript[idx_value + ii] - avg[type[idx] * ndescrpt + idx_value + ii]) / std[type[idx] * ndescrpt + idx_value + ii]; + } + // idy nloc, idx ndescrpt * 3 + // descript_deriv[idy * ndescrpt * 3 + idx] = (descript_deriv_dev[idy * (ndescrpt * 3) + idx]) / std[type[idy] * ndescrpt + idx / 3]; + for (int ii = 0; ii < 12; ii++) { + row_descript_deriv[idx_deriv + ii] /= std[type[idx] * ndescrpt + (idx_deriv + ii) / 3]; + } +} + +void DescrptSeALauncher(const VALUETYPE* coord, + const int* type, + const int* ilist, + const int* jrange, + const int* jlist, + int* array_int, + unsigned long long* array_longlong, + compute_t* array_double, + const VALUETYPE* avg, + const VALUETYPE* std, + VALUETYPE* descript, + VALUETYPE* descript_deriv, + VALUETYPE* rij, + int* nlist, + const int& ntypes, + const int& nloc, + const int& nall, + const int& nnei, + const float& rcut_r, + const float& rcut_r_smth, + const int& ndescrpt, + const std::vector& sec_a, + const bool& fill_nei_a +) +{ + const int LEN = 256; + int nblock = (nloc + LEN -1) / LEN; + int * sec_a_dev = array_int; + int * nei_iter = array_int + sec_a.size(); // = new int[sec_a_size]; + int * i_idx = array_int + sec_a.size() + nloc * sec_a.size(); + int_64 * key = array_longlong; + compute_t * sel_a_diff = array_double; // = new VALUETYPE *[nlist_size]; nnei + // int_64 * key = NULL; + // VALUETYPE * sel_a_diff = NULL; // = new VALUETYPE *[nlist_size]; nnei + + cudaError_t res = cudaSuccess; + // res = cudaMalloc((void**)&sec_a_dev, sizeof(int) * sec_a.size()); cudaErrcheck(res); + // res = cudaMalloc((void**)&nei_iter, sizeof(int) * nloc * sec_a.size()); cudaErrcheck(res); + // res = cudaMalloc((void**)&i_idx, sizeof(int) * nloc); cudaErrcheck(res); + // res = cudaMalloc((void**)&key, sizeof(unsigned long long) * nloc * MAGIC_NUMBER * 2); cudaErrcheck(res); + // res = 
cudaMalloc((void**)&sel_a_diff, sizeof(VALUETYPE) * nnei * 3 * nloc); cudaErrcheck(res); + res = cudaMemcpy(sec_a_dev, &sec_a[0], sizeof(int) * sec_a.size(), cudaMemcpyHostToDevice); cudaErrcheck(res); + res = cudaMemset(key, 0xffffffff, sizeof(int_64) * nloc * MAGIC_NUMBER); cudaErrcheck(res); + res = cudaMemset(nlist, -1, sizeof(int) * nloc * nnei); cudaErrcheck(res); + res = cudaMemset(descript, 0.0, sizeof(VALUETYPE) * nloc * ndescrpt); cudaErrcheck(res); + res = cudaMemset(descript_deriv, 0.0, sizeof(VALUETYPE) * nloc * ndescrpt * 3); cudaErrcheck(res); + // res = cudaMemset(rij, 0.0, sizeof(VALUETYPE) * nloc * nnei * 3); cudaErrcheck(res); + + if (fill_nei_a) { + // ~~~ + // cudaProfilerStart(); + get_i_idx_se_a<<>> (nloc, ilist, i_idx); + + format_nlist_fill_a_se_a<<>> ( + coord, + type, + jrange, + jlist, + rcut_r, + key, + i_idx + ); + const int ITEMS_PER_THREAD = 4; + const int BLOCK_THREADS = 64; + // BlockSortKernel<<>> ( + BlockSortKernel <<>> (key, key + nloc * MAGIC_NUMBER); + + format_nlist_fill_b_se_a<<>> ( + nlist, + nnei, + nloc, + jrange, + jlist, + key, + sec_a_dev, + sec_a.size(), + nei_iter + ); + } + + compute_descriptor_se_a<<>> ( + descript, + ndescrpt, + descript_deriv, + ndescrpt * 3, + rij, + nnei * 3, + type, + avg, + std, + nlist, + nnei, + coord, + rcut_r_smth, + rcut_r, + sel_a_diff + ); +//// + // res = cudaFree(sec_a_dev); cudaErrcheck(res); + // res = cudaFree(key); cudaErrcheck(res); + // res = cudaFree(i_idx); cudaErrcheck(res); + // res = cudaFree(nei_iter); cudaErrcheck(res); + // res = cudaFree(sel_a_diff); cudaErrcheck(res); + //output some interesting things... +} \ No newline at end of file diff --git a/source/op/cuda/descrpt_se_r.cu b/source/op/cuda/descrpt_se_r.cu new file mode 100644 index 0000000000..cc7dd4e904 --- /dev/null +++ b/source/op/cuda/descrpt_se_r.cu @@ -0,0 +1,344 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#define EIGEN_USE_GPU +#include +#include +#include +#include +#include +#include +#include +#include + +#define MAGIC_NUMBER 256 + +#ifdef HIGH_PREC + typedef double VALUETYPE; +#else + typedef float VALUETYPE; +#endif + +typedef double compute_t; + +typedef unsigned long long int_64; + +#define cudaErrcheck(res) { cudaAssert((res), __FILE__, __LINE__); } +inline void cudaAssert(cudaError_t code, const char *file, int line, bool abort=true) +{ + if (code != cudaSuccess) + { + fprintf(stderr,"cuda assert: %s %s %d\n", cudaGetErrorString(code), file, line); + if (abort) exit(code); + } +} + +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 600 +static __inline__ __device__ double atomicAdd(double* address, double val) { + unsigned long long int* address_as_ull = (unsigned long long int*)address; + unsigned long long int old = *address_as_ull, assumed; + do { + assumed = old; + old = atomicCAS(address_as_ull, assumed, + __double_as_longlong(val + __longlong_as_double(assumed))); + // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) } while (assumed != old); + } while (assumed != old); + return __longlong_as_double(old); +} +#endif + +template < + typename Key, + int BLOCK_THREADS, + int ITEMS_PER_THREAD> +__launch_bounds__ (BLOCK_THREADS) +__global__ void BlockSortKernel( + Key * d_in, + Key * d_out) // Tile of output +{ + enum { TILE_SIZE = BLOCK_THREADS * ITEMS_PER_THREAD }; + // Specialize BlockLoad type for our thread block (uses warp-striped loads for coalescing, then transposes in shared memory to a blocked arrangement) + typedef cub::BlockLoad BlockLoadT; + // Specialize BlockRadixSort type for our thread block + typedef cub::BlockRadixSort BlockRadixSortT; + // Shared memory + __shared__ union TempStorage + { + typename BlockLoadT::TempStorage load; + typename BlockRadixSortT::TempStorage sort; + } temp_storage; + // Per-thread tile items + Key items[ITEMS_PER_THREAD]; + // Our current block's offset + int block_offset = blockIdx.x * TILE_SIZE; + // Load items into a blocked arrangement + BlockLoadT(temp_storage.load).Load(d_in + block_offset, items); + // Barrier for smem reuse + __syncthreads(); + // Sort keys + BlockRadixSortT(temp_storage.sort).SortBlockedToStriped(items); + // Store output in striped fashion + cub::StoreDirectStriped(threadIdx.x, d_out + block_offset, items); +} + +template +__device__ inline T dev_dot(T * arr1, T * arr2) { + return arr1[0] * arr2[0] + arr1[1] * arr2[1] + arr1[2] * arr2[2]; +} + +__device__ inline void spline5_switch(compute_t & vv, + compute_t & dd, + compute_t & xx, + const compute_t & rmin, + const compute_t & rmax) +{ + if (xx < rmin) { + dd = 0; + vv = 1; + } + else if (xx < rmax) { + compute_t uu = (xx - rmin) / (rmax - rmin) ; + compute_t du = 1. 
/ (rmax - rmin) ; + vv = uu*uu*uu * (-6 * uu*uu + 15 * uu - 10) + 1; + dd = ( 3 * uu*uu * (-6 * uu*uu + 15 * uu - 10) + uu*uu*uu * (-12 * uu + 15) ) * du; + } + else { + dd = 0; + vv = 0; + } +} + +__global__ void get_i_idx_se_r(const int nloc, + const int * ilist, + int * i_idx) +{ + const unsigned int idy = blockIdx.x * blockDim.x + threadIdx.x; + if(idy >= nloc) { + return; + } + i_idx[ilist[idy]] = idy; +} + +__global__ void format_nlist_fill_a_se_r(const VALUETYPE * coord, + const int * type, + const int * jrange, + const int * jlist, + const float rcut, + int_64 * key, + int * i_idx) +{ + // <<>> + const unsigned int idx = blockIdx.x; + const unsigned int idy = threadIdx.x; + + const int nsize = jrange[i_idx[idx] + 1] - jrange[i_idx[idx]]; + if (idy >= nsize) { + return; + } + + const int * nei_idx = jlist + jrange[i_idx[idx]]; + // dev_copy(nei_idx, &jlist[jrange[i_idx]], nsize); + + int_64 * key_in = key + idx * MAGIC_NUMBER; + + compute_t diff[3]; + const int & j_idx = nei_idx[idy]; + for (int dd = 0; dd < 3; dd++) { + diff[dd] = coord[j_idx * 3 + dd] - coord[idx * 3 + dd]; + } + compute_t rr = sqrt(dev_dot(diff, diff)); + if (rr <= rcut) { + key_in[idy] = type[j_idx] * 1E15+ (int_64)(rr * 1.0E13) / 100000 * 100000 + j_idx; + } +} + + // bubble_sort(sel_nei, num_nei); +__global__ void format_nlist_fill_b_se_r(int * nlist, + const int nlist_size, + const int nloc, + const int * jrange, + const int * jlist, + int_64 * key, + const int * sec, + const int sec_a_size, + int * nei_iter_dev) +{ + + const unsigned int idy = blockIdx.x * blockDim.x + threadIdx.x; + + if(idy >= nloc) { + return; + } + + int * row_nlist = nlist + idy * nlist_size; + int * nei_iter = nei_iter_dev + idy * sec_a_size; + int_64 * key_out = key + nloc * MAGIC_NUMBER + idy * MAGIC_NUMBER; + + for (int ii = 0; ii < sec_a_size; ii++) { + nei_iter[ii] = sec[ii]; + } + + for (unsigned int kk = 0; key_out[kk] != key_out[MAGIC_NUMBER - 1]; kk++) { + const int & nei_type = key_out[kk] / 1E15; + if (nei_iter[nei_type] < sec[nei_type + 1]) { + row_nlist[nei_iter[nei_type]++] = key_out[kk] % 100000; + } + } +} +//it's ok! 
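Both `descrpt_se_a.cu` and this file define the same `spline5_switch` smoothing function: for `rmin < x < rmax` and `u = (x - rmin) / (rmax - rmin)` it evaluates `s(u) = u^3 (-6u^2 + 15u - 10) + 1`, which decays from 1 at `rmin` to 0 at `rmax` with vanishing first and second derivatives at both ends, and returns `ds/dx` in `dd`. A plain C++ transcription (illustrative only; the cutoff values 1.0 and 6.0 below are made-up examples) makes the behaviour easy to probe:

```cpp
#include <cstdio>

// CPU copy of the spline5_switch used by both descriptor kernels.
void spline5_switch(double &vv, double &dd, double xx, double rmin, double rmax) {
  if (xx < rmin) { vv = 1; dd = 0; }
  else if (xx < rmax) {
    double uu = (xx - rmin) / (rmax - rmin);
    double du = 1. / (rmax - rmin);
    vv = uu * uu * uu * (-6 * uu * uu + 15 * uu - 10) + 1;
    dd = (3 * uu * uu * (-6 * uu * uu + 15 * uu - 10)
          + uu * uu * uu * (-12 * uu + 15)) * du;
  } else { vv = 0; dd = 0; }
}

int main() {
  double vv, dd;
  // hypothetical cutoffs: rcut_smth = 1.0, rcut = 6.0
  for (double x : {0.5, 1.0, 3.5, 6.0, 6.5}) {
    spline5_switch(vv, dd, x, 1.0, 6.0);
    std::printf("r = %4.2f  s(r) = %8.6f  ds/dr = %9.6f\n", x, vv, dd);
  }
  return 0;
}
```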
+ +__global__ void compute_descriptor_se_r (VALUETYPE* descript, + const int ndescrpt, + VALUETYPE* descript_deriv, + const int descript_deriv_size, + VALUETYPE* rij, + const int rij_size, + const int* type, + const VALUETYPE* avg, + const VALUETYPE* std, + int* nlist, + const int nlist_size, + const VALUETYPE* coord, + const VALUETYPE rmin, + const VALUETYPE rmax, + compute_t* sel_diff_dev) +{ + // <<>> + const unsigned int idx = blockIdx.x; + const unsigned int idy = threadIdx.x; + const int idx_deriv = idy * 3; // 1 components time 3 directions + const int idx_value = idy; // 1 components + + // else {return;} + VALUETYPE * row_descript = descript + idx * ndescrpt; + VALUETYPE * row_descript_deriv = descript_deriv + idx * descript_deriv_size; + VALUETYPE * row_rij = rij + idx * rij_size; + compute_t * sel_diff = sel_diff_dev + idx * nlist_size * 3; + int * row_nlist = nlist + idx * nlist_size; + + if (row_nlist[idy] >= 0) { + const int & j_idx = row_nlist[idy]; + for (int kk = 0; kk < 3; kk++) { + sel_diff[idy * 3 + kk] = coord[j_idx * 3 + kk] - coord[idx * 3 + kk]; + row_rij[idy * 3 + kk] = sel_diff[idy * 3 + kk]; + } + const compute_t * rr = &sel_diff[idy * 3 + 0]; + compute_t nr2 = dev_dot(rr, rr); + compute_t inr = 1./sqrt(nr2); + compute_t nr = nr2 * inr; + compute_t inr2 = inr * inr; + compute_t inr4 = inr2 * inr2; + compute_t inr3 = inr4 * nr; + compute_t sw, dsw; + spline5_switch(sw, dsw, nr, rmin, rmax); + row_descript[idx_value + 0] = (1./nr) ;//* sw; + + row_descript_deriv[idx_deriv + 0] = (rr[0] * inr3 * sw - row_descript[idx_value + 0] * dsw * rr[0] * inr); // avg[type[(idx_deriv + 0) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 0) % (ndescrpt * 3)) / 3]; + row_descript_deriv[idx_deriv + 1] = (rr[1] * inr3 * sw - row_descript[idx_value + 0] * dsw * rr[1] * inr); // avg[type[(idx_deriv + 1) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 1) % (ndescrpt * 3)) / 3]; + row_descript_deriv[idx_deriv + 2] = (rr[2] * inr3 * sw - row_descript[idx_value + 0] * dsw * rr[2] * inr); // avg[type[(idx_deriv + 2) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 2) % (ndescrpt * 3)) / 3]; + // 1 value components + row_descript[idx_value + 0] *= sw; // * descript[idx * ndescrpt + idx_value + 0]);// - avg[type[idx] * ndescrpt + idx_value + 0]) / std[type[idx] * ndescrpt + idx_value + 0]; + } + + for (int ii = 0; ii < 1; ii++) { + row_descript[idx_value + ii] = (row_descript[idx_value + ii] - avg[type[idx] * ndescrpt + idx_value + ii]) / std[type[idx] * ndescrpt + idx_value + ii]; + } + for (int ii = 0; ii < 3; ii++) { + row_descript_deriv[idx_deriv + ii] /= std[type[idx] * ndescrpt + (idx_deriv + ii) / 3]; + } +} + +void DescrptSeRLauncher(const VALUETYPE* coord, + const int* type, + const int* ilist, + const int* jrange, + const int* jlist, + int* array_int, + unsigned long long* array_longlong, + compute_t* array_double, + const VALUETYPE* avg, + const VALUETYPE* std, + VALUETYPE* descript, + VALUETYPE* descript_deriv, + VALUETYPE* rij, + int* nlist, + const int& ntypes, + const int& nloc, + const int& nall, + const int& nnei, + const float& rcut, + const float& rcut_smth, + const int& ndescrpt, + const std::vector& sec, + const bool& fill_nei_a +) +{ + const int LEN = 256; + int nblock = (nloc + LEN -1) / LEN; + int * sec_dev = array_int; + int * nei_iter = array_int + sec.size(); // = new int[sec_a_size]; + int * i_idx = array_int + sec.size() + nloc * sec.size(); + int_64 * key = array_longlong; + compute_t * sel_diff = array_double; // = new VALUETYPE *[nlist_size]; nnei + + cudaError_t res = 
cudaSuccess; + res = cudaMemcpy(sec_dev, &sec[0], sizeof(int) * sec.size(), cudaMemcpyHostToDevice); cudaErrcheck(res); + res = cudaMemset(key, 0xffffffff, sizeof(int_64) * nloc * MAGIC_NUMBER); cudaErrcheck(res); + res = cudaMemset(nlist, -1, sizeof(int) * nloc * nnei); cudaErrcheck(res); + res = cudaMemset(descript, 0.0, sizeof(VALUETYPE) * nloc * ndescrpt); cudaErrcheck(res); + res = cudaMemset(descript_deriv, 0.0, sizeof(VALUETYPE) * nloc * ndescrpt * 3); cudaErrcheck(res); + + if (fill_nei_a) { + // cudaProfilerStart(); + get_i_idx_se_r<<>> (nloc, ilist, i_idx); + + format_nlist_fill_a_se_r<<>> ( + coord, + type, + jrange, + jlist, + rcut, + key, + i_idx + ); + const int ITEMS_PER_THREAD = 4; + const int BLOCK_THREADS = 64; + BlockSortKernel <<>> (key, key + nloc * MAGIC_NUMBER); + format_nlist_fill_b_se_r<<>> ( + nlist, + nnei, + nloc, + jrange, + jlist, + key, + sec_dev, + sec.size(), + nei_iter + ); + } + compute_descriptor_se_r<<>> ( + descript, + ndescrpt, + descript_deriv, + ndescrpt * 3, + rij, + nnei * 3, + type, + avg, + std, + nlist, + nnei, + coord, + rcut_smth, + rcut, + sel_diff + ); +} \ No newline at end of file diff --git a/source/op/cuda/prod_force_se_a.cu b/source/op/cuda/prod_force_se_a.cu new file mode 100644 index 0000000000..080ff8ef75 --- /dev/null +++ b/source/op/cuda/prod_force_se_a.cu @@ -0,0 +1,102 @@ +#include +#include +#include + +#ifdef HIGH_PREC + typedef double VALUETYPE; +#else + typedef float VALUETYPE; +#endif + +#define cudaErrcheck(res) { cudaAssert((res), __FILE__, __LINE__); } +inline void cudaAssert(cudaError_t code, const char *file, int line, bool abort=true) +{ + if (code != cudaSuccess) + { + fprintf(stderr,"cuda assert: %s %s %d\n", cudaGetErrorString(code), file, line); + if (abort) exit(code); + } +} + +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 600 +static __inline__ __device__ double atomicAdd(double* address, double val) { + unsigned long long int* address_as_ull = (unsigned long long int*)address; + unsigned long long int old = *address_as_ull, assumed; + do { + assumed = old; + old = atomicCAS(address_as_ull, assumed, + __double_as_longlong(val + __longlong_as_double(assumed))); + // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) } while (assumed != old); + } while (assumed != old); + return __longlong_as_double(old); +} +#endif + +__global__ void deriv_wrt_center_atom_se_a(VALUETYPE * force, + const VALUETYPE * net_deriv, + const VALUETYPE * in_deriv, + const int ndescrpt) +{ + const unsigned int idx = blockIdx.y; + const unsigned int idy = blockIdx.x * blockDim.x + threadIdx.x; + const unsigned int idz = threadIdx.y; + + if (idy >= ndescrpt) { + return; + } + + atomicAdd(force + idx * 3 + idz, -1.0 * net_deriv[idx * ndescrpt + idy] * in_deriv[idx * ndescrpt * 3 + idy * 3 + idz]); +} + +__global__ void deriv_wrt_neighbors_se_a(VALUETYPE * force, + const VALUETYPE * net_deriv, + const VALUETYPE * in_deriv, + const int * nlist, + const int nloc, + const int nnei, + const int ndescrpt, + const int n_a_sel, + const int n_a_shift) +{ + // idy -> nnei + const unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x; + const unsigned int idy = blockIdx.y; + const unsigned int idz = threadIdx.y; + const unsigned int idw = threadIdx.z; + + if (idx >= nloc) { + return; + } + // deriv wrt neighbors + int j_idx = nlist[idx * nnei + idy]; + if (j_idx < 0) { + return; + } + atomicAdd(force + j_idx * 3 + idz, net_deriv[idx * ndescrpt + idy * 4 + idw] * in_deriv[idx * ndescrpt * 3 + (idy * 4 + idw) * 3 + idz]); +} 
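The launcher that follows runs the two kernels above back to back: `deriv_wrt_center_atom_se_a` subtracts the chain-rule product `net_deriv * in_deriv` from the force on each local atom, one thread per descriptor component, and `deriv_wrt_neighbors_se_a` adds the matching contribution to whichever atom the formatted `nlist` slot points at. Because many threads, and many different local atoms, can target the same output atom, both kernels accumulate with `atomicAdd`. A single-threaded CPU reference of the same bookkeeping (a sketch for clarity, not part of the patch; it assumes the se_a layout `ndescrpt = 4 * nnei` used throughout these files) looks like this:

```cpp
#include <cstdio>
#include <vector>

// CPU reference of ProdForceSeALauncher's accumulation (sequential, so no atomics needed).
void prod_force_se_a_cpu(std::vector<double> &force,            // nall * 3, pre-zeroed
                         const std::vector<double> &net_deriv,  // nloc * ndescrpt
                         const std::vector<double> &in_deriv,   // nloc * ndescrpt * 3
                         const std::vector<int> &nlist,         // nloc * nnei, -1 = empty slot
                         int nloc, int nnei, int ndescrpt) {
  for (int i = 0; i < nloc; ++i) {
    // force on the center atom: minus the product over all descriptor components
    for (int d = 0; d < ndescrpt; ++d)
      for (int k = 0; k < 3; ++k)
        force[i * 3 + k] -= net_deriv[i * ndescrpt + d]
                          * in_deriv[i * ndescrpt * 3 + d * 3 + k];
    // reaction on each listed neighbor: plus the product over its 4 se_a components
    for (int n = 0; n < nnei; ++n) {
      int j = nlist[i * nnei + n];
      if (j < 0) continue;
      for (int w = 0; w < 4; ++w)
        for (int k = 0; k < 3; ++k)
          force[j * 3 + k] += net_deriv[i * ndescrpt + n * 4 + w]
                            * in_deriv[i * ndescrpt * 3 + (n * 4 + w) * 3 + k];
    }
  }
}

int main() {
  // tiny hypothetical system: 1 local atom, 2 neighbor slots (1 filled), 3 atoms in total
  int nloc = 1, nnei = 2, ndescrpt = 4 * nnei, nall = 3;
  std::vector<double> net_deriv(nloc * ndescrpt, 0.1);
  std::vector<double> in_deriv(nloc * ndescrpt * 3, 0.2);
  std::vector<int> nlist = {1, -1};
  std::vector<double> force(nall * 3, 0.0);
  prod_force_se_a_cpu(force, net_deriv, in_deriv, nlist, nloc, nnei, ndescrpt);
  for (int a = 0; a < nall; ++a)
    std::printf("atom %d  f = (%g, %g, %g)\n", a, force[a * 3], force[a * 3 + 1], force[a * 3 + 2]);
  return 0;
}
```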
+ +void ProdForceSeALauncher(VALUETYPE * force, + const VALUETYPE * net_deriv, + const VALUETYPE * in_deriv, + const int * nlist, + const int nloc, + const int nall, + const int ndescrpt, + const int nnei, + const int n_a_sel, + const int n_a_shift) +{ + // std::cout << "I'm here!" << std::endl; + cudaErrcheck(cudaMemset(force, 0.0, sizeof(VALUETYPE) * nall * 3)); + const int LEN1 = 256; + const int nblock1 = (ndescrpt + LEN1 -1) / LEN1; + dim3 grid(nblock1, nloc); + dim3 thread(LEN1, 3); + deriv_wrt_center_atom_se_a<<>>(force, net_deriv, in_deriv, ndescrpt); + + const int LEN = 64; + int nblock = (nloc + LEN -1) / LEN; + dim3 block_grid(nblock, nnei); + dim3 thread_grid(LEN, 3, 4); + deriv_wrt_neighbors_se_a<<>>(force, net_deriv, in_deriv, nlist, nloc, nnei, ndescrpt, n_a_sel, n_a_shift); +} \ No newline at end of file diff --git a/source/op/cuda/prod_force_se_r.cu b/source/op/cuda/prod_force_se_r.cu new file mode 100644 index 0000000000..765842d9c3 --- /dev/null +++ b/source/op/cuda/prod_force_se_r.cu @@ -0,0 +1,99 @@ +#include +#include + +#ifdef HIGH_PREC + typedef double VALUETYPE; +#else + typedef float VALUETYPE; +#endif + +#define cudaErrcheck(res) { cudaAssert((res), __FILE__, __LINE__); } +inline void cudaAssert(cudaError_t code, const char *file, int line, bool abort=true) +{ + if (code != cudaSuccess) + { + fprintf(stderr,"cuda assert: %s %s %d\n", cudaGetErrorString(code), file, line); + if (abort) exit(code); + } +} + +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 600 +static __inline__ __device__ double atomicAdd(double* address, double val) { + unsigned long long int* address_as_ull = (unsigned long long int*)address; + unsigned long long int old = *address_as_ull, assumed; + do { + assumed = old; + old = atomicCAS(address_as_ull, assumed, + __double_as_longlong(val + __longlong_as_double(assumed))); + // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) } while (assumed != old); + } while (assumed != old); + return __longlong_as_double(old); +} +#endif + +__global__ void deriv_wrt_center_atom_se_r(VALUETYPE * force, + const VALUETYPE * net_deriv, + const VALUETYPE * in_deriv, + const int ndescrpt) +{ + const unsigned int idx = blockIdx.y; + const unsigned int idy = blockIdx.x * blockDim.x + threadIdx.x; + const unsigned int idz = threadIdx.y; + + if (idy >= ndescrpt) { + return; + } + + atomicAdd(force + idx * 3 + idz, -1.0 * net_deriv[idx * ndescrpt + idy] * in_deriv[idx * ndescrpt * 3 + idy * 3 + idz]); +} + +__global__ void deriv_wrt_neighbors_se_r(VALUETYPE * force, + const VALUETYPE * net_deriv, + const VALUETYPE * in_deriv, + const int * nlist, + const int nloc, + const int nnei, + const int ndescrpt, + const int n_a_sel, + const int n_a_shift) +{ + // idy -> nnei + const unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x; + const unsigned int idy = blockIdx.y; + const unsigned int idz = threadIdx.y; + + if (idx >= nloc) { + return; + } + // deriv wrt neighbors + int j_idx = nlist[idx * nnei + idy]; + if (j_idx < 0) { + return; + } + atomicAdd(force + j_idx * 3 + idz, net_deriv[idx * ndescrpt + idy] * in_deriv[idx * ndescrpt * 3 + idy * 3 + idz]); +} + +void ProdForceSeRLauncher(VALUETYPE * force, + const VALUETYPE * net_deriv, + const VALUETYPE * in_deriv, + const int * nlist, + const int nloc, + const int nall, + const int ndescrpt, + const int nnei, + const int n_a_sel, + const int n_a_shift) +{ + cudaErrcheck(cudaMemset(force, 0.0, sizeof(VALUETYPE) * nall * 3)); + const int LEN1 = 256; + const int nblock1 = (ndescrpt + LEN1 -1) 
/ LEN1; + dim3 grid(nblock1, nloc); + dim3 thread(LEN1, 3); + deriv_wrt_center_atom_se_r<<>>(force, net_deriv, in_deriv, ndescrpt); + + const int LEN = 64; + int nblock = (nloc + LEN -1) / LEN; + dim3 block_grid(nblock, nnei); + dim3 thread_grid(LEN, 3); + deriv_wrt_neighbors_se_r<<>>(force, net_deriv, in_deriv, nlist, nloc, nnei, ndescrpt, n_a_sel, n_a_shift); +} \ No newline at end of file diff --git a/source/op/cuda/prod_virial_se_a.cu b/source/op/cuda/prod_virial_se_a.cu new file mode 100644 index 0000000000..f93e14bcec --- /dev/null +++ b/source/op/cuda/prod_virial_se_a.cu @@ -0,0 +1,98 @@ +#include +#include + +#define MUL 512 + +#ifdef HIGH_PREC + typedef double VALUETYPE; +#else + typedef float VALUETYPE; +#endif + +#define cudaErrcheck(res) { cudaAssert((res), __FILE__, __LINE__); } +inline void cudaAssert(cudaError_t code, const char *file, int line, bool abort=true) { + if (code != cudaSuccess) { + fprintf(stderr,"cuda assert: %s %s %d\n", cudaGetErrorString(code), file, line); + if (abort) exit(code); + } +} + +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 600 +static __inline__ __device__ double atomicAdd(double* address, double val) { + unsigned long long int* address_as_ull = (unsigned long long int*)address; + unsigned long long int old = *address_as_ull, assumed; + do { + assumed = old; + old = atomicCAS(address_as_ull, assumed, + __double_as_longlong(val + __longlong_as_double(assumed))); + // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) } while (assumed != old); + } while (assumed != old); + return __longlong_as_double(old); +} +#endif + +__global__ void deriv_wrt_neighbors_se_a(VALUETYPE * virial, + VALUETYPE * atom_virial, + const VALUETYPE * net_deriv, + const VALUETYPE * in_deriv, + const VALUETYPE * rij, + const int * nlist, + const int nloc, + const int nnei, + const int ndescrpt, + const int n_a_sel, + const int n_a_shift) +{ + // idx -> nloc + // idy -> nnei + const unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x; + const unsigned int idy = blockIdx.y; + const unsigned int idz = threadIdx.y; + const unsigned int idw = threadIdx.z; + + if (idx >= nloc) { + return; + } + int j_idx = nlist[idx * nnei + idy]; + if (j_idx < 0) { + return; + } + // atomicAdd(virial + idz, net_deriv[idx * ndescrpt + idy * 4 + idw] * rij[idx * nnei * 3 + idy * 3 + idz / 3] * in_deriv[idx * ndescrpt * 3 + (idy * 4 + idw) * 3 + idz % 3]); + atomicAdd(atom_virial + j_idx * 9 + idz, net_deriv[idx * ndescrpt + idy * 4 + idw] * rij[idx * nnei * 3 + idy * 3 + idz / 3] * in_deriv[idx * ndescrpt * 3 + (idy * 4 + idw) * 3 + idz % 3]); +} + +void ProdVirialSeALauncher(VALUETYPE * virial, + VALUETYPE * atom_virial, + const VALUETYPE * net_deriv, + const VALUETYPE * in_deriv, + const VALUETYPE * rij, + const int * nlist, + const int nloc, + const int nall, + const int nnei, + const int ndescrpt, + const int n_a_sel, + const int n_a_shift) +{ + cudaErrcheck(cudaMemset(virial, 0.0, sizeof(VALUETYPE) * 9)); + cudaErrcheck(cudaMemset(atom_virial, 0.0, sizeof(VALUETYPE) * 9 * nall)); + + const int LEN = 16; + int nblock = (nloc + LEN -1) / LEN; + dim3 block_grid(nblock, nnei); + dim3 thread_grid(LEN, 9, 4); + // compute virial of a frame + deriv_wrt_neighbors_se_a<<>>( + virial, + atom_virial, + net_deriv, + in_deriv, + rij, + nlist, + nloc, + nnei, + ndescrpt, + n_a_sel, + n_a_shift + ); +} diff --git a/source/op/cuda/prod_virial_se_r.cu b/source/op/cuda/prod_virial_se_r.cu new file mode 100644 index 0000000000..d5b6aad3cc --- /dev/null +++ 
b/source/op/cuda/prod_virial_se_r.cu @@ -0,0 +1,98 @@ +#include +#include +#include + +#define MUL 512 + +#ifdef HIGH_PREC + typedef double VALUETYPE; +#else + typedef float VALUETYPE; +#endif + +#define cudaErrcheck(res) { cudaAssert((res), __FILE__, __LINE__); } +inline void cudaAssert(cudaError_t code, const char *file, int line, bool abort=true) { + if (code != cudaSuccess) { + fprintf(stderr,"cuda assert: %s %s %d\n", cudaGetErrorString(code), file, line); + if (abort) exit(code); + } +} + +// currently, double precision atomicAdd only support arch number larger than 6.0 +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 600 +static __inline__ __device__ double atomicAdd(double* address, double val) { + unsigned long long int* address_as_ull = (unsigned long long int*)address; + unsigned long long int old = *address_as_ull, assumed; + do { + assumed = old; + old = atomicCAS(address_as_ull, assumed, + __double_as_longlong(val + __longlong_as_double(assumed))); + // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) } while (assumed != old); + } while (assumed != old); + return __longlong_as_double(old); +} +#endif + +__global__ void deriv_wrt_neighbors_se_r(VALUETYPE * virial, + VALUETYPE * atom_virial, + const VALUETYPE * net_deriv, + const VALUETYPE * in_deriv, + const VALUETYPE * rij, + const int * nlist, + const int nloc, + const int nnei, + const int ndescrpt, + const int n_a_sel, + const int n_a_shift) +{ + // idx -> nloc + // idy -> nnei + const unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x; + const unsigned int idy = blockIdx.y; + const unsigned int idz = threadIdx.y; + + if (idx >= nloc) { + return; + } + int j_idx = nlist[idx * nnei + idy]; + if (j_idx < 0) { + return; + } + atomicAdd(atom_virial + j_idx * 9 + idz, net_deriv[idx * ndescrpt + idy] * rij[idx * nnei * 3 + idy * 3 + idz / 3] * in_deriv[idx * ndescrpt * 3 + idy * 3 + idz % 3]); +} + +void ProdVirialSeRLauncher(VALUETYPE * virial, + VALUETYPE * atom_virial, + const VALUETYPE * net_deriv, + const VALUETYPE * in_deriv, + const VALUETYPE * rij, + const int * nlist, + const int nloc, + const int nall, + const int nnei, + const int ndescrpt, + const int n_a_sel, + const int n_a_shift) +{ + cudaErrcheck(cudaMemset(virial, 0.0, sizeof(VALUETYPE) * 9)); + cudaErrcheck(cudaMemset(atom_virial, 0.0, sizeof(VALUETYPE) * 9 * nall)); + + const int LEN = 16; + int nblock = (nloc + LEN -1) / LEN; + dim3 block_grid(nblock, nnei); + dim3 thread_grid(LEN, 9); + // compute virial of a frame + deriv_wrt_neighbors_se_r<<>>( + virial, + atom_virial, + net_deriv, + in_deriv, + rij, + nlist, + nloc, + nnei, + ndescrpt, + n_a_sel, + n_a_shift + ); +} \ No newline at end of file diff --git a/source/op/descrpt_se_a_gpu.cc b/source/op/descrpt_se_a_gpu.cc new file mode 100644 index 0000000000..93c83016fb --- /dev/null +++ b/source/op/descrpt_se_a_gpu.cc @@ -0,0 +1,259 @@ +#include +#include +#include +#include +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/shape_inference.h" + +using namespace tensorflow; // NOLINT(build/namespaces) +#define MAGIC_NUMBER 256 + +#ifdef HIGH_PREC + typedef double VALUETYPE ; +#else + typedef float VALUETYPE ; +#endif + +typedef double compute_t; + +#define cudaErrcheck(res) { cudaAssert((res), __FILE__, __LINE__); } +inline void cudaAssert(cudaError_t code, const char *file, int line, bool abort=true) +{ + if (code != cudaSuccess) + { + fprintf(stderr,"cuda assert: %s %s %d\n", 
cudaGetErrorString(code), file, line); + if (abort) exit(code); + } +} + +using GPUDevice = Eigen::GpuDevice; + +#ifdef HIGH_PREC +REGISTER_OP("DescrptSeA") + .Input("coord: double") //atomic coordinates + .Input("type: int32") //atomic type + .Input("natoms: int32") //local atomic number; each type atomic number; daizheyingxiangqude atomic numbers + .Input("box : double") + .Input("mesh : int32") + .Input("davg: double") //average value of data + .Input("dstd: double") //standard deviation + .Attr("rcut_a: float") //no use + .Attr("rcut_r: float") + .Attr("rcut_r_smth: float") + .Attr("sel_a: list(int)") + .Attr("sel_r: list(int)") //all zero + .Output("descrpt: double") + .Output("descrpt_deriv: double") + .Output("rij: double") + .Output("nlist: int32"); + // only sel_a and rcut_r uesd. +#else +REGISTER_OP("DescrptSeA") + .Input("coord: float") + .Input("type: int32") + .Input("natoms: int32") + .Input("box : float") + .Input("mesh : int32") + .Input("davg: float") + .Input("dstd: float") + .Attr("rcut_a: float") + .Attr("rcut_r: float") + .Attr("rcut_r_smth: float") + .Attr("sel_a: list(int)") + .Attr("sel_r: list(int)") + .Output("descrpt: float") + .Output("descrpt_deriv: float") + .Output("rij: float") + .Output("nlist: int32"); +#endif + +void DescrptSeALauncher(const VALUETYPE* coord, + const int* type, + const int* ilist, + const int* jrange, + const int* jlist, + int* array_int, + unsigned long long* array_longlong, + compute_t* array_double, + const VALUETYPE* avg, + const VALUETYPE* std, + VALUETYPE* descript, + VALUETYPE* descript_deriv, + VALUETYPE* rij, + int* nlist, + const int& ntypes, + const int& nloc, + const int& nall, + const int& nnei, + const float& rcut_r, + const float& rcut_r_smth, + const int& ndescrpt, + const std::vector& sec_a, + const bool& fill_nei_a +); + +class DescrptSeAOp : public OpKernel { +public: + explicit DescrptSeAOp(OpKernelConstruction* context) : OpKernel(context) { + float nloc_f, nall_f; + OP_REQUIRES_OK(context, context->GetAttr("rcut_a", &rcut_a)); + OP_REQUIRES_OK(context, context->GetAttr("rcut_r", &rcut_r)); + OP_REQUIRES_OK(context, context->GetAttr("rcut_r_smth", &rcut_r_smth)); + OP_REQUIRES_OK(context, context->GetAttr("sel_a", &sel_a)); + OP_REQUIRES_OK(context, context->GetAttr("sel_r", &sel_r)); + // OP_REQUIRES_OK(context, context->GetAttr("nloc", &nloc_f)); + // OP_REQUIRES_OK(context, context->GetAttr("nall", &nall_f)); + cum_sum (sec_a, sel_a); + cum_sum (sec_r, sel_r); + ndescrpt_a = sec_a.back() * 4; + ndescrpt_r = sec_r.back() * 1; + ndescrpt = ndescrpt_a + ndescrpt_r; + nnei_a = sec_a.back(); + nnei_r = sec_r.back(); + nnei = nnei_a + nnei_r; + fill_nei_a = (rcut_a < 0); + } + + void Compute(OpKernelContext* context) override { + // Grab the input tensor + int context_input_index = 0; + const Tensor& coord_tensor = context->input(context_input_index++); + const Tensor& type_tensor = context->input(context_input_index++); + const Tensor& natoms_tensor = context->input(context_input_index++); + const Tensor& box_tensor = context->input(context_input_index++); + const Tensor& mesh_tensor = context->input(context_input_index++); + const Tensor& avg_tensor = context->input(context_input_index++); + const Tensor& std_tensor = context->input(context_input_index++); + // set size of the sample. 
assume 't' is [[[1, 1, 1], [2, 2, 2]], [[3, 3, 3], [4, 4, 4]]], then shape(t) ==> [2, 2, 3] + OP_REQUIRES (context, (coord_tensor.shape().dims() == 2), errors::InvalidArgument ("Dim of coord should be 2")); + OP_REQUIRES (context, (type_tensor.shape().dims() == 2), errors::InvalidArgument ("Dim of type should be 2")); + OP_REQUIRES (context, (natoms_tensor.shape().dims() == 1), errors::InvalidArgument ("Dim of natoms should be 1")); + OP_REQUIRES (context, (box_tensor.shape().dims() == 2), errors::InvalidArgument ("Dim of box should be 2")); + OP_REQUIRES (context, (mesh_tensor.shape().dims() == 1), errors::InvalidArgument ("Dim of mesh should be 1")); + OP_REQUIRES (context, (avg_tensor.shape().dims() == 2), errors::InvalidArgument ("Dim of avg should be 2")); + OP_REQUIRES (context, (std_tensor.shape().dims() == 2), errors::InvalidArgument ("Dim of std should be 2")); + OP_REQUIRES (context, (fill_nei_a), errors::InvalidArgument ("Rotational free descriptor only support the case rcut_a < 0")); + OP_REQUIRES (context, (sec_r.back() == 0), errors::InvalidArgument ("Rotational free descriptor only support all-angular information: sel_r should be all zero.")); + + OP_REQUIRES (context, (natoms_tensor.shape().dim_size(0) >= 3), errors::InvalidArgument ("number of atoms should be larger than (or equal to) 3")); + + int * natoms = new int[natoms_tensor.shape().dim_size(0)]; + cudaErrcheck(cudaMemcpy(natoms, natoms_tensor.flat().data(), sizeof(int) * natoms_tensor.shape().dim_size(0), cudaMemcpyDeviceToHost)); + int nloc = natoms[0]; + int nall = natoms[1]; + int ntypes = natoms_tensor.shape().dim_size(0) - 2; //nloc and nall mean something. + int nsamples = coord_tensor.shape().dim_size(0); + // + //// check the sizes + OP_REQUIRES (context, (nsamples == type_tensor.shape().dim_size(0)), errors::InvalidArgument ("number of samples should match")); + OP_REQUIRES (context, (nsamples == box_tensor.shape().dim_size(0)), errors::InvalidArgument ("number of samples should match")); + OP_REQUIRES (context, (ntypes == avg_tensor.shape().dim_size(0)), errors::InvalidArgument ("number of avg should be ntype")); + OP_REQUIRES (context, (ntypes == std_tensor.shape().dim_size(0)), errors::InvalidArgument ("number of std should be ntype")); + + OP_REQUIRES (context, (nall * 3 == coord_tensor.shape().dim_size(1)), errors::InvalidArgument ("number of atoms should match")); + OP_REQUIRES (context, (nall == type_tensor.shape().dim_size(1)), errors::InvalidArgument ("number of atoms should match")); + OP_REQUIRES (context, (9 == box_tensor.shape().dim_size(1)), errors::InvalidArgument ("number of box should be 9")); + OP_REQUIRES (context, (ndescrpt == avg_tensor.shape().dim_size(1)), errors::InvalidArgument ("number of avg should be ndescrpt")); + OP_REQUIRES (context, (ndescrpt == std_tensor.shape().dim_size(1)), errors::InvalidArgument ("number of std should be ndescrpt")); + + OP_REQUIRES (context, (ntypes == int(sel_a.size())), errors::InvalidArgument ("number of types should match the length of sel array")); + OP_REQUIRES (context, (ntypes == int(sel_r.size())), errors::InvalidArgument ("number of types should match the length of sel array")); + + // Create output tensors + TensorShape descrpt_shape ; + descrpt_shape.AddDim (nsamples); + descrpt_shape.AddDim (nloc * ndescrpt); + TensorShape descrpt_deriv_shape ; + descrpt_deriv_shape.AddDim (nsamples); + descrpt_deriv_shape.AddDim (nloc * ndescrpt * 3); + TensorShape rij_shape ; + rij_shape.AddDim (nsamples); + rij_shape.AddDim (nloc * nnei * 3); + 
TensorShape nlist_shape ; + nlist_shape.AddDim (nsamples); + nlist_shape.AddDim (nloc * nnei); + + int context_output_index = 0; + Tensor* descrpt_tensor = NULL; + OP_REQUIRES_OK(context, context->allocate_output(context_output_index++, + descrpt_shape, + &descrpt_tensor)); + Tensor* descrpt_deriv_tensor = NULL; + OP_REQUIRES_OK(context, context->allocate_output(context_output_index++, + descrpt_deriv_shape, + &descrpt_deriv_tensor)); + Tensor* rij_tensor = NULL; + OP_REQUIRES_OK(context, context->allocate_output(context_output_index++, + rij_shape, + &rij_tensor)); + Tensor* nlist_tensor = NULL; + OP_REQUIRES_OK(context, context->allocate_output(context_output_index++, + nlist_shape, + &nlist_tensor)); + + int * ilist = NULL, *jrange = NULL, *jlist = NULL; + int * array_int = NULL; unsigned long long *array_longlong = NULL; compute_t *array_double = NULL; + cudaErrcheck(cudaMemcpy(&(ilist), 4 + mesh_tensor.flat().data(), sizeof(int *), cudaMemcpyDeviceToHost)); + cudaErrcheck(cudaMemcpy(&(jrange), 8 + mesh_tensor.flat().data(), sizeof(int *), cudaMemcpyDeviceToHost)); + cudaErrcheck(cudaMemcpy(&(jlist), 12 + mesh_tensor.flat().data(), sizeof(int *), cudaMemcpyDeviceToHost)); + cudaErrcheck(cudaMemcpy(&(array_int), 16 + mesh_tensor.flat().data(), sizeof(int *), cudaMemcpyDeviceToHost)); + cudaErrcheck(cudaMemcpy(&(array_longlong), 20 + mesh_tensor.flat().data(), sizeof(unsigned long long *), cudaMemcpyDeviceToHost)); + cudaErrcheck(cudaMemcpy(&(array_double), 24 + mesh_tensor.flat().data(), sizeof(compute_t *), cudaMemcpyDeviceToHost)); + + // cudaErrcheck(cudaMemcpy(jlist, host_jlist, sizeof(int) * nloc * MAGIC_NUMBER, cudaMemcpyHostToDevice)); + // Launch computation + for (int II = 0; II < nsamples; II++) { + DescrptSeALauncher(coord_tensor.matrix().data() + II * (nall * 3), // related to the kk argument + type_tensor.matrix().data() + II * nall, // also related to the kk argument + ilist, + jrange, + jlist, + array_int, + array_longlong, + array_double, + avg_tensor.matrix().data(), + std_tensor.matrix().data(), + descrpt_tensor->matrix().data() + II * (nloc * ndescrpt), + descrpt_deriv_tensor->matrix().data() + II * (nloc * ndescrpt * 3), + rij_tensor->matrix().data() + II * (nloc * nnei * 3), + nlist_tensor->matrix().data() + II * (nloc * nnei), + ntypes, + nloc, + nall, + nnei, + rcut_r, + rcut_r_smth, + ndescrpt, + sec_a, + fill_nei_a + ); + } + // std::cout << "done" << std::endl; + delete[] natoms; + } + +///////////////////////////////////////////////////////////////////////////////////////////// +private: + float rcut_a; + float rcut_r; + float rcut_r_smth; + std::vector sel_r; + std::vector sel_a; + std::vector sec_a; + std::vector sec_r; + int ndescrpt, ndescrpt_a, ndescrpt_r; + int nnei, nnei_a, nnei_r, nloc, nall; + bool fill_nei_a; + + //private func + void cum_sum (std::vector & sec, const std::vector & n_sel) const { + sec.resize (n_sel.size() + 1); + sec[0] = 0; + for (int ii = 1; ii < sec.size(); ++ii) { + sec[ii] = sec[ii-1] + n_sel[ii-1]; + } + } +}; + +REGISTER_KERNEL_BUILDER(Name("DescrptSeA").Device(DEVICE_GPU), DescrptSeAOp); \ No newline at end of file diff --git a/source/op/descrpt_se_r_gpu.cc b/source/op/descrpt_se_r_gpu.cc new file mode 100644 index 0000000000..65e2682ef0 --- /dev/null +++ b/source/op/descrpt_se_r_gpu.cc @@ -0,0 +1,247 @@ +#include +#include +#include +#include +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/shape_inference.h" + +using namespace tensorflow; // 
NOLINT(build/namespaces) +#define MAGIC_NUMBER 256 + +#ifdef HIGH_PREC + typedef double VALUETYPE ; +#else + typedef float VALUETYPE ; +#endif + +typedef double compute_t; + +#define cudaErrcheck(res) { cudaAssert((res), __FILE__, __LINE__); } +inline void cudaAssert(cudaError_t code, const char *file, int line, bool abort=true) +{ + if (code != cudaSuccess) + { + fprintf(stderr,"cuda assert: %s %s %d\n", cudaGetErrorString(code), file, line); + if (abort) exit(code); + } +} + +// sec_a kao,sec_r, + +using GPUDevice = Eigen::GpuDevice; + +#ifdef HIGH_PREC +REGISTER_OP("DescrptSeR") + .Input("coord: double") + .Input("type: int32") + .Input("natoms: int32") + .Input("box: double") + .Input("mesh: int32") + .Input("davg: double") + .Input("dstd: double") + .Attr("rcut: float") + .Attr("rcut_smth: float") + .Attr("sel: list(int)") + .Output("descrpt: double") + .Output("descrpt_deriv: double") + .Output("rij: double") + .Output("nlist: int32"); +#else +REGISTER_OP("DescrptSeR") + .Input("coord: float") + .Input("type: int32") + .Input("natoms: int32") + .Input("box: float") + .Input("mesh: int32") + .Input("davg: float") + .Input("dstd: float") + .Attr("rcut: float") + .Attr("rcut_smth: float") + .Attr("sel: list(int)") + .Output("descrpt: float") + .Output("descrpt_deriv: float") + .Output("rij: float") + .Output("nlist: int32"); +#endif + +void DescrptSeRLauncher(const VALUETYPE* coord, + const int* type, + const int* ilist, + const int* jrange, + const int* jlist, + int* array_int, + unsigned long long* array_longlong, + compute_t* array_double, + const VALUETYPE* avg, + const VALUETYPE* std, + VALUETYPE* descript, + VALUETYPE* descript_deriv, + VALUETYPE* rij, + int* nlist, + const int& ntypes, + const int& nloc, + const int& nall, + const int& nnei, + const float& rcut, + const float& rcut_smth, + const int& ndescrpt, + const std::vector& sec, + const bool& fill_nei_a +); + +class DescrptSeROp : public OpKernel { +public: + explicit DescrptSeROp(OpKernelConstruction* context) : OpKernel(context) { + float nloc_f, nall_f; + OP_REQUIRES_OK(context, context->GetAttr("rcut", &rcut)); + OP_REQUIRES_OK(context, context->GetAttr("rcut_smth", &rcut_smth)); + OP_REQUIRES_OK(context, context->GetAttr("sel", &sel)); + cum_sum (sec, sel); + sel_null.resize(3, 0); + cum_sum (sec_null, sel_null); + ndescrpt = sec.back() * 1; + nnei = sec.back(); + fill_nei_a = true; + // count_nei_idx_overflow = 0; + // std::cout << "I'm in descrpt_se_r_gpu.cc" << std::endl; + } + + void Compute(OpKernelContext* context) override { + // Grab the input tensor + int context_input_index = 0; + const Tensor& coord_tensor = context->input(context_input_index++); + const Tensor& type_tensor = context->input(context_input_index++); + const Tensor& natoms_tensor = context->input(context_input_index++); + const Tensor& box_tensor = context->input(context_input_index++); + const Tensor& mesh_tensor = context->input(context_input_index++); + const Tensor& avg_tensor = context->input(context_input_index++); + const Tensor& std_tensor = context->input(context_input_index++); + // set size of the sample. 
assume 't' is [[[1, 1, 1], [2, 2, 2]], [[3, 3, 3], [4, 4, 4]]], then shape(t) ==> [2, 2, 3] + OP_REQUIRES (context, (coord_tensor.shape().dims() == 2), errors::InvalidArgument ("Dim of coord should be 2")); + OP_REQUIRES (context, (type_tensor.shape().dims() == 2), errors::InvalidArgument ("Dim of type should be 2")); + OP_REQUIRES (context, (natoms_tensor.shape().dims() == 1), errors::InvalidArgument ("Dim of natoms should be 1")); + OP_REQUIRES (context, (box_tensor.shape().dims() == 2), errors::InvalidArgument ("Dim of box should be 2")); + OP_REQUIRES (context, (mesh_tensor.shape().dims() == 1), errors::InvalidArgument ("Dim of mesh should be 1")); + OP_REQUIRES (context, (avg_tensor.shape().dims() == 2), errors::InvalidArgument ("Dim of avg should be 2")); + OP_REQUIRES (context, (std_tensor.shape().dims() == 2), errors::InvalidArgument ("Dim of std should be 2")); + OP_REQUIRES (context, (fill_nei_a), errors::InvalidArgument ("Rotational free descriptor only support the case rcut_a < 0")); + + OP_REQUIRES (context, (natoms_tensor.shape().dim_size(0) >= 3), errors::InvalidArgument ("number of atoms should be larger than (or equal to) 3")); + + int * natoms = new int[natoms_tensor.shape().dim_size(0)]; + cudaErrcheck(cudaMemcpy(natoms, natoms_tensor.flat().data(), sizeof(int) * natoms_tensor.shape().dim_size(0), cudaMemcpyDeviceToHost)); + int nloc = natoms[0]; + int nall = natoms[1]; + int ntypes = natoms_tensor.shape().dim_size(0) - 2; //nloc and nall mean something. + int nsamples = coord_tensor.shape().dim_size(0); + // + //// check the sizes + OP_REQUIRES (context, (nsamples == type_tensor.shape().dim_size(0)), errors::InvalidArgument ("number of samples should match")); + OP_REQUIRES (context, (nsamples == box_tensor.shape().dim_size(0)), errors::InvalidArgument ("number of samples should match")); + OP_REQUIRES (context, (ntypes == int(sel.size())), errors::InvalidArgument ("number of types should match the length of sel array")); + OP_REQUIRES (context, (ntypes == avg_tensor.shape().dim_size(0)), errors::InvalidArgument ("number of avg should be ntype")); + OP_REQUIRES (context, (ntypes == std_tensor.shape().dim_size(0)), errors::InvalidArgument ("number of std should be ntype")); + + OP_REQUIRES (context, (nall * 3 == coord_tensor.shape().dim_size(1)), errors::InvalidArgument ("number of atoms should match")); + OP_REQUIRES (context, (nall == type_tensor.shape().dim_size(1)), errors::InvalidArgument ("number of atoms should match")); + OP_REQUIRES (context, (9 == box_tensor.shape().dim_size(1)), errors::InvalidArgument ("number of box should be 9")); + OP_REQUIRES (context, (ndescrpt == avg_tensor.shape().dim_size(1)), errors::InvalidArgument ("number of avg should be ndescrpt")); + OP_REQUIRES (context, (ndescrpt == std_tensor.shape().dim_size(1)), errors::InvalidArgument ("number of std should be ndescrpt")); + + // Create output tensors + TensorShape descrpt_shape ; + descrpt_shape.AddDim (nsamples); + descrpt_shape.AddDim (nloc * ndescrpt); + TensorShape descrpt_deriv_shape ; + descrpt_deriv_shape.AddDim (nsamples); + descrpt_deriv_shape.AddDim (nloc * ndescrpt * 3); + TensorShape rij_shape ; + rij_shape.AddDim (nsamples); + rij_shape.AddDim (nloc * nnei * 3); + TensorShape nlist_shape ; + nlist_shape.AddDim (nsamples); + nlist_shape.AddDim (nloc * nnei); + + int context_output_index = 0; + Tensor* descrpt_tensor = NULL; + OP_REQUIRES_OK(context, context->allocate_output(context_output_index++, + descrpt_shape, + &descrpt_tensor)); + Tensor* descrpt_deriv_tensor = NULL; + 
OP_REQUIRES_OK(context, context->allocate_output(context_output_index++, + descrpt_deriv_shape, + &descrpt_deriv_tensor)); + Tensor* rij_tensor = NULL; + OP_REQUIRES_OK(context, context->allocate_output(context_output_index++, + rij_shape, + &rij_tensor)); + Tensor* nlist_tensor = NULL; + OP_REQUIRES_OK(context, context->allocate_output(context_output_index++, + nlist_shape, + &nlist_tensor)); + + int * ilist = NULL, *jrange = NULL, *jlist = NULL; + int *array_int = NULL; unsigned long long *array_longlong = NULL; compute_t *array_double = NULL; + cudaErrcheck(cudaMemcpy(&(ilist), 4 + mesh_tensor.flat().data(), sizeof(int *), cudaMemcpyDeviceToHost)); + cudaErrcheck(cudaMemcpy(&(jrange), 8 + mesh_tensor.flat().data(), sizeof(int *), cudaMemcpyDeviceToHost)); + cudaErrcheck(cudaMemcpy(&(jlist), 12 + mesh_tensor.flat().data(), sizeof(int *), cudaMemcpyDeviceToHost)); + cudaErrcheck(cudaMemcpy(&(array_int), 16 + mesh_tensor.flat().data(), sizeof(int *), cudaMemcpyDeviceToHost)); + cudaErrcheck(cudaMemcpy(&(array_longlong), 20 + mesh_tensor.flat().data(), sizeof(unsigned long long *), cudaMemcpyDeviceToHost)); + cudaErrcheck(cudaMemcpy(&(array_double), 24 + mesh_tensor.flat().data(), sizeof(compute_t *), cudaMemcpyDeviceToHost)); + + // cudaErrcheck(cudaMemcpy(jlist, host_jlist, sizeof(int) * nloc * MAGIC_NUMBER, cudaMemcpyHostToDevice)); + // Launch computation + for (int II = 0; II < nsamples; II++) { + DescrptSeRLauncher( coord_tensor.matrix().data() + II * (nall * 3), + type_tensor.matrix().data() + II * nall, + ilist, + jrange, + jlist, + array_int, + array_longlong, + array_double, + avg_tensor.matrix().data(), + std_tensor.matrix().data(), + descrpt_tensor->matrix().data() + II * (nloc * ndescrpt), + descrpt_deriv_tensor->matrix().data() + II * (nloc * ndescrpt * 3), + rij_tensor->matrix().data() + II * (nloc * nnei * 3), + nlist_tensor->matrix().data() + II * (nloc * nnei), + ntypes, + nloc, + nall, + nnei, + rcut, + rcut_smth, + ndescrpt, + sec, + fill_nei_a + ); + } + // std::cout << "done" << std::endl; + delete[] natoms; + } + +///////////////////////////////////////////////////////////////////////////////////////////// + +private: + float rcut; + float rcut_smth; + std::vector sel; + std::vector sel_null; + std::vector sec; + std::vector sec_null; + int nnei, ndescrpt, nloc, nall; + bool fill_nei_a; + + //private func + void cum_sum (std::vector & sec, const std::vector & n_sel) const { + sec.resize (n_sel.size() + 1); + sec[0] = 0; + for (int ii = 1; ii < sec.size(); ++ii) { + sec[ii] = sec[ii-1] + n_sel[ii-1]; + } + } +}; + +REGISTER_KERNEL_BUILDER(Name("DescrptSeR").Device(DEVICE_GPU), DescrptSeROp); \ No newline at end of file diff --git a/source/op/prod_force_se_a_gpu.cc b/source/op/prod_force_se_a_gpu.cc new file mode 100644 index 0000000000..2d159a8505 --- /dev/null +++ b/source/op/prod_force_se_a_gpu.cc @@ -0,0 +1,139 @@ +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/shape_inference.h" +#include +#include + +using namespace tensorflow; + +#define cudaErrcheck(res) { cudaAssert((res), __FILE__, __LINE__); } +inline void cudaAssert(cudaError_t code, const char *file, int line, bool abort=true) +{ + if (code != cudaSuccess) + { + fprintf(stderr,"cuda assert: %s %s %d\n", cudaGetErrorString(code), file, line); + if (abort) exit(code); + } +} + +#ifdef HIGH_PREC +typedef double VALUETYPE; +#else +typedef float VALUETYPE; +#endif + +#ifdef HIGH_PREC +REGISTER_OP("ProdForceSeA") + 
.Input("net_deriv: double") + .Input("in_deriv: double") + .Input("nlist: int32") + .Input("natoms: int32") + .Attr("n_a_sel: int") + .Attr("n_r_sel: int") + .Output("force: double"); +#else +REGISTER_OP("ProdForceSeA") + .Input("net_deriv: float") + .Input("in_deriv: float") + .Input("nlist: int32") + .Input("natoms: int32") + .Attr("n_a_sel: int") + .Attr("n_r_sel: int") + .Output("force: float"); +#endif + +void ProdForceSeALauncher(VALUETYPE * force, + const VALUETYPE * net_deriv, + const VALUETYPE * in_deriv, + const int * nlist, + const int nloc, + const int nall, + const int ndescrpt, + const int nnei, + const int n_a_sel, + const int n_a_shift); + +class ProdForceSeAOp : public OpKernel { +public: + explicit ProdForceSeAOp(OpKernelConstruction* context) : OpKernel(context) { + OP_REQUIRES_OK(context, context->GetAttr("n_a_sel", &n_a_sel)); + OP_REQUIRES_OK(context, context->GetAttr("n_r_sel", &n_r_sel)); + n_a_shift = n_a_sel * 4; + } + + void Compute(OpKernelContext* context) override { + // Grab the input tensor + int context_input_index = 0; + const Tensor& net_deriv_tensor = context->input(context_input_index++); + const Tensor& in_deriv_tensor = context->input(context_input_index++); + const Tensor& nlist_tensor = context->input(context_input_index++); + const Tensor& natoms_tensor = context->input(context_input_index++); + + // set size of the sample + OP_REQUIRES (context, (net_deriv_tensor.shape().dims() == 2), errors::InvalidArgument ("Dim of net deriv should be 2")); + OP_REQUIRES (context, (in_deriv_tensor.shape().dims() == 2), errors::InvalidArgument ("Dim of input deriv should be 2")); + OP_REQUIRES (context, (nlist_tensor.shape().dims() == 2), errors::InvalidArgument ("Dim of nlist should be 2")); + OP_REQUIRES (context, (natoms_tensor.shape().dims() == 1), errors::InvalidArgument ("Dim of natoms should be 1")); + + OP_REQUIRES (context, (natoms_tensor.shape().dim_size(0) >= 3), errors::InvalidArgument ("number of atoms should be larger than (or equal to) 3")); + int * natoms = new int[natoms_tensor.shape().dim_size(0)]; + cudaErrcheck(cudaMemcpy(natoms, natoms_tensor.flat().data(), sizeof(int) * natoms_tensor.shape().dim_size(0), cudaMemcpyDeviceToHost)); + int nloc = natoms[0]; + int nall = natoms[1]; + int nframes = net_deriv_tensor.shape().dim_size(0); + int ndescrpt = net_deriv_tensor.shape().dim_size(1) / nloc; + int nnei = nlist_tensor.shape().dim_size(1) / nloc; + + // check the sizes + OP_REQUIRES (context, (nframes == in_deriv_tensor.shape().dim_size(0)), errors::InvalidArgument ("number of samples should match")); + OP_REQUIRES (context, (nframes == nlist_tensor.shape().dim_size(0)), errors::InvalidArgument ("number of samples should match")); + + OP_REQUIRES (context, (nloc * ndescrpt * 3 == in_deriv_tensor.shape().dim_size(1)), errors::InvalidArgument ("number of descriptors should match")); + OP_REQUIRES (context, (nnei == n_a_sel + n_r_sel), errors::InvalidArgument ("number of neighbors should match")); + OP_REQUIRES (context, (0 == n_r_sel), errors::InvalidArgument ("Rotational free only support all-angular information")); + + // Create an output tensor + TensorShape force_shape ; + force_shape.AddDim (nframes); + force_shape.AddDim (3 * nall); + Tensor* force_tensor = NULL; + int context_output_index = 0; + OP_REQUIRES_OK(context, context->allocate_output(context_output_index++, + force_shape, &force_tensor)); + + // flat the tensors + auto net_deriv = net_deriv_tensor.flat(); + auto in_deriv = in_deriv_tensor.flat(); + auto nlist = 
nlist_tensor.flat(); + auto force = force_tensor->flat(); + + assert (nframes == force_shape.dim_size(0)); + assert (nframes == net_deriv_tensor.shape().dim_size(0)); + assert (nframes == in_deriv_tensor.shape().dim_size(0)); + assert (nframes == nlist_tensor.shape().dim_size(0)); + assert (nall * 3 == force_shape.dim_size(1)); + assert (nloc * ndescrpt == net_deriv_tensor.shape().dim_size(1)); + assert (nloc * ndescrpt * 3 == in_deriv_tensor.shape().dim_size(1)); + assert (nloc * nnei == nlist_tensor.shape().dim_size(1)); + assert (nnei * 4 == ndescrpt); + + for (int II = 0; II < nframes; II++) { + ProdForceSeALauncher(force_tensor->flat().data() + II * (nall * 3), + net_deriv_tensor.flat().data() + II * (nloc * ndescrpt), + in_deriv_tensor.flat().data() + II * (nloc * ndescrpt * 3), + nlist_tensor.flat().data() + II * (nloc * nnei), + nloc, + nall, + ndescrpt, + nnei, + n_a_sel, + n_a_shift + ); + } + delete[] natoms; + } +private: + int n_r_sel, n_a_sel, n_a_shift; +}; + +REGISTER_KERNEL_BUILDER(Name("ProdForceSeA").Device(DEVICE_GPU), ProdForceSeAOp); \ No newline at end of file diff --git a/source/op/prod_force_se_r_gpu.cc b/source/op/prod_force_se_r_gpu.cc new file mode 100644 index 0000000000..3dc0bc7853 --- /dev/null +++ b/source/op/prod_force_se_r_gpu.cc @@ -0,0 +1,133 @@ +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/shape_inference.h" +#include +#include + +using namespace tensorflow; + +#define cudaErrcheck(res) { cudaAssert((res), __FILE__, __LINE__); } +inline void cudaAssert(cudaError_t code, const char *file, int line, bool abort=true) +{ + if (code != cudaSuccess) + { + fprintf(stderr,"cuda assert: %s %s %d\n", cudaGetErrorString(code), file, line); + if (abort) exit(code); + } +} + +#ifdef HIGH_PREC + typedef double VALUETYPE; +#else + typedef float VALUETYPE; +#endif + +#ifdef HIGH_PREC +REGISTER_OP("ProdForceSeR") + .Input("net_deriv: double") + .Input("in_deriv: double") + .Input("nlist: int32") + .Input("natoms: int32") + .Output("force: double"); +#else +REGISTER_OP("ProdForceSeR") + .Input("net_deriv: float") + .Input("in_deriv: float") + .Input("nlist: int32") + .Input("natoms: int32") + .Output("force: float"); +#endif + +void ProdForceSeRLauncher(VALUETYPE * force, + const VALUETYPE * net_deriv, + const VALUETYPE * in_deriv, + const int * nlist, + const int nloc, + const int nall, + const int ndescrpt, + const int nnei, + const int n_a_sel, + const int n_a_shift); + +class ProdForceSeROp : public OpKernel { +public: + explicit ProdForceSeROp(OpKernelConstruction* context) : OpKernel(context) { + // std::cout << "I'm in prod_force_se_r_gpu.cc" << std::endl; + } + + void Compute(OpKernelContext* context) override { + // Grab the input tensor + int context_input_index = 0; + const Tensor& net_deriv_tensor = context->input(context_input_index++); + const Tensor& in_deriv_tensor = context->input(context_input_index++); + const Tensor& nlist_tensor = context->input(context_input_index++); + const Tensor& natoms_tensor = context->input(context_input_index++); + + // set size of the sample + OP_REQUIRES (context, (net_deriv_tensor.shape().dims() == 2), errors::InvalidArgument ("Dim of net deriv should be 2")); + OP_REQUIRES (context, (in_deriv_tensor.shape().dims() == 2), errors::InvalidArgument ("Dim of input deriv should be 2")); + OP_REQUIRES (context, (nlist_tensor.shape().dims() == 2), errors::InvalidArgument ("Dim of nlist should be 2")); + OP_REQUIRES (context, 
(natoms_tensor.shape().dims() == 1), errors::InvalidArgument ("Dim of natoms should be 1")); + + OP_REQUIRES (context, (natoms_tensor.shape().dim_size(0) >= 3), errors::InvalidArgument ("number of atoms should be larger than (or equal to) 3")); + int * natoms = new int[natoms_tensor.shape().dim_size(0)]; + cudaErrcheck(cudaMemcpy(natoms, natoms_tensor.flat().data(), sizeof(int) * natoms_tensor.shape().dim_size(0), cudaMemcpyDeviceToHost)); + int nloc = natoms[0]; + int nall = natoms[1]; + int nframes = net_deriv_tensor.shape().dim_size(0); + int ndescrpt = net_deriv_tensor.shape().dim_size(1) / nloc; + int nnei = nlist_tensor.shape().dim_size(1) / nloc; + + // check the sizes + OP_REQUIRES (context, (nframes == in_deriv_tensor.shape().dim_size(0)), errors::InvalidArgument ("number of samples should match")); + OP_REQUIRES (context, (nframes == nlist_tensor.shape().dim_size(0)), errors::InvalidArgument ("number of samples should match")); + + OP_REQUIRES (context, (nloc * ndescrpt * 3 == in_deriv_tensor.shape().dim_size(1)), errors::InvalidArgument ("number of descriptors should match")); + // OP_REQUIRES (context, (nnei == n_a_sel + n_r_sel), errors::InvalidArgument ("number of neighbors should match")); + // OP_REQUIRES (context, (0 == n_r_sel), errors::InvalidArgument ("Rotational free only support all-angular information")); + + // Create an output tensor + TensorShape force_shape; + force_shape.AddDim (nframes); + force_shape.AddDim (3 * nall); + Tensor* force_tensor = NULL; + int context_output_index = 0; + OP_REQUIRES_OK(context, context->allocate_output(context_output_index++, + force_shape, &force_tensor)); + + // flat the tensors + auto net_deriv = net_deriv_tensor.flat(); + auto in_deriv = in_deriv_tensor.flat(); + auto nlist = nlist_tensor.flat(); + auto force = force_tensor->flat(); + + assert (nframes == force_shape.dim_size(0)); + assert (nframes == net_deriv_tensor.shape().dim_size(0)); + assert (nframes == in_deriv_tensor.shape().dim_size(0)); + assert (nframes == nlist_tensor.shape().dim_size(0)); + assert (nall * 3 == force_shape.dim_size(1)); + assert (nloc * ndescrpt == net_deriv_tensor.shape().dim_size(1)); + assert (nloc * ndescrpt * 3 == in_deriv_tensor.shape().dim_size(1)); + assert (nloc * nnei == nlist_tensor.shape().dim_size(1)); + assert (nnei * 4 == ndescrpt); + + for (int II = 0; II < nframes; II++) { + ProdForceSeRLauncher(force_tensor->flat().data() + II * (nall * 3), + net_deriv_tensor.flat().data() + II * (nloc * ndescrpt), + in_deriv_tensor.flat().data() + II * (nloc * ndescrpt * 3), + nlist_tensor.flat().data() + II * (nloc * nnei), + nloc, + nall, + ndescrpt, + nnei, + n_a_sel, + n_a_shift + ); + } + delete[] natoms; + } +private: + int n_r_sel, n_a_sel, n_a_shift; +}; + +REGISTER_KERNEL_BUILDER(Name("ProdForceSeR").Device(DEVICE_GPU), ProdForceSeROp); \ No newline at end of file diff --git a/source/op/prod_virial_se_a_gpu.cc b/source/op/prod_virial_se_a_gpu.cc new file mode 100644 index 0000000000..42f70d06d2 --- /dev/null +++ b/source/op/prod_virial_se_a_gpu.cc @@ -0,0 +1,145 @@ +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/shape_inference.h" +#include +#include + +#ifdef HIGH_PREC +typedef double VALUETYPE; +#else +typedef float VALUETYPE; +#endif + +#ifdef HIGH_PREC +REGISTER_OP("ProdVirialSeA") + .Input("net_deriv: double") + .Input("in_deriv: double") + .Input("rij: double") + .Input("nlist: int32") + .Input("natoms: int32") + .Attr("n_a_sel: int") + 
.Attr("n_r_sel: int") + .Output("virial: double") + .Output("atom_virial: double"); +#else +REGISTER_OP("ProdVirialSeA") + .Input("net_deriv: float") + .Input("in_deriv: float") + .Input("rij: float") + .Input("nlist: int32") + .Input("natoms: int32") + .Attr("n_a_sel: int") + .Attr("n_r_sel: int") + .Output("virial: float") + .Output("atom_virial: float"); +#endif + +using namespace tensorflow; + +#define cudaErrcheck(res) { cudaAssert((res), __FILE__, __LINE__); } +inline void cudaAssert(cudaError_t code, const char *file, int line, bool abort=true) +{ + if (code != cudaSuccess) + { + fprintf(stderr,"cuda assert: %s %s %d\n", cudaGetErrorString(code), file, line); + if (abort) exit(code); + } +} + +void ProdVirialSeALauncher(VALUETYPE * virial, + VALUETYPE * atom_virial, + const VALUETYPE * net_deriv, + const VALUETYPE * in_deriv, + const VALUETYPE * rij, + const int * nlist, + const int nloc, + const int nall, + const int nnei, + const int ndescrpt, + const int n_a_sel, + const int n_a_shift); + +class ProdVirialSeAOp : public OpKernel { + public: + explicit ProdVirialSeAOp(OpKernelConstruction* context) : OpKernel(context) { + OP_REQUIRES_OK(context, context->GetAttr("n_a_sel", &n_a_sel)); + OP_REQUIRES_OK(context, context->GetAttr("n_r_sel", &n_r_sel)); + n_a_shift = n_a_sel * 4; + } + + void Compute(OpKernelContext* context) override { + // Grab the input tensor + int context_input_index = 0; + const Tensor& net_deriv_tensor = context->input(context_input_index++); + const Tensor& in_deriv_tensor = context->input(context_input_index++); + const Tensor& rij_tensor = context->input(context_input_index++); + const Tensor& nlist_tensor = context->input(context_input_index++); + const Tensor& natoms_tensor = context->input(context_input_index++); + + // set size of the sample + OP_REQUIRES (context, (net_deriv_tensor.shape().dims() == 2), errors::InvalidArgument ("Dim of net deriv should be 2")); + OP_REQUIRES (context, (in_deriv_tensor.shape().dims() == 2), errors::InvalidArgument ("Dim of input deriv should be 2")); + OP_REQUIRES (context, (rij_tensor.shape().dims() == 2), errors::InvalidArgument ("Dim of rij should be 2")); + OP_REQUIRES (context, (nlist_tensor.shape().dims() == 2), errors::InvalidArgument ("Dim of nlist should be 2")); + OP_REQUIRES (context, (natoms_tensor.shape().dims() == 1), errors::InvalidArgument ("Dim of natoms should be 1")); + + OP_REQUIRES (context, (natoms_tensor.shape().dim_size(0) >= 3), errors::InvalidArgument ("number of atoms should be larger than (or equal to) 3")); + int * natoms = new int[natoms_tensor.shape().dim_size(0)]; + cudaErrcheck(cudaMemcpy(natoms, natoms_tensor.flat().data(), sizeof(int) * natoms_tensor.shape().dim_size(0), cudaMemcpyDeviceToHost)); + int nloc = natoms[0]; + int nall = natoms[1]; + int nnei = nlist_tensor.shape().dim_size(1) / nloc; + int nframes = net_deriv_tensor.shape().dim_size(0); + int ndescrpt = net_deriv_tensor.shape().dim_size(1) / nloc; + + // check the sizes + OP_REQUIRES (context, (nframes == in_deriv_tensor.shape().dim_size(0)), errors::InvalidArgument ("number of samples should match")); + OP_REQUIRES (context, (nframes == rij_tensor.shape().dim_size(0)), errors::InvalidArgument ("number of samples should match")); + OP_REQUIRES (context, (nframes == nlist_tensor.shape().dim_size(0)), errors::InvalidArgument ("number of samples should match")); + + OP_REQUIRES (context, (nloc * ndescrpt * 3 == in_deriv_tensor.shape().dim_size(1)), errors::InvalidArgument ("number of descriptors should match")); + OP_REQUIRES 
(context, (nloc * nnei * 3 == rij_tensor.shape().dim_size(1)), errors::InvalidArgument ("dim of rij should be nnei * 3")); + OP_REQUIRES (context, (nnei == n_a_sel + n_r_sel), errors::InvalidArgument ("number of neighbors should match")); + + // Create an output tensor + TensorShape virial_shape ; + virial_shape.AddDim (nframes); + virial_shape.AddDim (9); + Tensor* virial_tensor = NULL; + OP_REQUIRES_OK(context, context->allocate_output(0, virial_shape, &virial_tensor)); + TensorShape atom_virial_shape; + atom_virial_shape.AddDim (nframes); + atom_virial_shape.AddDim (9 * nall); + Tensor* atom_virial_tensor = NULL; + OP_REQUIRES_OK(context, context->allocate_output(1, atom_virial_shape, &atom_virial_tensor)); + + // flat the tensors + auto net_deriv = net_deriv_tensor.flat(); + auto in_deriv = in_deriv_tensor.flat(); + auto rij = rij_tensor.flat(); + auto nlist = nlist_tensor.flat(); + auto virial = virial_tensor->flat(); + auto atom_virial = atom_virial_tensor->flat(); + + for (int II = 0; II < nframes; II++) { + ProdVirialSeALauncher(virial_tensor->flat().data() + II * 9, + atom_virial_tensor->flat().data() + II * (nall * 9), + net_deriv_tensor.flat().data() + II * (nloc * ndescrpt), + in_deriv_tensor.flat().data() + II * (nloc * ndescrpt * 3), + rij_tensor.flat().data() + II * (nloc * nnei * 3), + nlist_tensor.flat().data() + II * (nloc * nnei), + nloc, + nall, + nnei, + ndescrpt, + n_a_sel, + n_a_shift + ); + } + delete[] natoms; + } +private: + int n_r_sel, n_a_sel, n_a_shift; +}; + +REGISTER_KERNEL_BUILDER(Name("ProdVirialSeA").Device(DEVICE_GPU), ProdVirialSeAOp); \ No newline at end of file diff --git a/source/op/prod_virial_se_r_gpu.cc b/source/op/prod_virial_se_r_gpu.cc new file mode 100644 index 0000000000..91f965b72c --- /dev/null +++ b/source/op/prod_virial_se_r_gpu.cc @@ -0,0 +1,137 @@ +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/shape_inference.h" +#include +#include + +#ifdef HIGH_PREC + typedef double VALUETYPE; +#else + typedef float VALUETYPE; +#endif + +#ifdef HIGH_PREC +REGISTER_OP("ProdVirialSeR") + .Input("net_deriv: double") + .Input("in_deriv: double") + .Input("rij: double") + .Input("nlist: int32") + .Input("natoms: int32") + .Output("virial: double") + .Output("atom_virial: double"); +#else +REGISTER_OP("ProdVirialSeR") + .Input("net_deriv: float") + .Input("in_deriv: float") + .Input("rij: float") + .Input("nlist: int32") + .Input("natoms: int32") + .Output("virial: float") + .Output("atom_virial: float"); +#endif + +using namespace tensorflow; + +#define cudaErrcheck(res) { cudaAssert((res), __FILE__, __LINE__); } +inline void cudaAssert(cudaError_t code, const char *file, int line, bool abort=true) +{ + if (code != cudaSuccess) + { + fprintf(stderr,"cuda assert: %s %s %d\n", cudaGetErrorString(code), file, line); + if (abort) exit(code); + } +} + +void ProdVirialSeRLauncher(VALUETYPE * virial, + VALUETYPE * atom_virial, + const VALUETYPE * net_deriv, + const VALUETYPE * in_deriv, + const VALUETYPE * rij, + const int * nlist, + const int nloc, + const int nall, + const int nnei, + const int ndescrpt, + const int n_a_sel, + const int n_a_shift); + +class ProdVirialSeROp : public OpKernel { + public: + explicit ProdVirialSeROp(OpKernelConstruction* context) : OpKernel(context) { + // std::cout << "I'm in prod_virial_se_r_gpu.cc" << std::endl; + } + + void Compute(OpKernelContext* context) override { + // Grab the input tensor + int context_input_index = 0; + const Tensor& 
net_deriv_tensor = context->input(context_input_index++); + const Tensor& in_deriv_tensor = context->input(context_input_index++); + const Tensor& rij_tensor = context->input(context_input_index++); + const Tensor& nlist_tensor = context->input(context_input_index++); + const Tensor& natoms_tensor = context->input(context_input_index++); + // set size of the sample + OP_REQUIRES (context, (net_deriv_tensor.shape().dims() == 2), errors::InvalidArgument ("Dim of net deriv should be 2")); + OP_REQUIRES (context, (in_deriv_tensor.shape().dims() == 2), errors::InvalidArgument ("Dim of input deriv should be 2")); + OP_REQUIRES (context, (rij_tensor.shape().dims() == 2), errors::InvalidArgument ("Dim of rij should be 2")); + OP_REQUIRES (context, (nlist_tensor.shape().dims() == 2), errors::InvalidArgument ("Dim of nlist should be 2")); + OP_REQUIRES (context, (natoms_tensor.shape().dims() == 1), errors::InvalidArgument ("Dim of natoms should be 1")); + cudaErrcheck(cudaDeviceSynchronize()); + OP_REQUIRES (context, (natoms_tensor.shape().dim_size(0) >= 3), errors::InvalidArgument ("number of atoms should be larger than (or equal to) 3")); + int * natoms = new int[natoms_tensor.shape().dim_size(0)]; + cudaErrcheck(cudaMemcpy(natoms, natoms_tensor.flat().data(), sizeof(int) * natoms_tensor.shape().dim_size(0), cudaMemcpyDeviceToHost)); + int nloc = natoms[0]; + int nall = natoms[1]; + int nnei = nlist_tensor.shape().dim_size(1) / nloc; + int nframes = net_deriv_tensor.shape().dim_size(0); + int ndescrpt = net_deriv_tensor.shape().dim_size(1) / nloc; + + // check the sizes + OP_REQUIRES (context, (nframes == in_deriv_tensor.shape().dim_size(0)), errors::InvalidArgument ("number of samples should match")); + OP_REQUIRES (context, (nframes == rij_tensor.shape().dim_size(0)), errors::InvalidArgument ("number of samples should match")); + OP_REQUIRES (context, (nframes == nlist_tensor.shape().dim_size(0)), errors::InvalidArgument ("number of samples should match")); + + OP_REQUIRES (context, (nloc * ndescrpt * 3 == in_deriv_tensor.shape().dim_size(1)), errors::InvalidArgument ("number of descriptors should match")); + OP_REQUIRES (context, (nloc * nnei * 3 == rij_tensor.shape().dim_size(1)), errors::InvalidArgument ("dim of rij should be nnei * 3")); + + // Create an output tensor + TensorShape virial_shape; + virial_shape.AddDim (nframes); + virial_shape.AddDim (9); + Tensor* virial_tensor = NULL; + OP_REQUIRES_OK(context, context->allocate_output(0, virial_shape, &virial_tensor)); + TensorShape atom_virial_shape ; + atom_virial_shape.AddDim (nframes); + atom_virial_shape.AddDim (9 * nall); + Tensor* atom_virial_tensor = NULL; + OP_REQUIRES_OK(context, context->allocate_output(1, atom_virial_shape, &atom_virial_tensor)); + + // flat the tensors + auto net_deriv = net_deriv_tensor.flat(); + auto in_deriv = in_deriv_tensor.flat(); + auto rij = rij_tensor.flat(); + auto nlist = nlist_tensor.flat(); + auto virial = virial_tensor->flat(); + auto atom_virial = atom_virial_tensor->flat(); + + for (int II = 0; II < nframes; II++) { + ProdVirialSeRLauncher(virial_tensor->flat().data() + II * 9, + atom_virial_tensor->flat().data() + II * (nall * 9), + net_deriv_tensor.flat().data() + II * (nloc * ndescrpt), + in_deriv_tensor.flat().data() + II * (nloc * ndescrpt * 3), + rij_tensor.flat().data() + II * (nloc * nnei * 3), + nlist_tensor.flat().data() + II * (nloc * nnei), + nloc, + nall, + nnei, + ndescrpt, + n_a_sel, + n_a_shift + ); + } + delete[] natoms; + } +private: + int n_r_sel, n_a_sel, n_a_shift; +}; + 
+REGISTER_KERNEL_BUILDER(Name("ProdVirialSeR").Device(DEVICE_GPU), ProdVirialSeROp); \ No newline at end of file diff --git a/source/scripts/freeze.py b/source/scripts/freeze.py index 62a7f4559f..2420251744 100755 --- a/source/scripts/freeze.py +++ b/source/scripts/freeze.py @@ -44,6 +44,8 @@ def _make_node_names(model_type = None) : nodes = "o_dipole,descrpt_attr/rcut,descrpt_attr/ntypes,model_attr/tmap,model_attr/sel_type,model_attr/model_type" elif model_type == 'polar': nodes = "o_polar,descrpt_attr/rcut,descrpt_attr/ntypes,model_attr/tmap,model_attr/sel_type,model_attr/model_type" + elif model_type == 'global_polar': + nodes = "o_global_polar,descrpt_attr/rcut,descrpt_attr/ntypes,model_attr/tmap,model_attr/sel_type,model_attr/model_type" else: raise RuntimeError('unknow model type ' + model_type) return nodes diff --git a/source/train/CMakeLists.txt b/source/train/CMakeLists.txt index 83f15df4c3..48164f8a9c 100644 --- a/source/train/CMakeLists.txt +++ b/source/train/CMakeLists.txt @@ -2,7 +2,7 @@ configure_file("RunOptions.py.in" "${CMAKE_CURRENT_BINARY_DIR}/RunOptions.py" @ONLY) -file(GLOB LIB_PY main.py common.py env.py compat.py Network.py Deep*.py Data.py DataSystem.py Model*.py Descrpt*.py Fitting.py Loss.py LearningRate.py Trainer.py TabInter.py ${CMAKE_CURRENT_BINARY_DIR}/RunOptions.py) +file(GLOB LIB_PY main.py common.py env.py compat.py calculator.py Network.py Deep*.py Data.py DataSystem.py Model*.py Descrpt*.py Fitting.py Loss.py LearningRate.py Trainer.py TabInter.py ${CMAKE_CURRENT_BINARY_DIR}/RunOptions.py) file(GLOB CLS_PY Local.py Slurm.py) diff --git a/source/train/DeepEval.py b/source/train/DeepEval.py index f78699b6f8..3868f6e258 100644 --- a/source/train/DeepEval.py +++ b/source/train/DeepEval.py @@ -149,7 +149,8 @@ def get_sel_type(self): def eval(self, coords, cells, - atom_types) : + atom_types, + atomic = True) : # standarize the shape of inputs coords = np.array(coords) cells = np.array(cells) @@ -183,10 +184,13 @@ def eval(self, tensor.append(v_out[0]) # reverse map of the outputs - tensor = np.array(tensor) - tensor = self.reverse_map(np.reshape(tensor, [nframes,-1,self.variable_dof]), sel_imap) - - tensor = np.reshape(tensor, [nframes, len(sel_at), self.variable_dof]) + if atomic: + tensor = np.array(tensor) + tensor = self.reverse_map(np.reshape(tensor, [nframes,-1,self.variable_dof]), sel_imap) + tensor = np.reshape(tensor, [nframes, len(sel_at), self.variable_dof]) + else: + tensor = np.reshape(tensor, [nframes, self.variable_dof]) + return tensor diff --git a/source/train/DeepPolar.py b/source/train/DeepPolar.py index 0ec37876ca..3af499dd07 100644 --- a/source/train/DeepPolar.py +++ b/source/train/DeepPolar.py @@ -9,3 +9,14 @@ def __init__(self, model_file) : DeepTensor.__init__(self, model_file, 'polar', 9) + +class DeepGlobalPolar (DeepTensor) : + def __init__(self, + model_file) : + DeepTensor.__init__(self, model_file, 'global_polar', 9) + + def eval(self, + coords, + cells, + atom_types) : + return DeepTensor.eval(self, coords, cells, atom_types, atomic = False) diff --git a/source/train/DescrptSeA.py b/source/train/DescrptSeA.py index aa5221d7c7..527dff2cce 100644 --- a/source/train/DescrptSeA.py +++ b/source/train/DescrptSeA.py @@ -28,6 +28,7 @@ def __init__ (self, jdata): .add('neuron', list, default = [10, 20, 40]) \ .add('axis_neuron', int, default = 4, alias = 'n_axis_neuron') \ .add('resnet_dt',bool, default = False) \ + .add('trainable',bool, default = True) \ .add('seed', int) class_data = args.parse(jdata) self.sel_a = class_data['sel'] 
@@ -37,6 +38,7 @@ def __init__ (self, jdata): self.n_axis_neuron = class_data['axis_neuron'] self.filter_resnet_dt = class_data['resnet_dt'] self.seed = class_data['seed'] + self.trainable = class_data['trainable'] # descrpt config self.sel_r = [ 0 for ii in range(len(self.sel_a)) ] @@ -167,7 +169,7 @@ def build (self, self.descrpt_reshape = tf.reshape(self.descrpt, [-1, self.ndescrpt]) - self.dout, self.qmat = self._pass_filter(self.descrpt_reshape, natoms, suffix = suffix, reuse = reuse) + self.dout, self.qmat = self._pass_filter(self.descrpt_reshape, natoms, suffix = suffix, reuse = reuse, trainable = self.trainable) return self.dout @@ -201,7 +203,8 @@ def _pass_filter(self, inputs, natoms, reuse = None, - suffix = '') : + suffix = '', + trainable = True) : start_index = 0 inputs = tf.reshape(inputs, [-1, self.ndescrpt * natoms[0]]) shape = inputs.get_shape().as_list() @@ -212,7 +215,7 @@ def _pass_filter(self, [ 0, start_index* self.ndescrpt], [-1, natoms[2+type_i]* self.ndescrpt] ) inputs_i = tf.reshape(inputs_i, [-1, self.ndescrpt]) - layer, qmat = self._filter(inputs_i, name='filter_type_'+str(type_i)+suffix, natoms=natoms, reuse=reuse, seed = self.seed) + layer, qmat = self._filter(inputs_i, name='filter_type_'+str(type_i)+suffix, natoms=natoms, reuse=reuse, seed = self.seed, trainable = trainable) layer = tf.reshape(layer, [tf.shape(inputs)[0], natoms[2+type_i] * self.get_dim_out()]) qmat = tf.reshape(qmat, [tf.shape(inputs)[0], natoms[2+type_i] * self.get_dim_rot_mat_1() * 3]) output.append(layer) @@ -297,7 +300,8 @@ def _filter(self, bavg=0.0, name='linear', reuse=None, - seed=None): + seed=None, + trainable = True): # natom x (nei x 4) shape = inputs.get_shape().as_list() outputs_size = [1] + self.filter_neuron @@ -320,16 +324,19 @@ def _filter(self, w = tf.get_variable('matrix_'+str(ii)+'_'+str(type_i), [outputs_size[ii - 1], outputs_size[ii]], global_tf_float_precision, - tf.random_normal_initializer(stddev=stddev/np.sqrt(outputs_size[ii]+outputs_size[ii-1]), seed = seed)) + tf.random_normal_initializer(stddev=stddev/np.sqrt(outputs_size[ii]+outputs_size[ii-1]), seed = seed), + trainable = trainable) b = tf.get_variable('bias_'+str(ii)+'_'+str(type_i), [1, outputs_size[ii]], global_tf_float_precision, - tf.random_normal_initializer(stddev=stddev, mean = bavg, seed = seed)) + tf.random_normal_initializer(stddev=stddev, mean = bavg, seed = seed), + trainable = trainable) if self.filter_resnet_dt : idt = tf.get_variable('idt_'+str(ii)+'_'+str(type_i), [1, outputs_size[ii]], global_tf_float_precision, - tf.random_normal_initializer(stddev=0.001, mean = 1.0, seed = seed)) + tf.random_normal_initializer(stddev=0.001, mean = 1.0, seed = seed), + trainable = trainable) if outputs_size[ii] == outputs_size[ii-1]: if self.filter_resnet_dt : xyz_scatter += activation_fn(tf.matmul(xyz_scatter, w) + b) * idt @@ -376,7 +383,8 @@ def _filter_type_ext(self, bavg=0.0, name='linear', reuse=None, - seed=None): + seed=None, + trainable = True): # natom x (nei x 4) shape = inputs.get_shape().as_list() outputs_size = [1] + self.filter_neuron @@ -401,16 +409,19 @@ def _filter_type_ext(self, w = tf.get_variable('matrix_'+str(ii)+'_'+str(type_i), [outputs_size[ii - 1], outputs_size[ii]], global_tf_float_precision, - tf.random_normal_initializer(stddev=stddev/np.sqrt(outputs_size[ii]+outputs_size[ii-1]), seed = seed)) + tf.random_normal_initializer(stddev=stddev/np.sqrt(outputs_size[ii]+outputs_size[ii-1]), seed = seed), + trainable = trainable) b = tf.get_variable('bias_'+str(ii)+'_'+str(type_i), [1, 
outputs_size[ii]], global_tf_float_precision, - tf.random_normal_initializer(stddev=stddev, mean = bavg, seed = seed)) + tf.random_normal_initializer(stddev=stddev, mean = bavg, seed = seed), + trainable = trainable) if self.filter_resnet_dt : idt = tf.get_variable('idt_'+str(ii)+'_'+str(type_i), [1, outputs_size[ii]], global_tf_float_precision, - tf.random_normal_initializer(stddev=0.001, mean = 1.0, seed = seed)) + tf.random_normal_initializer(stddev=0.001, mean = 1.0, seed = seed), + trainable = trainable) if outputs_size[ii] == outputs_size[ii-1]: if self.filter_resnet_dt : xyz_scatter += activation_fn(tf.matmul(xyz_scatter, w) + b) * idt diff --git a/source/train/DescrptSeAR.py b/source/train/DescrptSeAR.py index 929b660248..138e9222f6 100644 --- a/source/train/DescrptSeAR.py +++ b/source/train/DescrptSeAR.py @@ -72,7 +72,7 @@ def build (self, reuse = None): # dout self.dout_a = self.descrpt_a.build(coord_, atype_, natoms, box, mesh, davg[0], dstd[0], suffix=suffix+'_a', reuse=reuse) - self.dout_r = self.descrpt_r.build(coord_, atype_, natoms, box, mesh, davg[1], dstd[1], suffix=suffix+'_r', reuse=reuse) + self.dout_r = self.descrpt_r.build(coord_, atype_, natoms, box, mesh, davg[1], dstd[1], suffix=suffix, reuse=reuse) self.dout_a = tf.reshape(self.dout_a, [-1, self.descrpt_a.get_dim_out()]) self.dout_r = tf.reshape(self.dout_r, [-1, self.descrpt_r.get_dim_out()]) self.dout = tf.concat([self.dout_a, self.dout_r], axis = 1) diff --git a/source/train/DescrptSeR.py b/source/train/DescrptSeR.py index b057a0afd3..aaa91ca220 100644 --- a/source/train/DescrptSeR.py +++ b/source/train/DescrptSeR.py @@ -27,6 +27,7 @@ def __init__ (self, jdata): .add('rcut_smth',float, default = 5.5) \ .add('neuron', list, default = [10, 20, 40]) \ .add('resnet_dt',bool, default = False) \ + .add('trainable',bool, default = True) \ .add('seed', int) class_data = args.parse(jdata) self.sel_r = class_data['sel'] @@ -35,6 +36,7 @@ def __init__ (self, jdata): self.filter_neuron = class_data['neuron'] self.filter_resnet_dt = class_data['resnet_dt'] self.seed = class_data['seed'] + self.trainable = class_data['trainable'] # descrpt config self.sel_a = [ 0 for ii in range(len(self.sel_r)) ] @@ -145,7 +147,7 @@ def build (self, self.descrpt_reshape = tf.reshape(self.descrpt, [-1, self.ndescrpt]) - self.dout = self._pass_filter(self.descrpt_reshape, natoms, suffix = suffix, reuse = reuse) + self.dout = self._pass_filter(self.descrpt_reshape, natoms, suffix = suffix, reuse = reuse, trainable = self.trainable) return self.dout @@ -171,7 +173,8 @@ def _pass_filter(self, inputs, natoms, reuse = None, - suffix = '') : + suffix = '', + trainable = True) : start_index = 0 inputs = tf.reshape(inputs, [-1, self.ndescrpt * natoms[0]]) shape = inputs.get_shape().as_list() @@ -181,7 +184,7 @@ def _pass_filter(self, [ 0, start_index* self.ndescrpt], [-1, natoms[2+type_i]* self.ndescrpt] ) inputs_i = tf.reshape(inputs_i, [-1, self.ndescrpt]) - layer = self._filter_r(inputs_i, name='filter_type_'+str(type_i)+suffix, natoms=natoms, reuse=reuse, seed = self.seed) + layer = self._filter_r(inputs_i, name='filter_type_'+str(type_i)+suffix, natoms=natoms, reuse=reuse, seed = self.seed, trainable = trainable) layer = tf.reshape(layer, [tf.shape(inputs)[0], natoms[2+type_i] * self.get_dim_out()]) output.append(layer) start_index += natoms[2+type_i] @@ -253,7 +256,8 @@ def _filter_r(self, bavg=0.0, name='linear', reuse=None, - seed=None): + seed=None, + trainable = True): # natom x nei shape = inputs.get_shape().as_list() outputs_size = [1] + 
self.filter_neuron @@ -274,16 +278,19 @@ def _filter_r(self, w = tf.get_variable('matrix_'+str(ii)+'_'+str(type_i), [outputs_size[ii - 1], outputs_size[ii]], global_tf_float_precision, - tf.random_normal_initializer(stddev=stddev/np.sqrt(outputs_size[ii]+outputs_size[ii-1]), seed = seed)) + tf.random_normal_initializer(stddev=stddev/np.sqrt(outputs_size[ii]+outputs_size[ii-1]), seed = seed), + trainable = trainable) b = tf.get_variable('bias_'+str(ii)+'_'+str(type_i), [1, outputs_size[ii]], global_tf_float_precision, - tf.random_normal_initializer(stddev=stddev, mean = bavg, seed = seed)) + tf.random_normal_initializer(stddev=stddev, mean = bavg, seed = seed), + trainable = trainable) if self.filter_resnet_dt : idt = tf.get_variable('idt_'+str(ii)+'_'+str(type_i), [1, outputs_size[ii]], global_tf_float_precision, - tf.random_normal_initializer(stddev=0.001, mean = 1.0, seed = seed)) + tf.random_normal_initializer(stddev=0.001, mean = 1.0, seed = seed), + trainable = trainable) if outputs_size[ii] == outputs_size[ii-1]: if self.filter_resnet_dt : xyz_scatter += activation_fn(tf.matmul(xyz_scatter, w) + b) * idt diff --git a/source/train/Trainer.py b/source/train/Trainer.py index 10472d828b..fb986399e7 100644 --- a/source/train/Trainer.py +++ b/source/train/Trainer.py @@ -386,6 +386,7 @@ def train (self, fp = open(self.disp_file, "a") cur_batch = self.sess.run(self.global_step) + is_first_step = True self.cur_batch = cur_batch self.run_opt.message("start training at lr %.2e (== %.2e), final lr will be %.2e" % (self.sess.run(self.learning_rate), @@ -417,8 +418,9 @@ def train (self, feed_dict_batch[self.place_holders[ii]] = batch_data[ii] feed_dict_batch[self.place_holders['is_training']] = True - if self.display_in_training and cur_batch == 0 : + if self.display_in_training and is_first_step : self.test_on_the_fly(fp, data, feed_dict_batch) + is_first_step = False if self.timing_in_training : tic = time.time() self.sess.run([self.train_op], feed_dict = feed_dict_batch, options=prf_options, run_metadata=prf_run_metadata) if self.timing_in_training : toc = time.time() diff --git a/source/train/calculator.py b/source/train/calculator.py new file mode 100644 index 0000000000..1179eef891 --- /dev/null +++ b/source/train/calculator.py @@ -0,0 +1,49 @@ +"""An ASE calculator interface. 
+
+Example:
+```python
+from ase import Atoms
+from deepmd.calculator import DP
+
+water = Atoms('H2O',
+              positions=[(0.7601, 1.9270, 1),
+                         (1.9575, 1, 1),
+                         (1., 1., 1.)],
+              cell=[100, 100, 100],
+              calculator=DP(model="frozen_model.pb"))
+print(water.get_potential_energy())
+print(water.get_forces())
+```
+
+Optimization is also available:
+```python
+from ase.optimize import BFGS
+dyn = BFGS(water)
+dyn.run(fmax=1e-6)
+print(water.get_positions())
+```
+"""
+
+from ase.calculators.calculator import Calculator, all_changes
+from deepmd import DeepPot
+
+
+class DP(Calculator):
+    name = "DP"
+    implemented_properties = ["energy", "forces", "stress"]
+
+    def __init__(self, model, label="DP", **kwargs):
+        Calculator.__init__(self, label=label, **kwargs)
+        self.dp = DeepPot(model)
+        # map the model's type map (chemical symbols) onto integer atom types
+        self.type_dict = dict(zip(self.dp.get_type_map(), range(self.dp.get_ntypes())))
+
+    def calculate(self, atoms=None, properties=["energy", "forces", "stress"], system_changes=all_changes):
+        coord = atoms.get_positions().reshape([1, -1])
+        cell = atoms.get_cell().reshape([1, -1])
+        symbols = atoms.get_chemical_symbols()
+        atype = [self.type_dict[k] for k in symbols]
+        e, f, v = self.dp.eval(coord, cell, atype)
+        self.results['energy'] = e[0]
+        self.results['forces'] = f[0]
+        # v is the 9-component virial returned by dp.eval; it is stored under
+        # "stress" as-is (no conversion to ASE's Voigt-form stress is done here)
+        self.results['stress'] = v[0]
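With the `global_polar` freeze target and the `DeepGlobalPolar` wrapper added in this patch, a frozen global-polarizability model can be evaluated directly from Python. The sketch below is illustrative only: the model file name, coordinates, cell, and atom types are placeholders, and the array shapes follow the `DeepTensor.eval` convention used above (`atomic=False` returns one 9-component tensor per frame).
```python
import numpy as np
from deepmd import DeepGlobalPolar

# "frozen_global_polar.pb" is a placeholder for a model frozen with
# model_type 'global_polar' (see the freeze.py change above).
gp = DeepGlobalPolar("frozen_global_polar.pb")

# one frame of three atoms; coords: nframes x (natoms * 3), cells: nframes x 9
coords = np.array([[0.0, 0.0, 0.0,
                    0.0, 0.0, 1.0,
                    0.0, 1.0, 0.0]])
cells = (100.0 * np.eye(3)).reshape([1, 9])
atom_types = [0, 1, 1]        # integer types following the model's type map

polar = gp.eval(coords, cells, atom_types)   # shape (nframes, 9)
print(polar[0].reshape(3, 3))                # global polarizability tensor of frame 0
```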