diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 0000000000..c2225c5f76 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "source/op/cuda/cub"] + path = source/op/cuda/cub + url = git://github.com/NVlabs/cub.git diff --git a/README.md b/README.md index 59965dc881..a6187acc03 100644 --- a/README.md +++ b/README.md @@ -32,6 +32,7 @@ - [Include deepmd in the pair style](#include-deepmd-in-the-pair-style) - [Long-range interaction](#long-range-interaction) - [Run path-integral MD with i-PI](#run-path-integral-md-with-i-pi) + - [Use deep potential with ASE](#use-deep-potential-with-ase) - [Troubleshooting](#troubleshooting) # About DeePMD-kit @@ -150,7 +151,7 @@ One should remember to activate the virtual environment every time he/she uses d Clone the DeePMD-kit source code ```bash cd /some/workspace -git clone https://github.com/deepmodeling/deepmd-kit.git deepmd-kit +git clone --recursive https://github.com/deepmodeling/deepmd-kit.git deepmd-kit -b devel ``` If one downloads the .zip file from the github, then the default folder of source code would be `deepmd-kit-master` rather than `deepmd-kit`. For convenience, you may want to record the location of source to a variable, saying `deepmd_source_dir` by ```bash @@ -553,6 +554,30 @@ The option **`graph_file`** provides the file name of the frozen model. The `dp_ipi` gets the atom names from an [XYZ file](https://en.wikipedia.org/wiki/XYZ_file_format) provided by **`coord_file`** (meanwhile ignores all coordinates in it), and translates the names to atom types by rules provided by **`atom_type`**. +## Use deep potential with ASE + +Deep potential can be set up as a calculator with ASE to obtain potential energies and forces. +```python +from ase import Atoms +from deepmd.calculator import DP + +water = Atoms('H2O', + positions=[(0.7601, 1.9270, 1), + (1.9575, 1, 1), + (1., 1., 1.)], + cell=[100, 100, 100], + calculator=DP(model="frozen_model.pb")) +print(water.get_potential_energy()) +print(water.get_forces()) +``` + +Optimization is also available: +```python +from ase.optimize import BFGS +dyn = BFGS(water) +dyn.run(fmax=1e-6) +print(water.get_positions()) +``` # Troubleshooting In consequence of various differences of computers or systems, problems may occur. Some common circumstances are listed as follows. 
diff --git a/deepmd/__init__.py b/deepmd/__init__.py index b2711088e8..231145b989 100644 --- a/deepmd/__init__.py +++ b/deepmd/__init__.py @@ -3,6 +3,7 @@ from .DeepPot import DeepPot from .DeepDipole import DeepDipole from .DeepPolar import DeepPolar +from .DeepPolar import DeepGlobalPolar from .DeepWFC import DeepWFC set_mkl() diff --git a/source/CMakeLists.txt b/source/CMakeLists.txt index 7b35067444..d83ffe9e0e 100644 --- a/source/CMakeLists.txt +++ b/source/CMakeLists.txt @@ -170,6 +170,11 @@ include_directories(${TensorFlow_INCLUDE_DIRS}) if (BUILD_CPP_IF) set (LIB_DEEPMD "deepmd") set (LIB_DEEPMD_OP "deepmd_op") + if (USE_CUDA_TOOLKIT) + set (LIB_DEEPMD_OP_CUDA "deepmd_op_cuda") + else() + set (LIB_DEEPMD_OP_CUDA "") + endif() if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 4.9) set (LIB_DEEPMD_NATIVE "deepmd_native_md") set (LIB_DEEPMD_IPI "deepmd_ipi") diff --git a/source/lib/include/NNPAtomMap.h b/source/lib/include/NNPAtomMap.h index e09b3186b6..c7474981e7 100644 --- a/source/lib/include/NNPAtomMap.h +++ b/source/lib/include/NNPAtomMap.h @@ -8,6 +8,7 @@ template class NNPAtomMap { public: + NNPAtomMap(); NNPAtomMap(const vector::const_iterator in_begin, const vector::const_iterator in_end); void forward (typename vector::iterator out, diff --git a/source/lib/include/NNPInter.h b/source/lib/include/NNPInter.h index a06faaa3b9..20e9bd8aaa 100644 --- a/source/lib/include/NNPInter.h +++ b/source/lib/include/NNPInter.h @@ -5,10 +5,11 @@ #include "tensorflow/core/framework/op.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/shape_inference.h" - +#include "NNPAtomMap.h" #include #include "version.h" +typedef double compute_t; using namespace tensorflow; using namespace std; @@ -53,6 +54,7 @@ class NNPInter { public: NNPInter () ; + ~NNPInter() ; NNPInter (const string & model, const int & gpu_rank = 0); void init (const string & model, const int & gpu_rank = 0); void print_summary(const string &pre) const; @@ -74,6 +76,7 @@ class NNPInter const vector & box, const int nghost, const LammpsNeighborList & lmp_list, + const int & ago, const vector & fparam = vector(), const vector & aparam = vector()); void compute (ENERGYTYPE & ener, @@ -96,6 +99,7 @@ class NNPInter const vector & box, const int nghost, const LammpsNeighborList & lmp_list, + const int & ago, const vector & fparam = vector(), const vector & aparam = vector()); VALUETYPE cutoff () const {assert(inited); return rcut;}; @@ -118,12 +122,30 @@ class NNPInter void validate_fparam_aparam(const int & nloc, const vector &fparam, const vector &aparam)const ; + + // copy neighbor list info from host + bool init_nbor; + std::vector sec_a; + compute_t *array_double; + InternalNeighborList nlist; + NNPAtomMap nnpmap; + unsigned long long *array_longlong; + int *ilist, *jrange, *jlist, *array_int; + int ilist_size, jrange_size, jlist_size; + int arr_int_size, arr_ll_size, arr_dou_size; + + // function used for neighbor list copy + vector get_sel_a() const; +#ifdef USE_CUDA_TOOLKIT + void update_nbor(const InternalNeighborList & nlist, const int nloc); +#endif }; class NNPInterModelDevi { public: NNPInterModelDevi () ; + ~NNPInterModelDevi() ; NNPInterModelDevi (const vector & models, const int & gpu_rank = 0); void init (const vector & models, const int & gpu_rank = 0); public: @@ -144,6 +166,7 @@ class NNPInterModelDevi const vector & box, const int nghost, const LammpsNeighborList & lmp_list, + const int & ago, const vector & fparam = vector(), const vector & aparam = vector()); void compute 
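// ago: the value of LAMMPS neighbor->ago forwarded by the pair style;
// ago == 0 means the LAMMPS neighbor list was rebuilt this step, so the
// cached internal neighbor list and atom map are refreshed before the
// input tensors are built, otherwise they are reused from the previous call.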
(vector & all_ener, @@ -156,6 +179,7 @@ class NNPInterModelDevi const vector & box, const int nghost, const LammpsNeighborList & lmp_list, + const int & ago, const vector & fparam = vector(), const vector & aparam = vector()); VALUETYPE cutoff () const {assert(inited); return rcut;}; @@ -176,6 +200,9 @@ class NNPInterModelDevi void compute_std_f (vector & std, const vector & avg, const vector >& xx); + void compute_relative_std_f (vector & std, + const vector & avg, + const VALUETYPE eps); private: unsigned numb_models; vector sessions; @@ -193,6 +220,25 @@ class NNPInterModelDevi void validate_fparam_aparam(const int & nloc, const vector &fparam, const vector &aparam)const ; + + // copy neighbor list info from host + bool init_nbor; + compute_t *array_double; + vector > sec; + InternalNeighborList nlist; + NNPAtomMap nnpmap; + unsigned long long *array_longlong; + int max_sec_size = 0, max_sec_back = 0; + int *ilist, *jrange, *jlist, *array_int; + int ilist_size, jrange_size, jlist_size, arr_int_size, arr_ll_size, arr_dou_size; + + // function used for nborlist copy + void get_max_sec(); + vector > get_sel() const; + void cum_sum(const std::vector > n_sel); +#ifdef USE_CUDA_TOOLKIT + void update_nbor(const InternalNeighborList & nlist, const int nloc); +#endif }; diff --git a/source/lib/src/NNPAtomMap.cc b/source/lib/src/NNPAtomMap.cc index edb410a98b..dd4fd73a00 100644 --- a/source/lib/src/NNPAtomMap.cc +++ b/source/lib/src/NNPAtomMap.cc @@ -3,6 +3,10 @@ #include #include +template +NNPAtomMap:: +NNPAtomMap() {} + template NNPAtomMap:: NNPAtomMap(const vector::const_iterator in_begin, diff --git a/source/lib/src/NNPInter.cc b/source/lib/src/NNPInter.cc index fa1f989b49..47a66893c3 100644 --- a/source/lib/src/NNPInter.cc +++ b/source/lib/src/NNPInter.cc @@ -2,11 +2,25 @@ #include "NNPAtomMap.h" #include "SimulationRegion.h" #include + +#define MAGIC_NUMBER 256 +typedef double compute_t; + #ifdef USE_CUDA_TOOLKIT #include "cuda_runtime.h" #include #include #include + +#define cudaErrcheck(res) { cudaAssert((res), __FILE__, __LINE__); } +inline void cudaAssert(cudaError_t code, const char *file, int line, bool abort=true) +{ + if (code != cudaSuccess) + { + fprintf(stderr,"cuda assert: %s %s %d\n", cudaGetErrorString(code), file, line); + if (abort) exit(code); + } +} #endif static @@ -18,6 +32,17 @@ checkStatus(const tensorflow::Status& status) { } } +static +std::vector cum_sum (const std::vector & n_sel) { + std::vector sec; + sec.resize (n_sel.size() + 1); + sec[0] = 0; + for (int ii = 1; ii < sec.size(); ++ii) { + sec[ii] = sec[ii-1] + n_sel[ii-1]; + } + return sec; +} + static void convert_nlist_lmp_internal (InternalNeighborList & list, const LammpsNeighborList & lmp_list) @@ -333,6 +358,137 @@ make_input_tensors (std::vector> & input_tensors, return nloc; } +static int make_input_tensors ( + vector> & input_tensors, + const vector & dcoord_, + const int & ntypes, + const vector & datype_, + const vector & dbox, + const int * ilist, + const int * jrange, + const int * jlist, + int * array_int, + unsigned long long * array_longlong, + compute_t * array_double, + const vector & fparam_, + const vector & aparam_, + const NNPAtomMap & nnpmap, + const int & nghost) +{ + assert (dbox.size() == 9); + + int nframes = 1; + int nall = dcoord_.size() / 3; + int nloc = nall - nghost; + assert (nall == datype_.size()); + + vector datype = nnpmap.get_type(); + vector type_count (ntypes, 0); + for (unsigned ii = 0; ii < datype.size(); ++ii) { + type_count[datype[ii]] ++; + } + datype.insert 
(datype.end(), datype_.begin() + nloc, datype_.end()); + + TensorShape coord_shape ; + coord_shape.AddDim (nframes); + coord_shape.AddDim (nall * 3); + TensorShape type_shape ; + type_shape.AddDim (nframes); + type_shape.AddDim (nall); + TensorShape box_shape ; + box_shape.AddDim (nframes); + box_shape.AddDim (9); + TensorShape mesh_shape; + mesh_shape.AddDim (32); + TensorShape natoms_shape; + natoms_shape.AddDim (2 + ntypes); + TensorShape fparam_shape; + fparam_shape.AddDim (nframes); + fparam_shape.AddDim (fparam_.size()); + TensorShape aparam_shape ; + aparam_shape.AddDim (nframes); + aparam_shape.AddDim (aparam_.size()); + + #ifdef HIGH_PREC + Tensor coord_tensor (DT_DOUBLE, coord_shape); + Tensor box_tensor (DT_DOUBLE, box_shape); + Tensor fparam_tensor(DT_DOUBLE, fparam_shape); + Tensor aparam_tensor(DT_DOUBLE, fparam_shape); + #else + Tensor coord_tensor (DT_FLOAT, coord_shape); + Tensor box_tensor (DT_FLOAT, box_shape); + Tensor fparam_tensor(DT_FLOAT, fparam_shape); + Tensor aparam_tensor(DT_FLOAT, fparam_shape); + #endif + Tensor type_tensor (DT_INT32, type_shape); + Tensor mesh_tensor (DT_INT32, mesh_shape); + Tensor natoms_tensor(DT_INT32, natoms_shape); + + auto coord = coord_tensor.matrix (); + auto type = type_tensor.matrix (); + auto box = box_tensor.matrix (); + auto mesh = mesh_tensor.flat (); + auto natoms = natoms_tensor.flat (); + auto fparam = fparam_tensor.matrix (); + auto aparam = aparam_tensor.matrix (); + + vector dcoord (dcoord_); + nnpmap.forward (dcoord.begin(), dcoord_.begin(), 3); + + for (int ii = 0; ii < nframes; ++ii) { + for (int jj = 0; jj < nall * 3; ++jj) { + coord(ii, jj) = dcoord[jj]; + } + for (int jj = 0; jj < 9; ++jj) { + box(ii, jj) = dbox[jj]; + } + for (int jj = 0; jj < nall; ++jj) { + type(ii, jj) = datype[jj]; + } + for (int jj = 0; jj < fparam_.size(); ++jj) { + fparam(ii, jj) = fparam_[jj]; + } + for (int jj = 0; jj < aparam_.size(); ++jj) { + aparam(ii, jj) = aparam_[jj]; + } + } + + for (int ii = 0; ii < 32; ++ii) mesh(ii) = 0; + + mesh (0) = sizeof(int *) / sizeof(int); + assert (mesh(0) * sizeof(int) == sizeof(int *)); + const int & stride = mesh(0); + // mesh (1) = dlist.ilist.size(); + mesh (1) = nloc; + assert (mesh(1) == nloc); + assert (stride <= 4); + memcpy (&mesh(4), &(ilist), sizeof(int *)); + memcpy (&mesh(8), &(jrange), sizeof(int *)); + memcpy (&mesh(12), &(jlist), sizeof(int *)); + memcpy (&mesh(16), &(array_int), sizeof(int *)); + memcpy (&mesh(20), &(array_longlong), sizeof(unsigned long long *)); + memcpy (&mesh(24), &(array_double), sizeof(compute_t *)); + + natoms (0) = nloc; + natoms (1) = nall; + for (int ii = 0; ii < ntypes; ++ii) natoms(ii+2) = type_count[ii]; + + input_tensors = { + {"t_coord", coord_tensor}, + {"t_type", type_tensor}, + {"t_box", box_tensor}, + {"t_mesh", mesh_tensor}, + {"t_natoms", natoms_tensor}, + }; + if (fparam_.size() > 0) { + input_tensors.push_back({"t_fparam", fparam_tensor}); + } + if (aparam_.size() > 0) { + input_tensors.push_back({"t_aparam", aparam_tensor}); + } + return nloc; +} + static void run_model (ENERGYTYPE & dener, vector & dforce_, @@ -356,107 +512,197 @@ run_model (ENERGYTYPE & dener, return; } +#ifdef USE_CUDA_TOOLKIT std::vector output_tensors; - checkStatus (session->Run(input_tensors, - {"o_energy", "o_force", "o_virial"}, + {"o_energy", "o_force", "o_atom_virial"}, {}, &output_tensors)); Tensor output_e = output_tensors[0]; Tensor output_f = output_tensors[1]; - Tensor output_v = output_tensors[2]; + Tensor output_av = output_tensors[2]; auto oe = output_e.flat 
(); auto of = output_f.flat (); - auto ov = output_v.flat (); + auto oav = output_av.flat (); dener = oe(0); vector dforce (3 * nall); + vector datom_virial (9 * nall); dvirial.resize (9); for (unsigned ii = 0; ii < nall * 3; ++ii){ dforce[ii] = of(ii); } - for (unsigned ii = 0; ii < 9; ++ii){ - dvirial[ii] = ov(ii); + for (int ii = 0; ii < nall * 9; ++ii) { + datom_virial[ii] = oav(ii); } + for (int ii = 0; ii < nall; ++ii) { + dvirial[0] += 1.0 * datom_virial[9*ii+0]; + dvirial[1] += 1.0 * datom_virial[9*ii+1]; + dvirial[2] += 1.0 * datom_virial[9*ii+2]; + dvirial[3] += 1.0 * datom_virial[9*ii+3]; + dvirial[4] += 1.0 * datom_virial[9*ii+4]; + dvirial[5] += 1.0 * datom_virial[9*ii+5]; + dvirial[6] += 1.0 * datom_virial[9*ii+6]; + dvirial[7] += 1.0 * datom_virial[9*ii+7]; + dvirial[8] += 1.0 * datom_virial[9*ii+8]; + } + dforce_ = dforce; nnpmap.backward (dforce_.begin(), dforce.begin(), 3); -} - -static void -run_model (ENERGYTYPE & dener, - vector & dforce_, - vector & dvirial, - vector & datom_energy_, - vector & datom_virial_, - Session * session, - const std::vector> & input_tensors, - const NNPAtomMap &nnpmap, - const int nghost = 0) -{ - unsigned nloc = nnpmap.get_type().size(); - unsigned nall = nloc + nghost; - if (nloc == 0) { - dener = 0; - // no backward map needed - // dforce of size nall * 3 - dforce_.resize(nall * 3); - fill(dforce_.begin(), dforce_.end(), 0.0); - // dvirial of size 9 - dvirial.resize(9); - fill(dvirial.begin(), dvirial.end(), 0.0); - // datom_energy_ of size nall - datom_energy_.resize(nall); - fill(datom_energy_.begin(), datom_energy_.end(), 0.0); - // datom_virial_ of size nall * 9 - datom_virial_.resize(nall * 9); - fill(datom_virial_.begin(), datom_virial_.end(), 0.0); - return; - } - +#else std::vector output_tensors; checkStatus (session->Run(input_tensors, - {"o_energy", "o_force", "o_virial", "o_atom_energy", "o_atom_virial"}, + {"o_energy", "o_force", "o_virial"}, {}, &output_tensors)); - + Tensor output_e = output_tensors[0]; Tensor output_f = output_tensors[1]; Tensor output_v = output_tensors[2]; - Tensor output_ae = output_tensors[3]; - Tensor output_av = output_tensors[4]; auto oe = output_e.flat (); auto of = output_f.flat (); auto ov = output_v.flat (); - auto oae = output_ae.flat (); - auto oav = output_av.flat (); dener = oe(0); vector dforce (3 * nall); - vector datom_energy (nall, 0); - vector datom_virial (9 * nall); dvirial.resize (9); - for (int ii = 0; ii < nall * 3; ++ii){ + for (unsigned ii = 0; ii < nall * 3; ++ii){ dforce[ii] = of(ii); } - for (int ii = 0; ii < nloc; ++ii){ - datom_energy[ii] = oae(ii); - } - for (int ii = 0; ii < nall * 9; ++ii){ - datom_virial[ii] = oav(ii); - } - for (int ii = 0; ii < 9; ++ii){ + for (unsigned ii = 0; ii < 9; ++ii){ dvirial[ii] = ov(ii); } dforce_ = dforce; - datom_energy_ = datom_energy; - datom_virial_ = datom_virial; nnpmap.backward (dforce_.begin(), dforce.begin(), 3); - nnpmap.backward (datom_energy_.begin(), datom_energy.begin(), 1); - nnpmap.backward (datom_virial_.begin(), datom_virial.begin(), 9); +#endif +} + +static void run_model (ENERGYTYPE & dener, + vector & dforce_, + vector & dvirial, + vector & datom_energy_, + vector & datom_virial_, + Session * session, + const std::vector> & input_tensors, + const NNPAtomMap & nnpmap, + const int & nghost = 0) +{ + unsigned nloc = nnpmap.get_type().size(); + unsigned nall = nloc + nghost; + if (nloc == 0) { + dener = 0; + // no backward map needed + // dforce of size nall * 3 + dforce_.resize(nall * 3); + fill(dforce_.begin(), 
dforce_.end(), 0.0); + // dvirial of size 9 + dvirial.resize(9); + fill(dvirial.begin(), dvirial.end(), 0.0); + // datom_energy_ of size nall + datom_energy_.resize(nall); + fill(datom_energy_.begin(), datom_energy_.end(), 0.0); + // datom_virial_ of size nall * 9 + datom_virial_.resize(nall * 9); + fill(datom_virial_.begin(), datom_virial_.end(), 0.0); + return; + } +#ifdef USE_CUDA_TOOLKIT + std::vector output_tensors; + + checkStatus (session->Run(input_tensors, + {"o_energy", "o_force", "o_atom_energy", "o_atom_virial"}, + {}, + &output_tensors)); + + Tensor output_e = output_tensors[0]; + Tensor output_f = output_tensors[1]; + Tensor output_ae = output_tensors[2]; + Tensor output_av = output_tensors[3]; + + auto oe = output_e.flat (); + auto of = output_f.flat (); + auto oae = output_ae.flat (); + auto oav = output_av.flat (); + + dener = oe(0); + vector dforce (3 * nall); + vector datom_energy (nall, 0); + vector datom_virial (9 * nall); + dvirial.resize (9); + for (int ii = 0; ii < nall * 3; ++ii) { + dforce[ii] = of(ii); + } + for (int ii = 0; ii < nloc; ++ii) { + datom_energy[ii] = oae(ii); + } + for (int ii = 0; ii < nall * 9; ++ii) { + datom_virial[ii] = oav(ii); + } + for (int ii = 0; ii < nall; ++ii) { + dvirial[0] += 1.0 * datom_virial[9*ii+0]; + dvirial[1] += 1.0 * datom_virial[9*ii+1]; + dvirial[2] += 1.0 * datom_virial[9*ii+2]; + dvirial[3] += 1.0 * datom_virial[9*ii+3]; + dvirial[4] += 1.0 * datom_virial[9*ii+4]; + dvirial[5] += 1.0 * datom_virial[9*ii+5]; + dvirial[6] += 1.0 * datom_virial[9*ii+6]; + dvirial[7] += 1.0 * datom_virial[9*ii+7]; + dvirial[8] += 1.0 * datom_virial[9*ii+8]; + } + dforce_ = dforce; + datom_energy_ = datom_energy; + datom_virial_ = datom_virial; + nnpmap.backward (dforce_.begin(), dforce.begin(), 3); + nnpmap.backward (datom_energy_.begin(), datom_energy.begin(), 1); + nnpmap.backward (datom_virial_.begin(), datom_virial.begin(), 9); +#else + std::vector output_tensors; + + checkStatus (session->Run(input_tensors, + {"o_energy", "o_force", "o_virial", "o_atom_energy", "o_atom_virial"}, + {}, + &output_tensors)); + + Tensor output_e = output_tensors[0]; + Tensor output_f = output_tensors[1]; + Tensor output_v = output_tensors[2]; + Tensor output_ae = output_tensors[3]; + Tensor output_av = output_tensors[4]; + + auto oe = output_e.flat (); + auto of = output_f.flat (); + auto ov = output_v.flat (); + auto oae = output_ae.flat (); + auto oav = output_av.flat (); + + dener = oe(0); + vector dforce (3 * nall); + vector datom_energy (nall, 0); + vector datom_virial (9 * nall); + dvirial.resize (9); + for (int ii = 0; ii < nall * 3; ++ii) { + dforce[ii] = of(ii); + } + for (int ii = 0; ii < nloc; ++ii) { + datom_energy[ii] = oae(ii); + } + for (int ii = 0; ii < nall * 9; ++ii) { + datom_virial[ii] = oav(ii); + } + for (int ii = 0; ii < 9; ++ii) { + dvirial[ii] = ov(ii); + } + dforce_ = dforce; + datom_energy_ = datom_energy; + datom_virial_ = datom_virial; + nnpmap.backward (dforce_.begin(), dforce.begin(), 3); + nnpmap.backward (datom_energy_.begin(), datom_energy.begin(), 1); + nnpmap.backward (datom_virial_.begin(), datom_virial.begin(), 9); +#endif } static void @@ -484,19 +730,94 @@ get_env_nthreads(int & num_intra_nthreads, NNPInter:: NNPInter () - : inited (false) + : inited (false), init_nbor (false) { get_env_nthreads(num_intra_nthreads, num_inter_nthreads); } NNPInter:: NNPInter (const string & model, const int & gpu_rank) - : inited (false) + : inited (false), init_nbor (false) { get_env_nthreads(num_intra_nthreads, num_inter_nthreads); 
init(model, gpu_rank); } +NNPInter::~NNPInter() { + #ifdef USE_CUDA_TOOLKIT + if (init_nbor) { + cudaErrcheck(cudaFree(ilist)); + cudaErrcheck(cudaFree(jrange)); + cudaErrcheck(cudaFree(jlist)); + cudaErrcheck(cudaFree(array_int)); + cudaErrcheck(cudaFree(array_longlong)); + cudaErrcheck(cudaFree(array_double)); + } + #endif +} + +#ifdef USE_CUDA_TOOLKIT +void NNPInter::update_nbor(const InternalNeighborList & nlist, const int nloc) { + if (!init_nbor) { + sec_a = cum_sum(get_sel_a()); + cudaErrcheck(cudaMalloc((void**)&ilist, sizeof(int) * nlist.ilist.size())); + cudaErrcheck(cudaMalloc((void**)&jrange, sizeof(int) * nlist.jrange.size())); + cudaErrcheck(cudaMalloc((void**)&jlist, sizeof(int) * nlist.jlist.size())); + cudaErrcheck(cudaMalloc((void**)&array_int, sizeof(int) * (sec_a.size() + nloc * sec_a.size() + nloc))); + cudaErrcheck(cudaMalloc((void**)&array_longlong, sizeof(unsigned long long) * nloc * MAGIC_NUMBER * 2)); + #ifdef HIGH_PREC + cudaErrcheck(cudaMalloc((void**)&array_double, sizeof(compute_t) * nloc * sec_a.back() * 3)); + #else + cudaErrcheck(cudaMalloc((void**)&array_double, sizeof(compute_t) * nloc * sec_a.back() * 3)); + #endif + ilist_size = nlist.ilist.size(); + jrange_size = nlist.jrange.size(); + jlist_size = nlist.jlist.size(); + arr_int_size = sec_a.size() + nloc * sec_a.size() + nloc; + arr_ll_size = nloc * MAGIC_NUMBER * 2; + arr_dou_size = nloc * sec_a.back() * 3; + init_nbor = true; + } + if (ilist_size < nlist.ilist.size()) { + cudaErrcheck(cudaFree(ilist)); + cudaErrcheck(cudaMalloc((void**)&ilist, sizeof(int) * nlist.ilist.size())); + ilist_size = nlist.ilist.size(); + } + if (jrange_size < nlist.jrange.size()) { + cudaErrcheck(cudaFree(jrange)); + cudaErrcheck(cudaMalloc((void**)&jrange, sizeof(int) * nlist.jrange.size())); + jrange_size = nlist.jrange.size(); + } + if (jlist_size < nlist.jlist.size()) { + cudaErrcheck(cudaFree(jlist)); + cudaErrcheck(cudaMalloc((void**)&jlist, sizeof(int) * nlist.jlist.size())); + jlist_size = nlist.jlist.size(); + } + if (arr_int_size < sec_a.size() + nloc * sec_a.size() + nloc) { + cudaErrcheck(cudaFree(array_int)); + cudaErrcheck(cudaMalloc((void**)&array_int, sizeof(int) * (sec_a.size() + nloc * sec_a.size() + nloc))); + arr_int_size = sec_a.size() + nloc * sec_a.size() + nloc; + } + if (arr_ll_size < nloc * MAGIC_NUMBER * 2) { + cudaErrcheck(cudaFree(array_longlong)); + cudaErrcheck(cudaMalloc((void**)&array_longlong, sizeof(unsigned long long) * nloc * MAGIC_NUMBER * 2)); + arr_ll_size = nloc * MAGIC_NUMBER * 2; + } + if (arr_dou_size < nloc * sec_a.back() * 3) { + cudaErrcheck(cudaFree(array_double)); + #ifdef HIGH_PREC + cudaErrcheck(cudaMalloc((void**)&array_double, sizeof(compute_t) * nloc * sec_a.back() * 3)); + #else + cudaErrcheck(cudaMalloc((void**)&array_double, sizeof(compute_t) * nloc * sec_a.back() * 3)); + #endif + arr_dou_size = nloc * sec_a.back() * 3; + } + cudaErrcheck(cudaMemcpy(ilist, &nlist.ilist[0], sizeof(int) * nlist.ilist.size(), cudaMemcpyHostToDevice)); + cudaErrcheck(cudaMemcpy(jrange, &nlist.jrange[0], sizeof(int) * nlist.jrange.size(), cudaMemcpyHostToDevice)); + cudaErrcheck(cudaMemcpy(jlist, &nlist.jlist[0], sizeof(int) * nlist.jlist.size(), cudaMemcpyHostToDevice)); +} +#endif // USE_CUDA_TOOLKIT + #ifdef USE_CUDA_TOOLKIT void NNPInter:: @@ -533,6 +854,14 @@ init (const string & model, const int & gpu_rank) if (dfparam < 0) dfparam = 0; if (daparam < 0) daparam = 0; inited = true; + + init_nbor = false; + array_int = NULL; + array_double = NULL; + array_longlong = NULL; + ilist = 
NULL; jrange = NULL; jlist = NULL; + ilist_size = 0; jrange_size = 0; jlist_size = 0; + arr_int_size = 0; arr_ll_size = 0; arr_dou_size = 0; } #else void @@ -560,6 +889,14 @@ init (const string & model, const int & gpu_rank) // ntypes = get_ntypes(); // dfparam = get_dfparam(); inited = true; + + init_nbor = false; + array_int = NULL; + array_double = NULL; + array_longlong = NULL; + ilist = NULL; jrange = NULL; jlist = NULL; + ilist_size = 0; jrange_size = 0; jlist_size = 0; + arr_int_size = 0; arr_ll_size = 0; arr_dou_size = 0; } #endif @@ -594,6 +931,50 @@ get_scalar (const string & name) const return orc(0); } +std::string graph_info(const GraphDef & graph_def) { + // std::stringstream buffer; + // std::streambuf * old = std::cout.rdbuf(buffer.rdbuf()); + std::string str = ""; + for (int ii = 0; ii < graph_def.node_size(); ii++) { + if (graph_def.node(ii).name() == "DescrptSeA") { + // str = graph_def.node(ii).PrintDebugString(); + str = graph_def.node(ii).DebugString(); + return str; + // std::cout << str << std::endl; + } + if (graph_def.node(ii).name() == "DescrptSeR") { + // str = graph_def.node(ii).PrintDebugString(); + str = graph_def.node(ii).DebugString(); + return str; + // std::cout << str << std::endl; + } + } +} + +// init the tmp array data +std::vector NNPInter::get_sel_a () const { + std::vector sel_a; + std::istringstream is(graph_info(graph_def)); + std::string line = ""; + while(is >> line) { + if (line.find("sel_a") != line.npos) { + while (std::getline(is, line) && line != "}") { + if (line.find("i:") != line.npos) { + sel_a.push_back(atoi((line.substr(line.find("i:") + 2)).c_str())); + } + } break; + } + if (line.find("sel") != line.npos) { + while (std::getline(is, line) && line != "}") { + if (line.find("i:") != line.npos) { + sel_a.push_back(atoi((line.substr(line.find("i:") + 2)).c_str())); + } + } break; + } + } + return sel_a; +} + void NNPInter:: validate_fparam_aparam(const int & nloc, @@ -622,7 +1003,7 @@ compute (ENERGYTYPE & dener, { int nall = dcoord_.size() / 3; int nloc = nall - nghost; - NNPAtomMap nnpmap (datype_.begin(), datype_.begin() + nloc); + nnpmap = NNPAtomMap (datype_.begin(), datype_.begin() + nloc); assert (nloc == nnpmap.get_type().size()); validate_fparam_aparam(nloc, fparam, aparam); @@ -643,24 +1024,34 @@ compute (ENERGYTYPE & dener, const vector & dbox, const int nghost, const LammpsNeighborList & lmp_list, + const int & ago, const vector & fparam, const vector & aparam) { int nall = dcoord_.size() / 3; int nloc = nall - nghost; - NNPAtomMap nnpmap (datype_.begin(), datype_.begin() + nloc); - assert (nloc == nnpmap.get_type().size()); - validate_fparam_aparam(nloc, fparam, aparam); - - InternalNeighborList nlist; - convert_nlist_lmp_internal (nlist, lmp_list); - shuffle_nlist (nlist, nnpmap); - - std::vector> input_tensors; - int ret = make_input_tensors (input_tensors, dcoord_, ntypes, datype_, dbox, nlist, fparam, aparam, nnpmap, nghost); - assert (nloc == ret); - - run_model (dener, dforce_, dvirial, session, input_tensors, nnpmap, nghost); + validate_fparam_aparam(nloc, fparam, aparam); + std::vector> input_tensors; + + // agp == 0 means that the LAMMPS nbor list has been updated + if (ago == 0) { + nnpmap = NNPAtomMap (datype_.begin(), datype_.begin() + nloc); + assert (nloc == nnpmap.get_type().size()); + + // InternalNeighborList nlist; + convert_nlist_lmp_internal (nlist, lmp_list); + shuffle_nlist (nlist, nnpmap); + #ifdef USE_CUDA_TOOLKIT + update_nbor(nlist, nloc); + #endif + } + #ifdef USE_CUDA_TOOLKIT + int ret = 
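// under USE_CUDA_TOOLKIT this overload of make_input_tensors receives the raw
// ilist/jrange/jlist device pointers and the scratch buffers allocated by
// update_nbor(), so the neighbor list already copied to the GPU is reused
// whenever ago != 0 instead of being transferred again.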
make_input_tensors (input_tensors, dcoord_, ntypes, datype_, dbox, ilist, jrange, jlist, array_int, array_longlong, array_double, fparam, aparam, nnpmap, nghost); + #else + int ret = make_input_tensors (input_tensors, dcoord_, ntypes, datype_, dbox, nlist, fparam, aparam, nnpmap, nghost); + #endif + assert (nloc == ret); + run_model (dener, dforce_, dvirial, session, input_tensors, nnpmap, nghost); } @@ -677,7 +1068,7 @@ compute (ENERGYTYPE & dener, const vector & fparam, const vector & aparam) { - NNPAtomMap nnpmap (datype_.begin(), datype_.end()); + nnpmap = NNPAtomMap (datype_.begin(), datype_.end()); validate_fparam_aparam(nnpmap.get_type().size(), fparam, aparam); std::vector> input_tensors; @@ -700,24 +1091,34 @@ compute (ENERGYTYPE & dener, const vector & dbox, const int nghost, const LammpsNeighborList & lmp_list, + const int & ago, const vector & fparam, const vector & aparam) { int nall = dcoord_.size() / 3; int nloc = nall - nghost; - NNPAtomMap nnpmap (datype_.begin(), datype_.begin() + nloc); - assert (nloc == nnpmap.get_type().size()); - validate_fparam_aparam(nloc, fparam, aparam); + validate_fparam_aparam(nloc, fparam, aparam); + std::vector> input_tensors; - InternalNeighborList nlist; - convert_nlist_lmp_internal (nlist, lmp_list); - shuffle_nlist (nlist, nnpmap); + if (ago == 0) { + nnpmap = NNPAtomMap (datype_.begin(), datype_.begin() + nloc); + assert (nloc == nnpmap.get_type().size()); - std::vector> input_tensors; - int ret = make_input_tensors (input_tensors, dcoord_, ntypes, datype_, dbox, nlist, fparam, aparam, nnpmap, nghost); - assert (nloc == ret); + // InternalNeighborList nlist; + convert_nlist_lmp_internal (nlist, lmp_list); + shuffle_nlist (nlist, nnpmap); + #ifdef USE_CUDA_TOOLKIT + update_nbor(nlist, nloc); + #endif - run_model (dener, dforce_, dvirial, datom_energy_, datom_virial_, session, input_tensors, nnpmap, nghost); + } + #ifdef USE_CUDA_TOOLKIT + int ret = make_input_tensors (input_tensors, dcoord_, ntypes, datype_, dbox, ilist, jrange, jlist, array_int, array_longlong, array_double, fparam, aparam, nnpmap, nghost); + #else + int ret = make_input_tensors (input_tensors, dcoord_, ntypes, datype_, dbox, nlist, fparam, aparam, nnpmap, nghost); + #endif + assert (nloc == ret); + run_model (dener, dforce_, dvirial, datom_energy_, datom_virial_, session, input_tensors, nnpmap, nghost); } @@ -726,6 +1127,7 @@ compute (ENERGYTYPE & dener, NNPInterModelDevi:: NNPInterModelDevi () : inited (false), + init_nbor (false), numb_models (0) { get_env_nthreads(num_intra_nthreads, num_inter_nthreads); @@ -734,12 +1136,26 @@ NNPInterModelDevi () NNPInterModelDevi:: NNPInterModelDevi (const vector & models, const int & gpu_rank) : inited (false), + init_nbor(false), numb_models (0) { get_env_nthreads(num_intra_nthreads, num_inter_nthreads); init(models, gpu_rank); } +NNPInterModelDevi::~NNPInterModelDevi() { +#ifdef USE_CUDA_TOOLKIT + if (init_nbor) { + cudaErrcheck(cudaFree(ilist)); + cudaErrcheck(cudaFree(jrange)); + cudaErrcheck(cudaFree(jlist)); + cudaErrcheck(cudaFree(array_int)); + cudaErrcheck(cudaFree(array_longlong)); + cudaErrcheck(cudaFree(array_double)); + } +#endif +} + #ifdef USE_CUDA_TOOLKIT void NNPInterModelDevi:: @@ -784,6 +1200,14 @@ init (const vector & models, const int & gpu_rank) // cell_size = rcut; // ntypes = get_ntypes(); inited = true; + + init_nbor = false; + array_int = NULL; + array_double = NULL; + array_longlong = NULL; + ilist = NULL; jrange = NULL; jlist = NULL; + ilist_size = 0; jrange_size = 0; jlist_size = 0; + arr_int_size = 0; 
arr_ll_size = 0; arr_dou_size = 0; } #else void @@ -813,6 +1237,14 @@ init (const vector & models, const int & gpu_rank) // cell_size = rcut; // ntypes = get_ntypes(); inited = true; + + init_nbor = false; + array_int = NULL; + array_double = NULL; + array_longlong = NULL; + ilist = NULL; jrange = NULL; jlist = NULL; + ilist_size = 0; jrange_size = 0; jlist_size = 0; + arr_int_size = 0; arr_ll_size = 0; arr_dou_size = 0; } #endif @@ -840,6 +1272,128 @@ get_scalar(const string name) const return myrcut; } +// init the tmp array data +std::vector > +NNPInterModelDevi:: +get_sel () const +{ + std::vector > sec; + for (int ii = 0; ii < numb_models; ii++) { + std::vector sel; + std::istringstream is(graph_info(graph_defs[ii])); + std::string line = ""; + while(is >> line) { + if (line.find("sel") != line.npos) { + while (std::getline(is, line) && line != "}") { + if (line.find("i:") != line.npos) { + sel.push_back(atoi((line.substr(line.find("i:") + 2)).c_str())); + } + } break; + } + if (line.find("sel_a") != line.npos) { + while (std::getline(is, line) && line != "}") { + if (line.find("i:") != line.npos) { + sel.push_back(atoi((line.substr(line.find("i:") + 2)).c_str())); + } + } break; + } + } + sec.push_back(sel); + } + return sec; +} + +void +NNPInterModelDevi:: +cum_sum (const std::vector > n_sel) +{ + for (int ii = 0; ii < numb_models; ++ii) { + std::vector _sec; + _sec.resize (n_sel[ii].size() + 1); + _sec[0] = 0; + for (int jj = 1; jj < _sec.size(); ++jj) { + _sec[jj] = _sec[jj-1] + n_sel[ii][jj-1]; + } + sec.push_back(_sec); + } +} + +void +NNPInterModelDevi:: +get_max_sec() +{ + for (int ii = 0; ii < numb_models; ii++) { + this->max_sec_size = max_sec_size < sec[ii].size() ? sec[ii].size() : max_sec_size; + this->max_sec_back = max_sec_back < sec[ii].back() ? 
sec[ii].back() : max_sec_back; + } +} + +#ifdef USE_CUDA_TOOLKIT +void +NNPInterModelDevi:: +update_nbor(const InternalNeighborList & nlist, const int nloc) +{ + if (!init_nbor) { + cum_sum(get_sel()); + get_max_sec(); + cudaErrcheck(cudaMalloc((void**)&ilist, sizeof(int) * nlist.ilist.size())); + cudaErrcheck(cudaMalloc((void**)&jrange, sizeof(int) * nlist.jrange.size())); + cudaErrcheck(cudaMalloc((void**)&jlist, sizeof(int) * nlist.jlist.size())); + cudaErrcheck(cudaMalloc((void**)&array_int, sizeof(int) * (max_sec_size + nloc * max_sec_size + nloc))); + cudaErrcheck(cudaMalloc((void**)&array_longlong, sizeof(unsigned long long) * nloc * MAGIC_NUMBER * 2)); + #ifdef HIGH_PREC + cudaErrcheck(cudaMalloc((void**)&array_double, sizeof(compute_t) * nloc * max_sec_back * 3)); + #else + cudaErrcheck(cudaMalloc((void**)&array_double, sizeof(compute_t) * nloc * max_sec_back * 3)); + #endif + ilist_size = nlist.ilist.size(); + jrange_size = nlist.jrange.size(); + jlist_size = nlist.jlist.size(); + arr_int_size = max_sec_size + nloc * max_sec_size + nloc; + arr_ll_size = nloc * MAGIC_NUMBER * 2; + arr_dou_size = nloc * max_sec_back * 3; + init_nbor = true; + } + if (ilist_size < nlist.ilist.size()) { + cudaErrcheck(cudaFree(ilist)); + cudaErrcheck(cudaMalloc((void**)&ilist, sizeof(int) * nlist.ilist.size())); + ilist_size = nlist.ilist.size(); + } + if (jrange_size < nlist.jrange.size()) { + cudaErrcheck(cudaFree(jrange)); + cudaErrcheck(cudaMalloc((void**)&jrange, sizeof(int) * nlist.jrange.size())); + jrange_size = nlist.jrange.size(); + } + if (jlist_size < nlist.jlist.size()) { + cudaErrcheck(cudaFree(jlist)); + cudaErrcheck(cudaMalloc((void**)&jlist, sizeof(int) * nlist.jlist.size())); + jlist_size = nlist.jlist.size(); + } + if (arr_int_size < max_sec_size + nloc * max_sec_size + nloc) { + cudaErrcheck(cudaFree(array_int)); + cudaErrcheck(cudaMalloc((void**)&array_int, sizeof(int) * (max_sec_size + nloc * max_sec_size + nloc))); + arr_int_size = max_sec_size + nloc * max_sec_size + nloc; + } + if (arr_ll_size < nloc * MAGIC_NUMBER * 2) { + cudaErrcheck(cudaFree(array_longlong)); + cudaErrcheck(cudaMalloc((void**)&array_longlong, sizeof(unsigned long long) * nloc * MAGIC_NUMBER * 2)); + arr_ll_size = nloc * MAGIC_NUMBER * 2; + } + if (arr_dou_size < nloc * max_sec_back * 3) { + cudaErrcheck(cudaFree(array_double)); + #ifdef HIGH_PREC + cudaErrcheck(cudaMalloc((void**)&array_double, sizeof(compute_t) * nloc * max_sec_back * 3)); + #else + cudaErrcheck(cudaMalloc((void**)&array_double, sizeof(compute_t) * nloc * max_sec_back * 3)); + #endif + arr_dou_size = nloc * max_sec_back * 3; + } + cudaErrcheck(cudaMemcpy(ilist, &nlist.ilist[0], sizeof(int) * nlist.ilist.size(), cudaMemcpyHostToDevice)); + cudaErrcheck(cudaMemcpy(jrange, &nlist.jrange[0], sizeof(int) * nlist.jrange.size(), cudaMemcpyHostToDevice)); + cudaErrcheck(cudaMemcpy(jlist, &nlist.jlist[0], sizeof(int) * nlist.jlist.size(), cudaMemcpyHostToDevice)); +} +#endif //USE_CUDA_TOOLKIT + void NNPInterModelDevi:: validate_fparam_aparam(const int & nloc, @@ -868,7 +1422,7 @@ compute (ENERGYTYPE & dener, { if (numb_models == 0) return; - NNPAtomMap nnpmap (datype_.begin(), datype_.end()); + nnpmap = NNPAtomMap (datype_.begin(), datype_.end()); validate_fparam_aparam(nnpmap.get_type().size(), fparam, aparam); std::vector> input_tensors; @@ -911,31 +1465,42 @@ compute (vector & all_energy, const vector & dbox, const int nghost, const LammpsNeighborList & lmp_list, + const int & ago, const vector & fparam, const vector & aparam) { if 
(numb_models == 0) return; int nall = dcoord_.size() / 3; int nloc = nall - nghost; - NNPAtomMap nnpmap (datype_.begin(), datype_.begin() + nloc); - assert (nloc == nnpmap.get_type().size()); validate_fparam_aparam(nloc, fparam, aparam); - - InternalNeighborList nlist; - convert_nlist_lmp_internal (nlist, lmp_list); - shuffle_nlist (nlist, nnpmap); - std::vector> input_tensors; - int ret = make_input_tensors (input_tensors, dcoord_, ntypes, datype_, dbox, nlist, fparam, aparam, nnpmap, nghost); - assert (nloc == ret); - all_energy.resize (numb_models); - all_force.resize (numb_models); - all_virial.resize (numb_models); + // agp == 0 means that the LAMMPS nbor list has been updated + if (ago == 0) { + nnpmap = NNPAtomMap (datype_.begin(), datype_.begin() + nloc); + assert (nloc == nnpmap.get_type().size()); - for (unsigned ii = 0; ii < numb_models; ++ii){ - run_model (all_energy[ii], all_force[ii], all_virial[ii], sessions[ii], input_tensors, nnpmap, nghost); - } + // InternalNeighborList nlist; + convert_nlist_lmp_internal (nlist, lmp_list); + shuffle_nlist (nlist, nnpmap); + #ifdef USE_CUDA_TOOLKIT + update_nbor(nlist, nloc); + #endif + + } + #ifdef USE_CUDA_TOOLKIT + int ret = make_input_tensors (input_tensors, dcoord_, ntypes, datype_, dbox, ilist, jrange, jlist, array_int, array_longlong, array_double, fparam, aparam, nnpmap, nghost); + #else + int ret = make_input_tensors (input_tensors, dcoord_, ntypes, datype_, dbox, nlist, fparam, aparam, nnpmap, nghost); + #endif + + all_energy.resize (numb_models); + all_force.resize (numb_models); + all_virial.resize (numb_models); + assert (nloc == ret); + for (unsigned ii = 0; ii < numb_models; ++ii) { + run_model (all_energy[ii], all_force[ii], all_virial[ii], sessions[ii], input_tensors, nnpmap, nghost); + } } void @@ -950,33 +1515,44 @@ compute (vector & all_energy, const vector & dbox, const int nghost, const LammpsNeighborList & lmp_list, + const int & ago, const vector & fparam, const vector & aparam) { if (numb_models == 0) return; int nall = dcoord_.size() / 3; int nloc = nall - nghost; - NNPAtomMap nnpmap (datype_.begin(), datype_.begin() + nloc); - assert (nloc == nnpmap.get_type().size()); validate_fparam_aparam(nloc, fparam, aparam); - - InternalNeighborList nlist; - convert_nlist_lmp_internal (nlist, lmp_list); - shuffle_nlist (nlist, nnpmap); - std::vector> input_tensors; - int ret = make_input_tensors (input_tensors, dcoord_, ntypes, datype_, dbox, nlist, fparam, aparam, nnpmap, nghost); - assert (nloc == ret); - - all_energy.resize (numb_models); - all_force .resize (numb_models); - all_virial.resize (numb_models); - all_atom_energy.resize (numb_models); - all_atom_virial.resize (numb_models); - for (unsigned ii = 0; ii < numb_models; ++ii){ - run_model (all_energy[ii], all_force[ii], all_virial[ii], all_atom_energy[ii], all_atom_virial[ii], sessions[ii], input_tensors, nnpmap, nghost); - } + // agp == 0 means that the LAMMPS nbor list has been updated + if (ago == 0) { + nnpmap = NNPAtomMap (datype_.begin(), datype_.begin() + nloc); + assert (nloc == nnpmap.get_type().size()); + + // InternalNeighborList nlist; + convert_nlist_lmp_internal (nlist, lmp_list); + shuffle_nlist (nlist, nnpmap); + #ifdef USE_CUDA_TOOLKIT + update_nbor(nlist, nloc); + #endif + + } + #ifdef USE_CUDA_TOOLKIT + int ret = make_input_tensors (input_tensors, dcoord_, ntypes, datype_, dbox, ilist, jrange, jlist, array_int, array_longlong, array_double, fparam, aparam, nnpmap, nghost); + #else + int ret = make_input_tensors (input_tensors, dcoord_, ntypes, 
datype_, dbox, nlist, fparam, aparam, nnpmap, nghost); + #endif + + all_energy.resize (numb_models); + all_force .resize (numb_models); + all_virial.resize (numb_models); + all_atom_energy.resize (numb_models); + all_atom_virial.resize (numb_models); + assert (nloc == ret); + for (unsigned ii = 0; ii < numb_models; ++ii) { + run_model (all_energy[ii], all_force[ii], all_virial[ii], all_atom_energy[ii], all_atom_virial[ii], sessions[ii], input_tensors, nnpmap, nghost); + } } void @@ -1114,3 +1690,22 @@ compute_std_f (vector & std, } } +void +NNPInterModelDevi:: +compute_relative_std_f (vector &std, + const vector &avg, + const VALUETYPE eps) +{ + unsigned nloc = std.size(); + for (unsigned ii = 0; ii < nloc; ++ii){ + const VALUETYPE * tmp_avg = &(avg[ii*3]); + VALUETYPE vdiff[3]; + vdiff[0] = tmp_avg[0]; + vdiff[1] = tmp_avg[1]; + vdiff[2] = tmp_avg[2]; + VALUETYPE f_norm = sqrt(MathUtilities::dot(vdiff, vdiff)); + // relative std = std/(abs(f)+eps) + std[ii] /= f_norm + eps; + } +} + diff --git a/source/lmp/env.sh.in b/source/lmp/env.sh.in index 9b7676fcd9..2a9253ba70 100644 --- a/source/lmp/env.sh.in +++ b/source/lmp/env.sh.in @@ -8,4 +8,4 @@ TF_RPATH=`echo $TENSORFLOW_LIBRARY_PATH | sed "s/;/ -Wl,-rpath=/g"` NNP_INC=" -std=c++11 @PREC_DEF@ @TTM_DEF@ -I$TF_INCLUDE_DIRS -I$DEEPMD_ROOT/include/deepmd " NNP_PATH=" -L$TF_LIBRARY_PATH -L$DEEPMD_ROOT/lib" -NNP_LIB=" -Wl,--no-as-needed -l@LIB_DEEPMD_OP@ -l@LIB_DEEPMD@ -ltensorflow_cc -ltensorflow_framework -Wl,-rpath=$TF_RPATH -Wl,-rpath=$DEEPMD_ROOT/lib" +NNP_LIB=" -Wl,--no-as-needed -l@LIB_DEEPMD_OP@ -l@LIB_DEEPMD_OP_CUDA@ -l@LIB_DEEPMD@ -ltensorflow_cc -ltensorflow_framework -Wl,-rpath=$TF_RPATH -Wl,-rpath=$DEEPMD_ROOT/lib" diff --git a/source/lmp/pair_nnp.cpp b/source/lmp/pair_nnp.cpp index 372adcba77..0b74dd38eb 100644 --- a/source/lmp/pair_nnp.cpp +++ b/source/lmp/pair_nnp.cpp @@ -1,4 +1,5 @@ #include +#include #include #include #include "atom.h" @@ -42,6 +43,7 @@ static int stringCmp(const void *a, const void* b) int PairNNP::get_node_rank() { char host_name[MPI_MAX_PROCESSOR_NAME]; + memset(host_name, '\0', sizeof(char) * MPI_MAX_PROCESSOR_NAME); char (*host_names)[MPI_MAX_PROCESSOR_NAME]; int n, namelen, color, rank, nprocs, myrank; size_t bytes; @@ -53,6 +55,10 @@ int PairNNP::get_node_rank() { bytes = nprocs * sizeof(char[MPI_MAX_PROCESSOR_NAME]); host_names = (char (*)[MPI_MAX_PROCESSOR_NAME]) malloc(bytes); + for (int ii = 0; ii < nprocs; ii++) { + memset(host_names[ii], '\0', sizeof(char) * MPI_MAX_PROCESSOR_NAME); + } + strcpy(host_names[rank], host_name); for (n=0; n 1 ? 0 : neighbor->ago; + int ago = neighbor->ago; + if (numb_models > 1) { + if (multi_models_no_mod_devi && (out_freq > 0 && update->ntimestep % out_freq == 0)) { + ago = 0; + } + else if (multi_models_mod_devi && (out_freq == 0 || update->ntimestep % out_freq != 0)) { + ago = 0; + } + } // compute - bool single_model = (numb_models == 1); - bool multi_models_no_mod_devi = (numb_models > 1 && (out_freq == 0 || update->ntimestep % out_freq != 0)); - bool multi_models_mod_devi = (numb_models > 1 && (out_freq > 0 && update->ntimestep % out_freq == 0)); + single_model = (numb_models == 1); + multi_models_no_mod_devi = (numb_models > 1 && (out_freq == 0 || update->ntimestep % out_freq != 0)); + multi_models_mod_devi = (numb_models > 1 && (out_freq > 0 && update->ntimestep % out_freq == 0)); if (do_ghost) { LammpsNeighborList lmp_list (list->inum, list->ilist, list->numneigh, list->firstneigh); if (single_model || multi_models_no_mod_devi) { if ( ! 
(eflag_atom || vflag_atom) ) { #ifdef HIGH_PREC - nnp_inter.compute (dener, dforce, dvirial, dcoord, dtype, dbox, nghost, lmp_list, fparam, daparam); + nnp_inter.compute (dener, dforce, dvirial, dcoord, dtype, dbox, nghost, lmp_list, ago, fparam, daparam); #else vector dcoord_(dcoord.size()); vector dbox_(dbox.size()); @@ -306,7 +324,7 @@ void PairNNP::compute(int eflag, int vflag) vector dforce_(dforce.size(), 0); vector dvirial_(dvirial.size(), 0); double dener_ = 0; - nnp_inter.compute (dener_, dforce_, dvirial_, dcoord_, dtype, dbox_, nghost, lmp_list, fparam, daparam); + nnp_inter.compute (dener_, dforce_, dvirial_, dcoord_, dtype, dbox_, nghost, lmp_list, ago, fparam, daparam); for (unsigned dd = 0; dd < dforce.size(); ++dd) dforce[dd] = dforce_[dd]; for (unsigned dd = 0; dd < dvirial.size(); ++dd) dvirial[dd] = dvirial_[dd]; dener = dener_; @@ -317,7 +335,7 @@ void PairNNP::compute(int eflag, int vflag) vector deatom (nall * 1, 0); vector dvatom (nall * 9, 0); #ifdef HIGH_PREC - nnp_inter.compute (dener, dforce, dvirial, deatom, dvatom, dcoord, dtype, dbox, nghost, lmp_list, fparam, daparam); + nnp_inter.compute (dener, dforce, dvirial, deatom, dvatom, dcoord, dtype, dbox, nghost, lmp_list, ago, fparam, daparam); #else vector dcoord_(dcoord.size()); vector dbox_(dbox.size()); @@ -328,7 +346,7 @@ void PairNNP::compute(int eflag, int vflag) vector deatom_(dforce.size(), 0); vector dvatom_(dforce.size(), 0); double dener_ = 0; - nnp_inter.compute (dener_, dforce_, dvirial_, deatom_, dvatom_, dcoord_, dtype, dbox_, nghost, lmp_list, fparam, daparam); + nnp_inter.compute (dener_, dforce_, dvirial_, deatom_, dvatom_, dcoord_, dtype, dbox_, nghost, lmp_list, ago, fparam, daparam); for (unsigned dd = 0; dd < dforce.size(); ++dd) dforce[dd] = dforce_[dd]; for (unsigned dd = 0; dd < dvirial.size(); ++dd) dvirial[dd] = dvirial_[dd]; for (unsigned dd = 0; dd < deatom.size(); ++dd) deatom[dd] = deatom_[dd]; @@ -358,7 +376,7 @@ void PairNNP::compute(int eflag, int vflag) vector> all_virial; vector> all_atom_energy; vector> all_atom_virial; - nnp_inter_model_devi.compute(all_energy, all_force, all_virial, all_atom_energy, all_atom_virial, dcoord, dtype, dbox, nghost, lmp_list, fparam, daparam); + nnp_inter_model_devi.compute(all_energy, all_force, all_virial, all_atom_energy, all_atom_virial, dcoord, dtype, dbox, nghost, lmp_list, ago, fparam, daparam); // nnp_inter_model_devi.compute_avg (dener, all_energy); // nnp_inter_model_devi.compute_avg (dforce, all_force); // nnp_inter_model_devi.compute_avg (dvirial, all_virial); @@ -384,7 +402,7 @@ void PairNNP::compute(int eflag, int vflag) vector> all_virial_; vector> all_atom_energy_; vector> all_atom_virial_; - nnp_inter_model_devi.compute(all_energy_, all_force_, all_virial_, all_atom_energy_, all_atom_virial_, dcoord_, dtype, dbox_, nghost, lmp_list, fparam, daparam); + nnp_inter_model_devi.compute(all_energy_, all_force_, all_virial_, all_atom_energy_, all_atom_virial_, dcoord_, dtype, dbox_, nghost, lmp_list, ago, fparam, daparam); // nnp_inter_model_devi.compute_avg (dener_, all_energy_); // nnp_inter_model_devi.compute_avg (dforce_, all_force_); // nnp_inter_model_devi.compute_avg (dvirial_, all_virial_); @@ -432,6 +450,9 @@ void PairNNP::compute(int eflag, int vflag) vector tmp_avg_f; nnp_inter_model_devi.compute_avg (tmp_avg_f, all_force); nnp_inter_model_devi.compute_std_f (std_f, tmp_avg_f, all_force); + if (out_rel == 1){ + nnp_inter_model_devi.compute_relative_std_f (std_f, tmp_avg_f, eps); + } #else vector tmp_avg_f_, std_f_; for 
(unsigned ii = 0; ii < all_force_.size(); ++ii){ @@ -443,6 +464,9 @@ void PairNNP::compute(int eflag, int vflag) nnp_inter_model_devi.compute_std_f (std_f_, tmp_avg_f_, all_force_); std_f.resize(std_f_.size()); for (int dd = 0; dd < std_f_.size(); ++dd) std_f[dd] = std_f_[dd]; + if (out_rel == 1){ + nnp_inter_model_devi.compute_relative_std_f (std_f_, tmp_avg_f_, eps); + } #endif double min = numeric_limits::max(), max = 0, avg = 0; ana_st(max, min, avg, std_f, nlocal); @@ -496,16 +520,7 @@ void PairNNP::compute(int eflag, int vflag) // 1. If the atom_style is not atomic (e.g. charge), the order of std_f is different from that of atom ids. // 2. std_f is not gathered by MPI. for (int dd = 0; dd < all_nlocal; ++dd) { - if (out_rel == 1){ - // relative std = std/(abs(f)+1) -#ifdef HIGH_PREC - fp << " " << setw(18) << std_f[dd] / (fabs(tmp_avg_f[dd]) + eps); -#else - fp << " " << setw(18) << std_f[dd] / (fabsf(tmp_avg_f_[dd]) + eps); -#endif - } else { fp << " " << setw(18) << std_f[dd]; - } } } fp << endl; diff --git a/source/lmp/pair_nnp.h.in b/source/lmp/pair_nnp.h.in index fbc71a4e6f..d41546438c 100644 --- a/source/lmp/pair_nnp.h.in +++ b/source/lmp/pair_nnp.h.in @@ -82,6 +82,9 @@ private: int dim_aparam; int out_each; int out_rel; + bool single_model; + bool multi_models_mod_devi; + bool multi_models_no_mod_devi; #ifdef HIGH_PREC vector fparam; vector aparam; diff --git a/source/op/CMakeLists.txt b/source/op/CMakeLists.txt index 58051ff45b..6fdc02cfd6 100644 --- a/source/op/CMakeLists.txt +++ b/source/op/CMakeLists.txt @@ -4,12 +4,24 @@ set(OP_LIB ${PROJECT_SOURCE_DIR}/lib/src/SimulationRegion.cpp ${PROJECT_SOURCE_D set (OP_CXX_FLAG -D_GLIBCXX_USE_CXX11_ABI=${OP_CXX_ABI} ) file(GLOB OP_SRC prod_force.cc prod_virial.cc descrpt.cc descrpt_se_a.cc descrpt_se_r.cc tab_inter.cc prod_force_se_a.cc prod_virial_se_a.cc prod_force_se_r.cc prod_virial_se_r.cc soft_min.cc soft_min_force.cc soft_min_virial.cc ) +file(GLOB OP_CUDA_SRC prod_force.cc prod_virial.cc descrpt.cc descrpt_se_a_gpu.cc descrpt_se_r_gpu.cc tab_inter.cc prod_force_se_a_gpu.cc prod_virial_se_a_gpu.cc prod_force_se_r_gpu.cc prod_virial_se_r_gpu.cc soft_min.cc soft_min_force.cc soft_min_virial.cc ) file(GLOB OP_GRADS_SRC prod_force_grad.cc prod_force_se_a_grad.cc prod_force_se_r_grad.cc prod_virial_grad.cc prod_virial_se_a_grad.cc prod_virial_se_r_grad.cc soft_min_force_grad.cc soft_min_virial_grad.cc ) file(GLOB OP_PY *.py) -if (BUILD_CPP_IF) - add_library(${LIB_DEEPMD_OP} SHARED ${OP_SRC}) +if (BUILD_CPP_IF) + if (USE_CUDA_TOOLKIT) + add_library(${LIB_DEEPMD_OP} SHARED ${OP_CUDA_SRC}) + add_subdirectory(cuda) + find_package(CUDA REQUIRED) + include_directories(${CUDA_INCLUDE_DIRS}) + set (EXTRA_LIBS ${EXTRA_LIBS} deepmd_op_cuda) + target_link_libraries (${LIB_DEEPMD_OP} ${EXTRA_LIBS}) + target_link_libraries (${LIB_DEEPMD_OP} ${CUDA_LIBRARIES}) + else (USE_CUDA_TOOLKIT) + add_library(${LIB_DEEPMD_OP} SHARED ${OP_SRC}) + endif (USE_CUDA_TOOLKIT) endif (BUILD_CPP_IF) + if (BUILD_PY_IF) add_library(op_abi SHARED ${OP_SRC} ${OP_LIB}) add_library(op_grads SHARED ${OP_GRADS_SRC}) diff --git a/source/op/cuda/CMakeLists.txt b/source/op/cuda/CMakeLists.txt new file mode 100644 index 0000000000..a9847abd3e --- /dev/null +++ b/source/op/cuda/CMakeLists.txt @@ -0,0 +1,32 @@ +# required cmake version +cmake_minimum_required(VERSION 3.0) +# project name +project(deepmd_op_cuda) + +# SET(CUDA_SEPARABLE_COMPILATION ON) +find_package(CUDA REQUIRED) +if (NOT CUDA_FOUND) + message(STATUS "CUDA not found. 
Project will not be built.") +endif(NOT CUDA_FOUND) + +# set c++ version c++11 +SET(CMAKE_CXX_STANDARD 11) +SET(CMAKE_CUDA_STANDARD 11) +# nvcc -o libdeepmd_op_cuda.so -I/usr/local/cub-1.8.0 -rdc=true -DHIGH_PREC=true -gencode arch=compute_61,code=sm_61 -shared -Xcompiler -fPIC deepmd_op.cu -L/usr/local/cuda/lib64 -lcudadevrt +# very important here! Include path to cub. +include_directories(cub) +# nvcc flags +set(CUDA_NVCC_FLAGS -gencode arch=compute_60,code=sm_60; # Pascal – GP100/Tesla P100 – DGX-1 (Generic Pascal) + -gencode arch=compute_61,code=sm_61; # Pascal - GTX 1080, GTX 1070, GTX 1060, GTX 1050, GTX 1030, Titan Xp, Tesla P40, Tesla P4, Discrete GPU on the NVIDIA Drive PX2 + -gencode arch=compute_70,code=sm_70; # Volta - GV100/Tesla V100, GTX 1180 (GV104) + -gencode arch=compute_75,code=sm_75; # Turing - RTX 2080, Titan RTX, Quadro R8000 + -O3; -Xcompiler -fPIC; + ) + +set (SOURCE_FILES + descrpt_se_a.cu descrpt_se_r.cu prod_force_se_a.cu prod_force_se_r.cu prod_virial_se_a.cu prod_virial_se_r.cu +) + +cuda_add_library(deepmd_op_cuda SHARED ${SOURCE_FILES}) + +install(TARGETS deepmd_op_cuda DESTINATION lib/) diff --git a/source/op/cuda/cub b/source/op/cuda/cub new file mode 160000 index 0000000000..c3cceac115 --- /dev/null +++ b/source/op/cuda/cub @@ -0,0 +1 @@ +Subproject commit c3cceac115c072fb63df1836ff46d8c60d9eb304 diff --git a/source/op/cuda/descrpt_se_a.cu b/source/op/cuda/descrpt_se_a.cu new file mode 100644 index 0000000000..8893e8a00a --- /dev/null +++ b/source/op/cuda/descrpt_se_a.cu @@ -0,0 +1,382 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#define EIGEN_USE_GPU +#include +#include +#include +#include +#include +#include +#include + +#define MAGIC_NUMBER 256 + +#ifdef HIGH_PREC + typedef double VALUETYPE; +#else + typedef float VALUETYPE; +#endif + +typedef double compute_t; + +typedef unsigned long long int_64; + +#define cudaErrcheck(res) { cudaAssert((res), __FILE__, __LINE__); } +inline void cudaAssert(cudaError_t code, const char *file, int line, bool abort=true) +{ + if (code != cudaSuccess) + { + fprintf(stderr,"cuda assert: %s %s %d\n", cudaGetErrorString(code), file, line); + if (abort) exit(code); + } +} + +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 600 +static __inline__ __device__ double atomicAdd(double* address, double val) { + unsigned long long int* address_as_ull = (unsigned long long int*)address; + unsigned long long int old = *address_as_ull, assumed; + do { + assumed = old; + old = atomicCAS(address_as_ull, assumed, + __double_as_longlong(val + __longlong_as_double(assumed))); + // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) } while (assumed != old); + } while (assumed != old); + return __longlong_as_double(old); +} +#endif + +template < + typename Key, + int BLOCK_THREADS, + int ITEMS_PER_THREAD> +__launch_bounds__ (BLOCK_THREADS) +__global__ void BlockSortKernel( + Key * d_in, + Key * d_out) // Tile of output +{ + enum { TILE_SIZE = BLOCK_THREADS * ITEMS_PER_THREAD }; + // Specialize BlockLoad type for our thread block (uses warp-striped loads for coalescing, then transposes in shared memory to a blocked arrangement) + typedef cub::BlockLoad BlockLoadT; + // Specialize BlockRadixSort type for our thread block + typedef cub::BlockRadixSort BlockRadixSortT; + // Shared memory + __shared__ union TempStorage + { + typename BlockLoadT::TempStorage load; + typename BlockRadixSortT::TempStorage sort; + } temp_storage; + // Per-thread tile items + Key items[ITEMS_PER_THREAD]; + // Our current block's offset + int block_offset = blockIdx.x * TILE_SIZE; + // Load items into a blocked arrangement + BlockLoadT(temp_storage.load).Load(d_in + block_offset, items); + // Barrier for smem reuse + __syncthreads(); + // Sort keys + BlockRadixSortT(temp_storage.sort).SortBlockedToStriped(items); + // Store output in striped fashion + cub::StoreDirectStriped(threadIdx.x, d_out + block_offset, items); +} + +template +__device__ inline T dev_dot(T * arr1, T * arr2) { + return arr1[0] * arr2[0] + arr1[1] * arr2[1] + arr1[2] * arr2[2]; +} + +__device__ inline void spline5_switch(compute_t & vv, + compute_t & dd, + compute_t & xx, + const compute_t & rmin, + const compute_t & rmax) +{ + if (xx < rmin) { + dd = 0; + vv = 1; + } + else if (xx < rmax) { + compute_t uu = (xx - rmin) / (rmax - rmin) ; + compute_t du = 1. 
/ (rmax - rmin) ; + vv = uu*uu*uu * (-6 * uu*uu + 15 * uu - 10) + 1; + dd = ( 3 * uu*uu * (-6 * uu*uu + 15 * uu - 10) + uu*uu*uu * (-12 * uu + 15) ) * du; + } + else { + dd = 0; + vv = 0; + } +} + +__global__ void get_i_idx_se_a(const int nloc, + const int * ilist, + int * i_idx) +{ + const unsigned int idy = blockIdx.x * blockDim.x + threadIdx.x; + if(idy >= nloc) { + return; + } + i_idx[ilist[idy]] = idy; +} + +__global__ void format_nlist_fill_a_se_a(const VALUETYPE * coord, + const int * type, + const int * jrange, + const int * jlist, + const float rcut, + int_64 * key, + int * i_idx) +{ + // <<>> + const unsigned int idx = blockIdx.x; + const unsigned int idy = threadIdx.x; + + const int nsize = jrange[i_idx[idx] + 1] - jrange[i_idx[idx]]; + if (idy >= nsize) { + return; + } + + const int * nei_idx = jlist + jrange[i_idx[idx]]; + // dev_copy(nei_idx, &jlist[jrange[i_idx]], nsize); + + int_64 * key_in = key + idx * MAGIC_NUMBER; + + compute_t diff[3]; + const int & j_idx = nei_idx[idy]; + for (int dd = 0; dd < 3; dd++) { + diff[dd] = coord[j_idx * 3 + dd] - coord[idx * 3 + dd]; + } + compute_t rr = sqrt(dev_dot(diff, diff)); + if (rr <= rcut) { + key_in[idy] = type[j_idx] * 1E15+ (int_64)(rr * 1.0E13) / 100000 * 100000 + j_idx; + } +} + + // bubble_sort(sel_nei, num_nei); +__global__ void format_nlist_fill_b_se_a(int * nlist, + const int nlist_size, + const int nloc, + const int * jrange, + const int * jlist, + int_64 * key, + const int * sec_a, + const int sec_a_size, + int * nei_iter_dev) +{ + + const unsigned int idy = blockIdx.x * blockDim.x + threadIdx.x; + + if(idy >= nloc) { + return; + } + + int * row_nlist = nlist + idy * nlist_size; + int * nei_iter = nei_iter_dev + idy * sec_a_size; + int_64 * key_out = key + nloc * MAGIC_NUMBER + idy * MAGIC_NUMBER; + + for (int ii = 0; ii < sec_a_size; ii++) { + nei_iter[ii] = sec_a[ii]; + } + + for (unsigned int kk = 0; key_out[kk] != key_out[MAGIC_NUMBER - 1]; kk++) { + const int & nei_type = key_out[kk] / 1E15; + if (nei_iter[nei_type] < sec_a[nei_type + 1]) { + row_nlist[nei_iter[nei_type]++] = key_out[kk] % 100000; + } + } +} +//it's ok! 
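The two kernels above communicate through the 64-bit keys held in `array_longlong`: `format_nlist_fill_a_se_a` packs the neighbor type, the truncated distance and the neighbor index into one sortable integer, and `format_nlist_fill_b_se_a` recovers the type and the index again after the block radix sort. Below is a minimal host-side sketch of that layout; it is not part of the patch, and the constants 1e15, 1e13 and 100000 are simply copied from the kernels.

```cpp
// Sketch of the assumed key layout used by the two kernels:
//   key = type * 1e15 + (int_64)(r * 1e13) / 100000 * 100000 + j_idx
// decimal digits 15+  : neighbor type
// decimal digits 5-14 : truncated distance (so keys sort by type, then distance)
// decimal digits 0-4  : neighbor index (requires j_idx < 100000)
#include <cstdio>

typedef unsigned long long int_64;

static int_64 encode_key(int type, double rr, int j_idx) {
  return (int_64)type * 1000000000000000ULL
       + (int_64)(rr * 1.0E13) / 100000 * 100000
       + (int_64)j_idx;
}

int main() {
  int_64 key = encode_key(/*type*/ 1, /*distance*/ 3.25, /*index*/ 42);
  int nei_type = key / 1000000000000000ULL;  // same decode as format_nlist_fill_b_se_a
  int j_idx    = key % 100000;
  std::printf("key = %llu -> type %d, neighbor index %d\n", key, nei_type, j_idx);
  return 0;
}
```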
+ +__global__ void compute_descriptor_se_a (VALUETYPE* descript, + const int ndescrpt, + VALUETYPE* descript_deriv, + const int descript_deriv_size, + VALUETYPE* rij, + const int rij_size, + const int* type, + const VALUETYPE* avg, + const VALUETYPE* std, + int* nlist, + const int nlist_size, + const VALUETYPE* coord, + const VALUETYPE rmin, + const VALUETYPE rmax, + compute_t* sel_a_diff_dev) +{ + // <<>> + const unsigned int idx = blockIdx.x; + const unsigned int idy = threadIdx.x; + const int idx_deriv = idy * 4 * 3; // 4 components time 3 directions + const int idx_value = idy * 4; // 4 components + + // else {return;} + VALUETYPE * row_descript = descript + idx * ndescrpt; + VALUETYPE * row_descript_deriv = descript_deriv + idx * descript_deriv_size; + VALUETYPE * row_rij = rij + idx * rij_size; + compute_t * sel_a_diff = sel_a_diff_dev + idx * nlist_size * 3; + int * row_nlist = nlist + idx * nlist_size; + + if (row_nlist[idy] >= 0) { + const int & j_idx = row_nlist[idy]; + for (int kk = 0; kk < 3; kk++) { + sel_a_diff[idy * 3 + kk] = coord[j_idx * 3 + kk] - coord[idx * 3 + kk]; + row_rij[idy * 3 + kk] = sel_a_diff[idy * 3 + kk]; + } + const compute_t * rr = &sel_a_diff[idy * 3 + 0]; + compute_t nr2 = dev_dot(rr, rr); + compute_t inr = 1./sqrt(nr2); + compute_t nr = nr2 * inr; + compute_t inr2 = inr * inr; + compute_t inr4 = inr2 * inr2; + compute_t inr3 = inr4 * nr; + compute_t sw, dsw; + spline5_switch(sw, dsw, nr, rmin, rmax); + row_descript[idx_value + 0] = (1./nr) ;//* sw; + row_descript[idx_value + 1] = (rr[0] / nr2) ;//* sw; + row_descript[idx_value + 2] = (rr[1] / nr2) ;//* sw; + row_descript[idx_value + 3] = (rr[2] / nr2) ;//* sw; + + row_descript_deriv[idx_deriv + 0] = (rr[0] * inr3 * sw - row_descript[idx_value + 0] * dsw * rr[0] * inr); // avg[type[(idx_deriv + 0) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 0) % (ndescrpt * 3)) / 3]; + row_descript_deriv[idx_deriv + 1] = (rr[1] * inr3 * sw - row_descript[idx_value + 0] * dsw * rr[1] * inr); // avg[type[(idx_deriv + 1) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 1) % (ndescrpt * 3)) / 3]; + row_descript_deriv[idx_deriv + 2] = (rr[2] * inr3 * sw - row_descript[idx_value + 0] * dsw * rr[2] * inr); // avg[type[(idx_deriv + 2) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 2) % (ndescrpt * 3)) / 3]; + // ****deriv of component x/r2 + row_descript_deriv[idx_deriv + 3] = ((2. * rr[0] * rr[0] * inr4 - inr2) * sw - row_descript[idx_value + 1] * dsw * rr[0] * inr); // avg[type[(idx_deriv + 3) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 3) % (ndescrpt * 3)) / 3]; + row_descript_deriv[idx_deriv + 4] = ((2. * rr[0] * rr[1] * inr4 ) * sw - row_descript[idx_value + 1] * dsw * rr[1] * inr); // avg[type[(idx_deriv + 4) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 4) % (ndescrpt * 3)) / 3]; + row_descript_deriv[idx_deriv + 5] = ((2. * rr[0] * rr[2] * inr4 ) * sw - row_descript[idx_value + 1] * dsw * rr[2] * inr); // avg[type[(idx_deriv + 5) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 5) % (ndescrpt * 3)) / 3]; + // ***deriv of component y/r2 + row_descript_deriv[idx_deriv + 6] = ((2. * rr[1] * rr[0] * inr4 ) * sw - row_descript[idx_value + 2] * dsw * rr[0] * inr); // avg[type[(idx_deriv + 6) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 6) % (ndescrpt * 3)) / 3]; + row_descript_deriv[idx_deriv + 7] = ((2. * rr[1] * rr[1] * inr4 - inr2) * sw - row_descript[idx_value + 2] * dsw * rr[1] * inr); // avg[type[(idx_deriv + 7) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 7) % (ndescrpt * 3)) / 3]; + row_descript_deriv[idx_deriv + 8] = ((2. 
* rr[1] * rr[2] * inr4 ) * sw - row_descript[idx_value + 2] * dsw * rr[2] * inr); // avg[type[(idx_deriv + 8) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 8) % (ndescrpt * 3)) / 3]; + // ***deriv of component z/r2 + row_descript_deriv[idx_deriv + 9] = ((2. * rr[2] * rr[0] * inr4 ) * sw - row_descript[idx_value + 3] * dsw * rr[0] * inr); // avg[type[(idx_deriv + 9) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 9) % (ndescrpt * 3)) / 3]; + row_descript_deriv[idx_deriv +10] = ((2. * rr[2] * rr[1] * inr4 ) * sw - row_descript[idx_value + 3] * dsw * rr[1] * inr); // avg[type[(idx_deriv + 10) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 10) % (ndescrpt * 3)) / 3]; + row_descript_deriv[idx_deriv +11] = ((2. * rr[2] * rr[2] * inr4 - inr2) * sw - row_descript[idx_value + 3] * dsw * rr[2] * inr); // avg[type[(idx_deriv + 11) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 11) % (ndescrpt * 3)) / 3]; + // 4 value components + row_descript[idx_value + 0] *= sw; // * descript[idx * ndescrpt + idx_value + 0]);// - avg[type[idx] * ndescrpt + idx_value + 0]) / std[type[idx] * ndescrpt + idx_value + 0]; + row_descript[idx_value + 1] *= sw; // * descript[idx * ndescrpt + idx_value + 1]);// - avg[type[idx] * ndescrpt + idx_value + 1]) / std[type[idx] * ndescrpt + idx_value + 1]; + row_descript[idx_value + 2] *= sw; // * descript[idx * ndescrpt + idx_value + 2]);// - avg[type[idx] * ndescrpt + idx_value + 2]) / std[type[idx] * ndescrpt + idx_value + 2]; + row_descript[idx_value + 3] *= sw; // * descript[idx * ndescrpt + idx_value + 3]);// - avg[type[idx] * ndescrpt + idx_value + 3]) / std[type[idx] * ndescrpt + idx_value + 3]; + } + + for (int ii = 0; ii < 4; ii++) { + row_descript[idx_value + ii] = (row_descript[idx_value + ii] - avg[type[idx] * ndescrpt + idx_value + ii]) / std[type[idx] * ndescrpt + idx_value + ii]; + } + // idy nloc, idx ndescrpt * 3 + // descript_deriv[idy * ndescrpt * 3 + idx] = (descript_deriv_dev[idy * (ndescrpt * 3) + idx]) / std[type[idy] * ndescrpt + idx / 3]; + for (int ii = 0; ii < 12; ii++) { + row_descript_deriv[idx_deriv + ii] /= std[type[idx] * ndescrpt + (idx_deriv + ii) / 3]; + } +} + +void DescrptSeALauncher(const VALUETYPE* coord, + const int* type, + const int* ilist, + const int* jrange, + const int* jlist, + int* array_int, + unsigned long long* array_longlong, + compute_t* array_double, + const VALUETYPE* avg, + const VALUETYPE* std, + VALUETYPE* descript, + VALUETYPE* descript_deriv, + VALUETYPE* rij, + int* nlist, + const int& ntypes, + const int& nloc, + const int& nall, + const int& nnei, + const float& rcut_r, + const float& rcut_r_smth, + const int& ndescrpt, + const std::vector& sec_a, + const bool& fill_nei_a +) +{ + const int LEN = 256; + int nblock = (nloc + LEN -1) / LEN; + int * sec_a_dev = array_int; + int * nei_iter = array_int + sec_a.size(); // = new int[sec_a_size]; + int * i_idx = array_int + sec_a.size() + nloc * sec_a.size(); + int_64 * key = array_longlong; + compute_t * sel_a_diff = array_double; // = new VALUETYPE *[nlist_size]; nnei + // int_64 * key = NULL; + // VALUETYPE * sel_a_diff = NULL; // = new VALUETYPE *[nlist_size]; nnei + + cudaError_t res = cudaSuccess; + // res = cudaMalloc((void**)&sec_a_dev, sizeof(int) * sec_a.size()); cudaErrcheck(res); + // res = cudaMalloc((void**)&nei_iter, sizeof(int) * nloc * sec_a.size()); cudaErrcheck(res); + // res = cudaMalloc((void**)&i_idx, sizeof(int) * nloc); cudaErrcheck(res); + // res = cudaMalloc((void**)&key, sizeof(unsigned long long) * nloc * MAGIC_NUMBER * 2); cudaErrcheck(res); + // res = 
cudaMalloc((void**)&sel_a_diff, sizeof(VALUETYPE) * nnei * 3 * nloc); cudaErrcheck(res); + res = cudaMemcpy(sec_a_dev, &sec_a[0], sizeof(int) * sec_a.size(), cudaMemcpyHostToDevice); cudaErrcheck(res); + res = cudaMemset(key, 0xffffffff, sizeof(int_64) * nloc * MAGIC_NUMBER); cudaErrcheck(res); + res = cudaMemset(nlist, -1, sizeof(int) * nloc * nnei); cudaErrcheck(res); + res = cudaMemset(descript, 0.0, sizeof(VALUETYPE) * nloc * ndescrpt); cudaErrcheck(res); + res = cudaMemset(descript_deriv, 0.0, sizeof(VALUETYPE) * nloc * ndescrpt * 3); cudaErrcheck(res); + // res = cudaMemset(rij, 0.0, sizeof(VALUETYPE) * nloc * nnei * 3); cudaErrcheck(res); + + if (fill_nei_a) { + // ~~~ + // cudaProfilerStart(); + get_i_idx_se_a<<>> (nloc, ilist, i_idx); + + format_nlist_fill_a_se_a<<>> ( + coord, + type, + jrange, + jlist, + rcut_r, + key, + i_idx + ); + const int ITEMS_PER_THREAD = 4; + const int BLOCK_THREADS = 64; + // BlockSortKernel<<>> ( + BlockSortKernel <<>> (key, key + nloc * MAGIC_NUMBER); + + format_nlist_fill_b_se_a<<>> ( + nlist, + nnei, + nloc, + jrange, + jlist, + key, + sec_a_dev, + sec_a.size(), + nei_iter + ); + } + + compute_descriptor_se_a<<>> ( + descript, + ndescrpt, + descript_deriv, + ndescrpt * 3, + rij, + nnei * 3, + type, + avg, + std, + nlist, + nnei, + coord, + rcut_r_smth, + rcut_r, + sel_a_diff + ); +//// + // res = cudaFree(sec_a_dev); cudaErrcheck(res); + // res = cudaFree(key); cudaErrcheck(res); + // res = cudaFree(i_idx); cudaErrcheck(res); + // res = cudaFree(nei_iter); cudaErrcheck(res); + // res = cudaFree(sel_a_diff); cudaErrcheck(res); + //output some interesting things... +} \ No newline at end of file diff --git a/source/op/cuda/descrpt_se_r.cu b/source/op/cuda/descrpt_se_r.cu new file mode 100644 index 0000000000..cc7dd4e904 --- /dev/null +++ b/source/op/cuda/descrpt_se_r.cu @@ -0,0 +1,344 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#define EIGEN_USE_GPU +#include +#include +#include +#include +#include +#include +#include +#include + +#define MAGIC_NUMBER 256 + +#ifdef HIGH_PREC + typedef double VALUETYPE; +#else + typedef float VALUETYPE; +#endif + +typedef double compute_t; + +typedef unsigned long long int_64; + +#define cudaErrcheck(res) { cudaAssert((res), __FILE__, __LINE__); } +inline void cudaAssert(cudaError_t code, const char *file, int line, bool abort=true) +{ + if (code != cudaSuccess) + { + fprintf(stderr,"cuda assert: %s %s %d\n", cudaGetErrorString(code), file, line); + if (abort) exit(code); + } +} + +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 600 +static __inline__ __device__ double atomicAdd(double* address, double val) { + unsigned long long int* address_as_ull = (unsigned long long int*)address; + unsigned long long int old = *address_as_ull, assumed; + do { + assumed = old; + old = atomicCAS(address_as_ull, assumed, + __double_as_longlong(val + __longlong_as_double(assumed))); + // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) } while (assumed != old); + } while (assumed != old); + return __longlong_as_double(old); +} +#endif + +template < + typename Key, + int BLOCK_THREADS, + int ITEMS_PER_THREAD> +__launch_bounds__ (BLOCK_THREADS) +__global__ void BlockSortKernel( + Key * d_in, + Key * d_out) // Tile of output +{ + enum { TILE_SIZE = BLOCK_THREADS * ITEMS_PER_THREAD }; + // Specialize BlockLoad type for our thread block (uses warp-striped loads for coalescing, then transposes in shared memory to a blocked arrangement) + typedef cub::BlockLoad BlockLoadT; + // Specialize BlockRadixSort type for our thread block + typedef cub::BlockRadixSort BlockRadixSortT; + // Shared memory + __shared__ union TempStorage + { + typename BlockLoadT::TempStorage load; + typename BlockRadixSortT::TempStorage sort; + } temp_storage; + // Per-thread tile items + Key items[ITEMS_PER_THREAD]; + // Our current block's offset + int block_offset = blockIdx.x * TILE_SIZE; + // Load items into a blocked arrangement + BlockLoadT(temp_storage.load).Load(d_in + block_offset, items); + // Barrier for smem reuse + __syncthreads(); + // Sort keys + BlockRadixSortT(temp_storage.sort).SortBlockedToStriped(items); + // Store output in striped fashion + cub::StoreDirectStriped(threadIdx.x, d_out + block_offset, items); +} + +template +__device__ inline T dev_dot(T * arr1, T * arr2) { + return arr1[0] * arr2[0] + arr1[1] * arr2[1] + arr1[2] * arr2[2]; +} + +__device__ inline void spline5_switch(compute_t & vv, + compute_t & dd, + compute_t & xx, + const compute_t & rmin, + const compute_t & rmax) +{ + if (xx < rmin) { + dd = 0; + vv = 1; + } + else if (xx < rmax) { + compute_t uu = (xx - rmin) / (rmax - rmin) ; + compute_t du = 1. 
/ (rmax - rmin) ; + vv = uu*uu*uu * (-6 * uu*uu + 15 * uu - 10) + 1; + dd = ( 3 * uu*uu * (-6 * uu*uu + 15 * uu - 10) + uu*uu*uu * (-12 * uu + 15) ) * du; + } + else { + dd = 0; + vv = 0; + } +} + +__global__ void get_i_idx_se_r(const int nloc, + const int * ilist, + int * i_idx) +{ + const unsigned int idy = blockIdx.x * blockDim.x + threadIdx.x; + if(idy >= nloc) { + return; + } + i_idx[ilist[idy]] = idy; +} + +__global__ void format_nlist_fill_a_se_r(const VALUETYPE * coord, + const int * type, + const int * jrange, + const int * jlist, + const float rcut, + int_64 * key, + int * i_idx) +{ + // <<>> + const unsigned int idx = blockIdx.x; + const unsigned int idy = threadIdx.x; + + const int nsize = jrange[i_idx[idx] + 1] - jrange[i_idx[idx]]; + if (idy >= nsize) { + return; + } + + const int * nei_idx = jlist + jrange[i_idx[idx]]; + // dev_copy(nei_idx, &jlist[jrange[i_idx]], nsize); + + int_64 * key_in = key + idx * MAGIC_NUMBER; + + compute_t diff[3]; + const int & j_idx = nei_idx[idy]; + for (int dd = 0; dd < 3; dd++) { + diff[dd] = coord[j_idx * 3 + dd] - coord[idx * 3 + dd]; + } + compute_t rr = sqrt(dev_dot(diff, diff)); + if (rr <= rcut) { + key_in[idy] = type[j_idx] * 1E15+ (int_64)(rr * 1.0E13) / 100000 * 100000 + j_idx; + } +} + + // bubble_sort(sel_nei, num_nei); +__global__ void format_nlist_fill_b_se_r(int * nlist, + const int nlist_size, + const int nloc, + const int * jrange, + const int * jlist, + int_64 * key, + const int * sec, + const int sec_a_size, + int * nei_iter_dev) +{ + + const unsigned int idy = blockIdx.x * blockDim.x + threadIdx.x; + + if(idy >= nloc) { + return; + } + + int * row_nlist = nlist + idy * nlist_size; + int * nei_iter = nei_iter_dev + idy * sec_a_size; + int_64 * key_out = key + nloc * MAGIC_NUMBER + idy * MAGIC_NUMBER; + + for (int ii = 0; ii < sec_a_size; ii++) { + nei_iter[ii] = sec[ii]; + } + + for (unsigned int kk = 0; key_out[kk] != key_out[MAGIC_NUMBER - 1]; kk++) { + const int & nei_type = key_out[kk] / 1E15; + if (nei_iter[nei_type] < sec[nei_type + 1]) { + row_nlist[nei_iter[nei_type]++] = key_out[kk] % 100000; + } + } +} +//it's ok! 
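Both `descrpt_se_a.cu` and this file define the same `spline5_switch` smoothing function: for `rmin < x < rmax` and `u = (x - rmin) / (rmax - rmin)` it evaluates `s(u) = u^3 (-6u^2 + 15u - 10) + 1`, which decays from 1 at `rmin` to 0 at `rmax` with vanishing first and second derivatives at both ends, and returns `ds/dx` in `dd`. A plain C++ transcription (illustrative only; the cutoff values 1.0 and 6.0 below are made-up examples) makes the behaviour easy to probe:

```cpp
#include <cstdio>

// CPU copy of the spline5_switch used by both descriptor kernels.
void spline5_switch(double &vv, double &dd, double xx, double rmin, double rmax) {
  if (xx < rmin) { vv = 1; dd = 0; }
  else if (xx < rmax) {
    double uu = (xx - rmin) / (rmax - rmin);
    double du = 1. / (rmax - rmin);
    vv = uu * uu * uu * (-6 * uu * uu + 15 * uu - 10) + 1;
    dd = (3 * uu * uu * (-6 * uu * uu + 15 * uu - 10)
          + uu * uu * uu * (-12 * uu + 15)) * du;
  } else { vv = 0; dd = 0; }
}

int main() {
  double vv, dd;
  // hypothetical cutoffs: rcut_smth = 1.0, rcut = 6.0
  for (double x : {0.5, 1.0, 3.5, 6.0, 6.5}) {
    spline5_switch(vv, dd, x, 1.0, 6.0);
    std::printf("r = %4.2f  s(r) = %8.6f  ds/dr = %9.6f\n", x, vv, dd);
  }
  return 0;
}
```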
+ +__global__ void compute_descriptor_se_r (VALUETYPE* descript, + const int ndescrpt, + VALUETYPE* descript_deriv, + const int descript_deriv_size, + VALUETYPE* rij, + const int rij_size, + const int* type, + const VALUETYPE* avg, + const VALUETYPE* std, + int* nlist, + const int nlist_size, + const VALUETYPE* coord, + const VALUETYPE rmin, + const VALUETYPE rmax, + compute_t* sel_diff_dev) +{ + // <<>> + const unsigned int idx = blockIdx.x; + const unsigned int idy = threadIdx.x; + const int idx_deriv = idy * 3; // 1 components time 3 directions + const int idx_value = idy; // 1 components + + // else {return;} + VALUETYPE * row_descript = descript + idx * ndescrpt; + VALUETYPE * row_descript_deriv = descript_deriv + idx * descript_deriv_size; + VALUETYPE * row_rij = rij + idx * rij_size; + compute_t * sel_diff = sel_diff_dev + idx * nlist_size * 3; + int * row_nlist = nlist + idx * nlist_size; + + if (row_nlist[idy] >= 0) { + const int & j_idx = row_nlist[idy]; + for (int kk = 0; kk < 3; kk++) { + sel_diff[idy * 3 + kk] = coord[j_idx * 3 + kk] - coord[idx * 3 + kk]; + row_rij[idy * 3 + kk] = sel_diff[idy * 3 + kk]; + } + const compute_t * rr = &sel_diff[idy * 3 + 0]; + compute_t nr2 = dev_dot(rr, rr); + compute_t inr = 1./sqrt(nr2); + compute_t nr = nr2 * inr; + compute_t inr2 = inr * inr; + compute_t inr4 = inr2 * inr2; + compute_t inr3 = inr4 * nr; + compute_t sw, dsw; + spline5_switch(sw, dsw, nr, rmin, rmax); + row_descript[idx_value + 0] = (1./nr) ;//* sw; + + row_descript_deriv[idx_deriv + 0] = (rr[0] * inr3 * sw - row_descript[idx_value + 0] * dsw * rr[0] * inr); // avg[type[(idx_deriv + 0) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 0) % (ndescrpt * 3)) / 3]; + row_descript_deriv[idx_deriv + 1] = (rr[1] * inr3 * sw - row_descript[idx_value + 0] * dsw * rr[1] * inr); // avg[type[(idx_deriv + 1) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 1) % (ndescrpt * 3)) / 3]; + row_descript_deriv[idx_deriv + 2] = (rr[2] * inr3 * sw - row_descript[idx_value + 0] * dsw * rr[2] * inr); // avg[type[(idx_deriv + 2) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 2) % (ndescrpt * 3)) / 3]; + // 1 value components + row_descript[idx_value + 0] *= sw; // * descript[idx * ndescrpt + idx_value + 0]);// - avg[type[idx] * ndescrpt + idx_value + 0]) / std[type[idx] * ndescrpt + idx_value + 0]; + } + + for (int ii = 0; ii < 1; ii++) { + row_descript[idx_value + ii] = (row_descript[idx_value + ii] - avg[type[idx] * ndescrpt + idx_value + ii]) / std[type[idx] * ndescrpt + idx_value + ii]; + } + for (int ii = 0; ii < 3; ii++) { + row_descript_deriv[idx_deriv + ii] /= std[type[idx] * ndescrpt + (idx_deriv + ii) / 3]; + } +} + +void DescrptSeRLauncher(const VALUETYPE* coord, + const int* type, + const int* ilist, + const int* jrange, + const int* jlist, + int* array_int, + unsigned long long* array_longlong, + compute_t* array_double, + const VALUETYPE* avg, + const VALUETYPE* std, + VALUETYPE* descript, + VALUETYPE* descript_deriv, + VALUETYPE* rij, + int* nlist, + const int& ntypes, + const int& nloc, + const int& nall, + const int& nnei, + const float& rcut, + const float& rcut_smth, + const int& ndescrpt, + const std::vector& sec, + const bool& fill_nei_a +) +{ + const int LEN = 256; + int nblock = (nloc + LEN -1) / LEN; + int * sec_dev = array_int; + int * nei_iter = array_int + sec.size(); // = new int[sec_a_size]; + int * i_idx = array_int + sec.size() + nloc * sec.size(); + int_64 * key = array_longlong; + compute_t * sel_diff = array_double; // = new VALUETYPE *[nlist_size]; nnei + + cudaError_t res = 
cudaSuccess; + res = cudaMemcpy(sec_dev, &sec[0], sizeof(int) * sec.size(), cudaMemcpyHostToDevice); cudaErrcheck(res); + res = cudaMemset(key, 0xffffffff, sizeof(int_64) * nloc * MAGIC_NUMBER); cudaErrcheck(res); + res = cudaMemset(nlist, -1, sizeof(int) * nloc * nnei); cudaErrcheck(res); + res = cudaMemset(descript, 0.0, sizeof(VALUETYPE) * nloc * ndescrpt); cudaErrcheck(res); + res = cudaMemset(descript_deriv, 0.0, sizeof(VALUETYPE) * nloc * ndescrpt * 3); cudaErrcheck(res); + + if (fill_nei_a) { + // cudaProfilerStart(); + get_i_idx_se_r<<>> (nloc, ilist, i_idx); + + format_nlist_fill_a_se_r<<>> ( + coord, + type, + jrange, + jlist, + rcut, + key, + i_idx + ); + const int ITEMS_PER_THREAD = 4; + const int BLOCK_THREADS = 64; + BlockSortKernel <<>> (key, key + nloc * MAGIC_NUMBER); + format_nlist_fill_b_se_r<<>> ( + nlist, + nnei, + nloc, + jrange, + jlist, + key, + sec_dev, + sec.size(), + nei_iter + ); + } + compute_descriptor_se_r<<>> ( + descript, + ndescrpt, + descript_deriv, + ndescrpt * 3, + rij, + nnei * 3, + type, + avg, + std, + nlist, + nnei, + coord, + rcut_smth, + rcut, + sel_diff + ); +} \ No newline at end of file diff --git a/source/op/cuda/prod_force_se_a.cu b/source/op/cuda/prod_force_se_a.cu new file mode 100644 index 0000000000..080ff8ef75 --- /dev/null +++ b/source/op/cuda/prod_force_se_a.cu @@ -0,0 +1,102 @@ +#include +#include +#include + +#ifdef HIGH_PREC + typedef double VALUETYPE; +#else + typedef float VALUETYPE; +#endif + +#define cudaErrcheck(res) { cudaAssert((res), __FILE__, __LINE__); } +inline void cudaAssert(cudaError_t code, const char *file, int line, bool abort=true) +{ + if (code != cudaSuccess) + { + fprintf(stderr,"cuda assert: %s %s %d\n", cudaGetErrorString(code), file, line); + if (abort) exit(code); + } +} + +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 600 +static __inline__ __device__ double atomicAdd(double* address, double val) { + unsigned long long int* address_as_ull = (unsigned long long int*)address; + unsigned long long int old = *address_as_ull, assumed; + do { + assumed = old; + old = atomicCAS(address_as_ull, assumed, + __double_as_longlong(val + __longlong_as_double(assumed))); + // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) } while (assumed != old); + } while (assumed != old); + return __longlong_as_double(old); +} +#endif + +__global__ void deriv_wrt_center_atom_se_a(VALUETYPE * force, + const VALUETYPE * net_deriv, + const VALUETYPE * in_deriv, + const int ndescrpt) +{ + const unsigned int idx = blockIdx.y; + const unsigned int idy = blockIdx.x * blockDim.x + threadIdx.x; + const unsigned int idz = threadIdx.y; + + if (idy >= ndescrpt) { + return; + } + + atomicAdd(force + idx * 3 + idz, -1.0 * net_deriv[idx * ndescrpt + idy] * in_deriv[idx * ndescrpt * 3 + idy * 3 + idz]); +} + +__global__ void deriv_wrt_neighbors_se_a(VALUETYPE * force, + const VALUETYPE * net_deriv, + const VALUETYPE * in_deriv, + const int * nlist, + const int nloc, + const int nnei, + const int ndescrpt, + const int n_a_sel, + const int n_a_shift) +{ + // idy -> nnei + const unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x; + const unsigned int idy = blockIdx.y; + const unsigned int idz = threadIdx.y; + const unsigned int idw = threadIdx.z; + + if (idx >= nloc) { + return; + } + // deriv wrt neighbors + int j_idx = nlist[idx * nnei + idy]; + if (j_idx < 0) { + return; + } + atomicAdd(force + j_idx * 3 + idz, net_deriv[idx * ndescrpt + idy * 4 + idw] * in_deriv[idx * ndescrpt * 3 + (idy * 4 + idw) * 3 + idz]); +} 
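The launcher that follows runs the two kernels above back to back: `deriv_wrt_center_atom_se_a` subtracts the chain-rule product `net_deriv * in_deriv` from the force on each local atom, one thread per descriptor component, and `deriv_wrt_neighbors_se_a` adds the matching contribution to whichever atom the formatted `nlist` slot points at. Because many threads, and many different local atoms, can target the same output atom, both kernels accumulate with `atomicAdd`. A single-threaded CPU reference of the same bookkeeping (a sketch for clarity, not part of the patch; it assumes the se_a layout `ndescrpt = 4 * nnei` used throughout these files) looks like this:

```cpp
#include <cstdio>
#include <vector>

// CPU reference of ProdForceSeALauncher's accumulation (sequential, so no atomics needed).
void prod_force_se_a_cpu(std::vector<double> &force,            // nall * 3, pre-zeroed
                         const std::vector<double> &net_deriv,  // nloc * ndescrpt
                         const std::vector<double> &in_deriv,   // nloc * ndescrpt * 3
                         const std::vector<int> &nlist,         // nloc * nnei, -1 = empty slot
                         int nloc, int nnei, int ndescrpt) {
  for (int i = 0; i < nloc; ++i) {
    // force on the center atom: minus the product over all descriptor components
    for (int d = 0; d < ndescrpt; ++d)
      for (int k = 0; k < 3; ++k)
        force[i * 3 + k] -= net_deriv[i * ndescrpt + d]
                          * in_deriv[i * ndescrpt * 3 + d * 3 + k];
    // reaction on each listed neighbor: plus the product over its 4 se_a components
    for (int n = 0; n < nnei; ++n) {
      int j = nlist[i * nnei + n];
      if (j < 0) continue;
      for (int w = 0; w < 4; ++w)
        for (int k = 0; k < 3; ++k)
          force[j * 3 + k] += net_deriv[i * ndescrpt + n * 4 + w]
                            * in_deriv[i * ndescrpt * 3 + (n * 4 + w) * 3 + k];
    }
  }
}

int main() {
  // tiny hypothetical system: 1 local atom, 2 neighbor slots (1 filled), 3 atoms in total
  int nloc = 1, nnei = 2, ndescrpt = 4 * nnei, nall = 3;
  std::vector<double> net_deriv(nloc * ndescrpt, 0.1);
  std::vector<double> in_deriv(nloc * ndescrpt * 3, 0.2);
  std::vector<int> nlist = {1, -1};
  std::vector<double> force(nall * 3, 0.0);
  prod_force_se_a_cpu(force, net_deriv, in_deriv, nlist, nloc, nnei, ndescrpt);
  for (int a = 0; a < nall; ++a)
    std::printf("atom %d  f = (%g, %g, %g)\n", a, force[a * 3], force[a * 3 + 1], force[a * 3 + 2]);
  return 0;
}
```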
+ +void ProdForceSeALauncher(VALUETYPE * force, + const VALUETYPE * net_deriv, + const VALUETYPE * in_deriv, + const int * nlist, + const int nloc, + const int nall, + const int ndescrpt, + const int nnei, + const int n_a_sel, + const int n_a_shift) +{ + // std::cout << "I'm here!" << std::endl; + cudaErrcheck(cudaMemset(force, 0.0, sizeof(VALUETYPE) * nall * 3)); + const int LEN1 = 256; + const int nblock1 = (ndescrpt + LEN1 -1) / LEN1; + dim3 grid(nblock1, nloc); + dim3 thread(LEN1, 3); + deriv_wrt_center_atom_se_a<<>>(force, net_deriv, in_deriv, ndescrpt); + + const int LEN = 64; + int nblock = (nloc + LEN -1) / LEN; + dim3 block_grid(nblock, nnei); + dim3 thread_grid(LEN, 3, 4); + deriv_wrt_neighbors_se_a<<>>(force, net_deriv, in_deriv, nlist, nloc, nnei, ndescrpt, n_a_sel, n_a_shift); +} \ No newline at end of file diff --git a/source/op/cuda/prod_force_se_r.cu b/source/op/cuda/prod_force_se_r.cu new file mode 100644 index 0000000000..765842d9c3 --- /dev/null +++ b/source/op/cuda/prod_force_se_r.cu @@ -0,0 +1,99 @@ +#include +#include + +#ifdef HIGH_PREC + typedef double VALUETYPE; +#else + typedef float VALUETYPE; +#endif + +#define cudaErrcheck(res) { cudaAssert((res), __FILE__, __LINE__); } +inline void cudaAssert(cudaError_t code, const char *file, int line, bool abort=true) +{ + if (code != cudaSuccess) + { + fprintf(stderr,"cuda assert: %s %s %d\n", cudaGetErrorString(code), file, line); + if (abort) exit(code); + } +} + +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 600 +static __inline__ __device__ double atomicAdd(double* address, double val) { + unsigned long long int* address_as_ull = (unsigned long long int*)address; + unsigned long long int old = *address_as_ull, assumed; + do { + assumed = old; + old = atomicCAS(address_as_ull, assumed, + __double_as_longlong(val + __longlong_as_double(assumed))); + // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) } while (assumed != old); + } while (assumed != old); + return __longlong_as_double(old); +} +#endif + +__global__ void deriv_wrt_center_atom_se_r(VALUETYPE * force, + const VALUETYPE * net_deriv, + const VALUETYPE * in_deriv, + const int ndescrpt) +{ + const unsigned int idx = blockIdx.y; + const unsigned int idy = blockIdx.x * blockDim.x + threadIdx.x; + const unsigned int idz = threadIdx.y; + + if (idy >= ndescrpt) { + return; + } + + atomicAdd(force + idx * 3 + idz, -1.0 * net_deriv[idx * ndescrpt + idy] * in_deriv[idx * ndescrpt * 3 + idy * 3 + idz]); +} + +__global__ void deriv_wrt_neighbors_se_r(VALUETYPE * force, + const VALUETYPE * net_deriv, + const VALUETYPE * in_deriv, + const int * nlist, + const int nloc, + const int nnei, + const int ndescrpt, + const int n_a_sel, + const int n_a_shift) +{ + // idy -> nnei + const unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x; + const unsigned int idy = blockIdx.y; + const unsigned int idz = threadIdx.y; + + if (idx >= nloc) { + return; + } + // deriv wrt neighbors + int j_idx = nlist[idx * nnei + idy]; + if (j_idx < 0) { + return; + } + atomicAdd(force + j_idx * 3 + idz, net_deriv[idx * ndescrpt + idy] * in_deriv[idx * ndescrpt * 3 + idy * 3 + idz]); +} + +void ProdForceSeRLauncher(VALUETYPE * force, + const VALUETYPE * net_deriv, + const VALUETYPE * in_deriv, + const int * nlist, + const int nloc, + const int nall, + const int ndescrpt, + const int nnei, + const int n_a_sel, + const int n_a_shift) +{ + cudaErrcheck(cudaMemset(force, 0.0, sizeof(VALUETYPE) * nall * 3)); + const int LEN1 = 256; + const int nblock1 = (ndescrpt + LEN1 -1) 
/ LEN1; + dim3 grid(nblock1, nloc); + dim3 thread(LEN1, 3); + deriv_wrt_center_atom_se_r<<>>(force, net_deriv, in_deriv, ndescrpt); + + const int LEN = 64; + int nblock = (nloc + LEN -1) / LEN; + dim3 block_grid(nblock, nnei); + dim3 thread_grid(LEN, 3); + deriv_wrt_neighbors_se_r<<>>(force, net_deriv, in_deriv, nlist, nloc, nnei, ndescrpt, n_a_sel, n_a_shift); +} \ No newline at end of file diff --git a/source/op/cuda/prod_virial_se_a.cu b/source/op/cuda/prod_virial_se_a.cu new file mode 100644 index 0000000000..f93e14bcec --- /dev/null +++ b/source/op/cuda/prod_virial_se_a.cu @@ -0,0 +1,98 @@ +#include +#include + +#define MUL 512 + +#ifdef HIGH_PREC + typedef double VALUETYPE; +#else + typedef float VALUETYPE; +#endif + +#define cudaErrcheck(res) { cudaAssert((res), __FILE__, __LINE__); } +inline void cudaAssert(cudaError_t code, const char *file, int line, bool abort=true) { + if (code != cudaSuccess) { + fprintf(stderr,"cuda assert: %s %s %d\n", cudaGetErrorString(code), file, line); + if (abort) exit(code); + } +} + +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 600 +static __inline__ __device__ double atomicAdd(double* address, double val) { + unsigned long long int* address_as_ull = (unsigned long long int*)address; + unsigned long long int old = *address_as_ull, assumed; + do { + assumed = old; + old = atomicCAS(address_as_ull, assumed, + __double_as_longlong(val + __longlong_as_double(assumed))); + // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) } while (assumed != old); + } while (assumed != old); + return __longlong_as_double(old); +} +#endif + +__global__ void deriv_wrt_neighbors_se_a(VALUETYPE * virial, + VALUETYPE * atom_virial, + const VALUETYPE * net_deriv, + const VALUETYPE * in_deriv, + const VALUETYPE * rij, + const int * nlist, + const int nloc, + const int nnei, + const int ndescrpt, + const int n_a_sel, + const int n_a_shift) +{ + // idx -> nloc + // idy -> nnei + const unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x; + const unsigned int idy = blockIdx.y; + const unsigned int idz = threadIdx.y; + const unsigned int idw = threadIdx.z; + + if (idx >= nloc) { + return; + } + int j_idx = nlist[idx * nnei + idy]; + if (j_idx < 0) { + return; + } + // atomicAdd(virial + idz, net_deriv[idx * ndescrpt + idy * 4 + idw] * rij[idx * nnei * 3 + idy * 3 + idz / 3] * in_deriv[idx * ndescrpt * 3 + (idy * 4 + idw) * 3 + idz % 3]); + atomicAdd(atom_virial + j_idx * 9 + idz, net_deriv[idx * ndescrpt + idy * 4 + idw] * rij[idx * nnei * 3 + idy * 3 + idz / 3] * in_deriv[idx * ndescrpt * 3 + (idy * 4 + idw) * 3 + idz % 3]); +} + +void ProdVirialSeALauncher(VALUETYPE * virial, + VALUETYPE * atom_virial, + const VALUETYPE * net_deriv, + const VALUETYPE * in_deriv, + const VALUETYPE * rij, + const int * nlist, + const int nloc, + const int nall, + const int nnei, + const int ndescrpt, + const int n_a_sel, + const int n_a_shift) +{ + cudaErrcheck(cudaMemset(virial, 0.0, sizeof(VALUETYPE) * 9)); + cudaErrcheck(cudaMemset(atom_virial, 0.0, sizeof(VALUETYPE) * 9 * nall)); + + const int LEN = 16; + int nblock = (nloc + LEN -1) / LEN; + dim3 block_grid(nblock, nnei); + dim3 thread_grid(LEN, 9, 4); + // compute virial of a frame + deriv_wrt_neighbors_se_a<<>>( + virial, + atom_virial, + net_deriv, + in_deriv, + rij, + nlist, + nloc, + nnei, + ndescrpt, + n_a_sel, + n_a_shift + ); +} diff --git a/source/op/cuda/prod_virial_se_r.cu b/source/op/cuda/prod_virial_se_r.cu new file mode 100644 index 0000000000..d5b6aad3cc --- /dev/null +++ 
b/source/op/cuda/prod_virial_se_r.cu @@ -0,0 +1,98 @@ +#include +#include +#include + +#define MUL 512 + +#ifdef HIGH_PREC + typedef double VALUETYPE; +#else + typedef float VALUETYPE; +#endif + +#define cudaErrcheck(res) { cudaAssert((res), __FILE__, __LINE__); } +inline void cudaAssert(cudaError_t code, const char *file, int line, bool abort=true) { + if (code != cudaSuccess) { + fprintf(stderr,"cuda assert: %s %s %d\n", cudaGetErrorString(code), file, line); + if (abort) exit(code); + } +} + +// currently, double precision atomicAdd only support arch number larger than 6.0 +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 600 +static __inline__ __device__ double atomicAdd(double* address, double val) { + unsigned long long int* address_as_ull = (unsigned long long int*)address; + unsigned long long int old = *address_as_ull, assumed; + do { + assumed = old; + old = atomicCAS(address_as_ull, assumed, + __double_as_longlong(val + __longlong_as_double(assumed))); + // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) } while (assumed != old); + } while (assumed != old); + return __longlong_as_double(old); +} +#endif + +__global__ void deriv_wrt_neighbors_se_r(VALUETYPE * virial, + VALUETYPE * atom_virial, + const VALUETYPE * net_deriv, + const VALUETYPE * in_deriv, + const VALUETYPE * rij, + const int * nlist, + const int nloc, + const int nnei, + const int ndescrpt, + const int n_a_sel, + const int n_a_shift) +{ + // idx -> nloc + // idy -> nnei + const unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x; + const unsigned int idy = blockIdx.y; + const unsigned int idz = threadIdx.y; + + if (idx >= nloc) { + return; + } + int j_idx = nlist[idx * nnei + idy]; + if (j_idx < 0) { + return; + } + atomicAdd(atom_virial + j_idx * 9 + idz, net_deriv[idx * ndescrpt + idy] * rij[idx * nnei * 3 + idy * 3 + idz / 3] * in_deriv[idx * ndescrpt * 3 + idy * 3 + idz % 3]); +} + +void ProdVirialSeRLauncher(VALUETYPE * virial, + VALUETYPE * atom_virial, + const VALUETYPE * net_deriv, + const VALUETYPE * in_deriv, + const VALUETYPE * rij, + const int * nlist, + const int nloc, + const int nall, + const int nnei, + const int ndescrpt, + const int n_a_sel, + const int n_a_shift) +{ + cudaErrcheck(cudaMemset(virial, 0.0, sizeof(VALUETYPE) * 9)); + cudaErrcheck(cudaMemset(atom_virial, 0.0, sizeof(VALUETYPE) * 9 * nall)); + + const int LEN = 16; + int nblock = (nloc + LEN -1) / LEN; + dim3 block_grid(nblock, nnei); + dim3 thread_grid(LEN, 9); + // compute virial of a frame + deriv_wrt_neighbors_se_r<<>>( + virial, + atom_virial, + net_deriv, + in_deriv, + rij, + nlist, + nloc, + nnei, + ndescrpt, + n_a_sel, + n_a_shift + ); +} \ No newline at end of file diff --git a/source/op/descrpt_se_a_gpu.cc b/source/op/descrpt_se_a_gpu.cc new file mode 100644 index 0000000000..93c83016fb --- /dev/null +++ b/source/op/descrpt_se_a_gpu.cc @@ -0,0 +1,259 @@ +#include +#include +#include +#include +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/shape_inference.h" + +using namespace tensorflow; // NOLINT(build/namespaces) +#define MAGIC_NUMBER 256 + +#ifdef HIGH_PREC + typedef double VALUETYPE ; +#else + typedef float VALUETYPE ; +#endif + +typedef double compute_t; + +#define cudaErrcheck(res) { cudaAssert((res), __FILE__, __LINE__); } +inline void cudaAssert(cudaError_t code, const char *file, int line, bool abort=true) +{ + if (code != cudaSuccess) + { + fprintf(stderr,"cuda assert: %s %s %d\n", 
cudaGetErrorString(code), file, line); + if (abort) exit(code); + } +} + +using GPUDevice = Eigen::GpuDevice; + +#ifdef HIGH_PREC +REGISTER_OP("DescrptSeA") + .Input("coord: double") //atomic coordinates + .Input("type: int32") //atomic type + .Input("natoms: int32") //local atomic number; each type atomic number; daizheyingxiangqude atomic numbers + .Input("box : double") + .Input("mesh : int32") + .Input("davg: double") //average value of data + .Input("dstd: double") //standard deviation + .Attr("rcut_a: float") //no use + .Attr("rcut_r: float") + .Attr("rcut_r_smth: float") + .Attr("sel_a: list(int)") + .Attr("sel_r: list(int)") //all zero + .Output("descrpt: double") + .Output("descrpt_deriv: double") + .Output("rij: double") + .Output("nlist: int32"); + // only sel_a and rcut_r uesd. +#else +REGISTER_OP("DescrptSeA") + .Input("coord: float") + .Input("type: int32") + .Input("natoms: int32") + .Input("box : float") + .Input("mesh : int32") + .Input("davg: float") + .Input("dstd: float") + .Attr("rcut_a: float") + .Attr("rcut_r: float") + .Attr("rcut_r_smth: float") + .Attr("sel_a: list(int)") + .Attr("sel_r: list(int)") + .Output("descrpt: float") + .Output("descrpt_deriv: float") + .Output("rij: float") + .Output("nlist: int32"); +#endif + +void DescrptSeALauncher(const VALUETYPE* coord, + const int* type, + const int* ilist, + const int* jrange, + const int* jlist, + int* array_int, + unsigned long long* array_longlong, + compute_t* array_double, + const VALUETYPE* avg, + const VALUETYPE* std, + VALUETYPE* descript, + VALUETYPE* descript_deriv, + VALUETYPE* rij, + int* nlist, + const int& ntypes, + const int& nloc, + const int& nall, + const int& nnei, + const float& rcut_r, + const float& rcut_r_smth, + const int& ndescrpt, + const std::vector& sec_a, + const bool& fill_nei_a +); + +class DescrptSeAOp : public OpKernel { +public: + explicit DescrptSeAOp(OpKernelConstruction* context) : OpKernel(context) { + float nloc_f, nall_f; + OP_REQUIRES_OK(context, context->GetAttr("rcut_a", &rcut_a)); + OP_REQUIRES_OK(context, context->GetAttr("rcut_r", &rcut_r)); + OP_REQUIRES_OK(context, context->GetAttr("rcut_r_smth", &rcut_r_smth)); + OP_REQUIRES_OK(context, context->GetAttr("sel_a", &sel_a)); + OP_REQUIRES_OK(context, context->GetAttr("sel_r", &sel_r)); + // OP_REQUIRES_OK(context, context->GetAttr("nloc", &nloc_f)); + // OP_REQUIRES_OK(context, context->GetAttr("nall", &nall_f)); + cum_sum (sec_a, sel_a); + cum_sum (sec_r, sel_r); + ndescrpt_a = sec_a.back() * 4; + ndescrpt_r = sec_r.back() * 1; + ndescrpt = ndescrpt_a + ndescrpt_r; + nnei_a = sec_a.back(); + nnei_r = sec_r.back(); + nnei = nnei_a + nnei_r; + fill_nei_a = (rcut_a < 0); + } + + void Compute(OpKernelContext* context) override { + // Grab the input tensor + int context_input_index = 0; + const Tensor& coord_tensor = context->input(context_input_index++); + const Tensor& type_tensor = context->input(context_input_index++); + const Tensor& natoms_tensor = context->input(context_input_index++); + const Tensor& box_tensor = context->input(context_input_index++); + const Tensor& mesh_tensor = context->input(context_input_index++); + const Tensor& avg_tensor = context->input(context_input_index++); + const Tensor& std_tensor = context->input(context_input_index++); + // set size of the sample. 
assume 't' is [[[1, 1, 1], [2, 2, 2]], [[3, 3, 3], [4, 4, 4]]], then shape(t) ==> [2, 2, 3] + OP_REQUIRES (context, (coord_tensor.shape().dims() == 2), errors::InvalidArgument ("Dim of coord should be 2")); + OP_REQUIRES (context, (type_tensor.shape().dims() == 2), errors::InvalidArgument ("Dim of type should be 2")); + OP_REQUIRES (context, (natoms_tensor.shape().dims() == 1), errors::InvalidArgument ("Dim of natoms should be 1")); + OP_REQUIRES (context, (box_tensor.shape().dims() == 2), errors::InvalidArgument ("Dim of box should be 2")); + OP_REQUIRES (context, (mesh_tensor.shape().dims() == 1), errors::InvalidArgument ("Dim of mesh should be 1")); + OP_REQUIRES (context, (avg_tensor.shape().dims() == 2), errors::InvalidArgument ("Dim of avg should be 2")); + OP_REQUIRES (context, (std_tensor.shape().dims() == 2), errors::InvalidArgument ("Dim of std should be 2")); + OP_REQUIRES (context, (fill_nei_a), errors::InvalidArgument ("Rotational free descriptor only support the case rcut_a < 0")); + OP_REQUIRES (context, (sec_r.back() == 0), errors::InvalidArgument ("Rotational free descriptor only support all-angular information: sel_r should be all zero.")); + + OP_REQUIRES (context, (natoms_tensor.shape().dim_size(0) >= 3), errors::InvalidArgument ("number of atoms should be larger than (or equal to) 3")); + + int * natoms = new int[natoms_tensor.shape().dim_size(0)]; + cudaErrcheck(cudaMemcpy(natoms, natoms_tensor.flat().data(), sizeof(int) * natoms_tensor.shape().dim_size(0), cudaMemcpyDeviceToHost)); + int nloc = natoms[0]; + int nall = natoms[1]; + int ntypes = natoms_tensor.shape().dim_size(0) - 2; //nloc and nall mean something. + int nsamples = coord_tensor.shape().dim_size(0); + // + //// check the sizes + OP_REQUIRES (context, (nsamples == type_tensor.shape().dim_size(0)), errors::InvalidArgument ("number of samples should match")); + OP_REQUIRES (context, (nsamples == box_tensor.shape().dim_size(0)), errors::InvalidArgument ("number of samples should match")); + OP_REQUIRES (context, (ntypes == avg_tensor.shape().dim_size(0)), errors::InvalidArgument ("number of avg should be ntype")); + OP_REQUIRES (context, (ntypes == std_tensor.shape().dim_size(0)), errors::InvalidArgument ("number of std should be ntype")); + + OP_REQUIRES (context, (nall * 3 == coord_tensor.shape().dim_size(1)), errors::InvalidArgument ("number of atoms should match")); + OP_REQUIRES (context, (nall == type_tensor.shape().dim_size(1)), errors::InvalidArgument ("number of atoms should match")); + OP_REQUIRES (context, (9 == box_tensor.shape().dim_size(1)), errors::InvalidArgument ("number of box should be 9")); + OP_REQUIRES (context, (ndescrpt == avg_tensor.shape().dim_size(1)), errors::InvalidArgument ("number of avg should be ndescrpt")); + OP_REQUIRES (context, (ndescrpt == std_tensor.shape().dim_size(1)), errors::InvalidArgument ("number of std should be ndescrpt")); + + OP_REQUIRES (context, (ntypes == int(sel_a.size())), errors::InvalidArgument ("number of types should match the length of sel array")); + OP_REQUIRES (context, (ntypes == int(sel_r.size())), errors::InvalidArgument ("number of types should match the length of sel array")); + + // Create output tensors + TensorShape descrpt_shape ; + descrpt_shape.AddDim (nsamples); + descrpt_shape.AddDim (nloc * ndescrpt); + TensorShape descrpt_deriv_shape ; + descrpt_deriv_shape.AddDim (nsamples); + descrpt_deriv_shape.AddDim (nloc * ndescrpt * 3); + TensorShape rij_shape ; + rij_shape.AddDim (nsamples); + rij_shape.AddDim (nloc * nnei * 3); + 
TensorShape nlist_shape ; + nlist_shape.AddDim (nsamples); + nlist_shape.AddDim (nloc * nnei); + + int context_output_index = 0; + Tensor* descrpt_tensor = NULL; + OP_REQUIRES_OK(context, context->allocate_output(context_output_index++, + descrpt_shape, + &descrpt_tensor)); + Tensor* descrpt_deriv_tensor = NULL; + OP_REQUIRES_OK(context, context->allocate_output(context_output_index++, + descrpt_deriv_shape, + &descrpt_deriv_tensor)); + Tensor* rij_tensor = NULL; + OP_REQUIRES_OK(context, context->allocate_output(context_output_index++, + rij_shape, + &rij_tensor)); + Tensor* nlist_tensor = NULL; + OP_REQUIRES_OK(context, context->allocate_output(context_output_index++, + nlist_shape, + &nlist_tensor)); + + int * ilist = NULL, *jrange = NULL, *jlist = NULL; + int * array_int = NULL; unsigned long long *array_longlong = NULL; compute_t *array_double = NULL; + cudaErrcheck(cudaMemcpy(&(ilist), 4 + mesh_tensor.flat().data(), sizeof(int *), cudaMemcpyDeviceToHost)); + cudaErrcheck(cudaMemcpy(&(jrange), 8 + mesh_tensor.flat().data(), sizeof(int *), cudaMemcpyDeviceToHost)); + cudaErrcheck(cudaMemcpy(&(jlist), 12 + mesh_tensor.flat().data(), sizeof(int *), cudaMemcpyDeviceToHost)); + cudaErrcheck(cudaMemcpy(&(array_int), 16 + mesh_tensor.flat().data(), sizeof(int *), cudaMemcpyDeviceToHost)); + cudaErrcheck(cudaMemcpy(&(array_longlong), 20 + mesh_tensor.flat().data(), sizeof(unsigned long long *), cudaMemcpyDeviceToHost)); + cudaErrcheck(cudaMemcpy(&(array_double), 24 + mesh_tensor.flat().data(), sizeof(compute_t *), cudaMemcpyDeviceToHost)); + + // cudaErrcheck(cudaMemcpy(jlist, host_jlist, sizeof(int) * nloc * MAGIC_NUMBER, cudaMemcpyHostToDevice)); + // Launch computation + for (int II = 0; II < nsamples; II++) { + DescrptSeALauncher(coord_tensor.matrix().data() + II * (nall * 3), // related to the kk argument + type_tensor.matrix().data() + II * nall, // also related to the kk argument + ilist, + jrange, + jlist, + array_int, + array_longlong, + array_double, + avg_tensor.matrix().data(), + std_tensor.matrix().data(), + descrpt_tensor->matrix().data() + II * (nloc * ndescrpt), + descrpt_deriv_tensor->matrix().data() + II * (nloc * ndescrpt * 3), + rij_tensor->matrix().data() + II * (nloc * nnei * 3), + nlist_tensor->matrix().data() + II * (nloc * nnei), + ntypes, + nloc, + nall, + nnei, + rcut_r, + rcut_r_smth, + ndescrpt, + sec_a, + fill_nei_a + ); + } + // std::cout << "done" << std::endl; + delete[] natoms; + } + +///////////////////////////////////////////////////////////////////////////////////////////// +private: + float rcut_a; + float rcut_r; + float rcut_r_smth; + std::vector sel_r; + std::vector sel_a; + std::vector sec_a; + std::vector sec_r; + int ndescrpt, ndescrpt_a, ndescrpt_r; + int nnei, nnei_a, nnei_r, nloc, nall; + bool fill_nei_a; + + //private func + void cum_sum (std::vector & sec, const std::vector & n_sel) const { + sec.resize (n_sel.size() + 1); + sec[0] = 0; + for (int ii = 1; ii < sec.size(); ++ii) { + sec[ii] = sec[ii-1] + n_sel[ii-1]; + } + } +}; + +REGISTER_KERNEL_BUILDER(Name("DescrptSeA").Device(DEVICE_GPU), DescrptSeAOp); \ No newline at end of file diff --git a/source/op/descrpt_se_r_gpu.cc b/source/op/descrpt_se_r_gpu.cc new file mode 100644 index 0000000000..65e2682ef0 --- /dev/null +++ b/source/op/descrpt_se_r_gpu.cc @@ -0,0 +1,247 @@ +#include +#include +#include +#include +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/shape_inference.h" + +using namespace tensorflow; // 
NOLINT(build/namespaces) +#define MAGIC_NUMBER 256 + +#ifdef HIGH_PREC + typedef double VALUETYPE ; +#else + typedef float VALUETYPE ; +#endif + +typedef double compute_t; + +#define cudaErrcheck(res) { cudaAssert((res), __FILE__, __LINE__); } +inline void cudaAssert(cudaError_t code, const char *file, int line, bool abort=true) +{ + if (code != cudaSuccess) + { + fprintf(stderr,"cuda assert: %s %s %d\n", cudaGetErrorString(code), file, line); + if (abort) exit(code); + } +} + +// sec_a kao,sec_r, + +using GPUDevice = Eigen::GpuDevice; + +#ifdef HIGH_PREC +REGISTER_OP("DescrptSeR") + .Input("coord: double") + .Input("type: int32") + .Input("natoms: int32") + .Input("box: double") + .Input("mesh: int32") + .Input("davg: double") + .Input("dstd: double") + .Attr("rcut: float") + .Attr("rcut_smth: float") + .Attr("sel: list(int)") + .Output("descrpt: double") + .Output("descrpt_deriv: double") + .Output("rij: double") + .Output("nlist: int32"); +#else +REGISTER_OP("DescrptSeR") + .Input("coord: float") + .Input("type: int32") + .Input("natoms: int32") + .Input("box: float") + .Input("mesh: int32") + .Input("davg: float") + .Input("dstd: float") + .Attr("rcut: float") + .Attr("rcut_smth: float") + .Attr("sel: list(int)") + .Output("descrpt: float") + .Output("descrpt_deriv: float") + .Output("rij: float") + .Output("nlist: int32"); +#endif + +void DescrptSeRLauncher(const VALUETYPE* coord, + const int* type, + const int* ilist, + const int* jrange, + const int* jlist, + int* array_int, + unsigned long long* array_longlong, + compute_t* array_double, + const VALUETYPE* avg, + const VALUETYPE* std, + VALUETYPE* descript, + VALUETYPE* descript_deriv, + VALUETYPE* rij, + int* nlist, + const int& ntypes, + const int& nloc, + const int& nall, + const int& nnei, + const float& rcut, + const float& rcut_smth, + const int& ndescrpt, + const std::vector& sec, + const bool& fill_nei_a +); + +class DescrptSeROp : public OpKernel { +public: + explicit DescrptSeROp(OpKernelConstruction* context) : OpKernel(context) { + float nloc_f, nall_f; + OP_REQUIRES_OK(context, context->GetAttr("rcut", &rcut)); + OP_REQUIRES_OK(context, context->GetAttr("rcut_smth", &rcut_smth)); + OP_REQUIRES_OK(context, context->GetAttr("sel", &sel)); + cum_sum (sec, sel); + sel_null.resize(3, 0); + cum_sum (sec_null, sel_null); + ndescrpt = sec.back() * 1; + nnei = sec.back(); + fill_nei_a = true; + // count_nei_idx_overflow = 0; + // std::cout << "I'm in descrpt_se_r_gpu.cc" << std::endl; + } + + void Compute(OpKernelContext* context) override { + // Grab the input tensor + int context_input_index = 0; + const Tensor& coord_tensor = context->input(context_input_index++); + const Tensor& type_tensor = context->input(context_input_index++); + const Tensor& natoms_tensor = context->input(context_input_index++); + const Tensor& box_tensor = context->input(context_input_index++); + const Tensor& mesh_tensor = context->input(context_input_index++); + const Tensor& avg_tensor = context->input(context_input_index++); + const Tensor& std_tensor = context->input(context_input_index++); + // set size of the sample. 
assume 't' is [[[1, 1, 1], [2, 2, 2]], [[3, 3, 3], [4, 4, 4]]], then shape(t) ==> [2, 2, 3] + OP_REQUIRES (context, (coord_tensor.shape().dims() == 2), errors::InvalidArgument ("Dim of coord should be 2")); + OP_REQUIRES (context, (type_tensor.shape().dims() == 2), errors::InvalidArgument ("Dim of type should be 2")); + OP_REQUIRES (context, (natoms_tensor.shape().dims() == 1), errors::InvalidArgument ("Dim of natoms should be 1")); + OP_REQUIRES (context, (box_tensor.shape().dims() == 2), errors::InvalidArgument ("Dim of box should be 2")); + OP_REQUIRES (context, (mesh_tensor.shape().dims() == 1), errors::InvalidArgument ("Dim of mesh should be 1")); + OP_REQUIRES (context, (avg_tensor.shape().dims() == 2), errors::InvalidArgument ("Dim of avg should be 2")); + OP_REQUIRES (context, (std_tensor.shape().dims() == 2), errors::InvalidArgument ("Dim of std should be 2")); + OP_REQUIRES (context, (fill_nei_a), errors::InvalidArgument ("Rotational free descriptor only support the case rcut_a < 0")); + + OP_REQUIRES (context, (natoms_tensor.shape().dim_size(0) >= 3), errors::InvalidArgument ("number of atoms should be larger than (or equal to) 3")); + + int * natoms = new int[natoms_tensor.shape().dim_size(0)]; + cudaErrcheck(cudaMemcpy(natoms, natoms_tensor.flat().data(), sizeof(int) * natoms_tensor.shape().dim_size(0), cudaMemcpyDeviceToHost)); + int nloc = natoms[0]; + int nall = natoms[1]; + int ntypes = natoms_tensor.shape().dim_size(0) - 2; //nloc and nall mean something. + int nsamples = coord_tensor.shape().dim_size(0); + // + //// check the sizes + OP_REQUIRES (context, (nsamples == type_tensor.shape().dim_size(0)), errors::InvalidArgument ("number of samples should match")); + OP_REQUIRES (context, (nsamples == box_tensor.shape().dim_size(0)), errors::InvalidArgument ("number of samples should match")); + OP_REQUIRES (context, (ntypes == int(sel.size())), errors::InvalidArgument ("number of types should match the length of sel array")); + OP_REQUIRES (context, (ntypes == avg_tensor.shape().dim_size(0)), errors::InvalidArgument ("number of avg should be ntype")); + OP_REQUIRES (context, (ntypes == std_tensor.shape().dim_size(0)), errors::InvalidArgument ("number of std should be ntype")); + + OP_REQUIRES (context, (nall * 3 == coord_tensor.shape().dim_size(1)), errors::InvalidArgument ("number of atoms should match")); + OP_REQUIRES (context, (nall == type_tensor.shape().dim_size(1)), errors::InvalidArgument ("number of atoms should match")); + OP_REQUIRES (context, (9 == box_tensor.shape().dim_size(1)), errors::InvalidArgument ("number of box should be 9")); + OP_REQUIRES (context, (ndescrpt == avg_tensor.shape().dim_size(1)), errors::InvalidArgument ("number of avg should be ndescrpt")); + OP_REQUIRES (context, (ndescrpt == std_tensor.shape().dim_size(1)), errors::InvalidArgument ("number of std should be ndescrpt")); + + // Create output tensors + TensorShape descrpt_shape ; + descrpt_shape.AddDim (nsamples); + descrpt_shape.AddDim (nloc * ndescrpt); + TensorShape descrpt_deriv_shape ; + descrpt_deriv_shape.AddDim (nsamples); + descrpt_deriv_shape.AddDim (nloc * ndescrpt * 3); + TensorShape rij_shape ; + rij_shape.AddDim (nsamples); + rij_shape.AddDim (nloc * nnei * 3); + TensorShape nlist_shape ; + nlist_shape.AddDim (nsamples); + nlist_shape.AddDim (nloc * nnei); + + int context_output_index = 0; + Tensor* descrpt_tensor = NULL; + OP_REQUIRES_OK(context, context->allocate_output(context_output_index++, + descrpt_shape, + &descrpt_tensor)); + Tensor* descrpt_deriv_tensor = NULL; + 
OP_REQUIRES_OK(context, context->allocate_output(context_output_index++, + descrpt_deriv_shape, + &descrpt_deriv_tensor)); + Tensor* rij_tensor = NULL; + OP_REQUIRES_OK(context, context->allocate_output(context_output_index++, + rij_shape, + &rij_tensor)); + Tensor* nlist_tensor = NULL; + OP_REQUIRES_OK(context, context->allocate_output(context_output_index++, + nlist_shape, + &nlist_tensor)); + + int * ilist = NULL, *jrange = NULL, *jlist = NULL; + int *array_int = NULL; unsigned long long *array_longlong = NULL; compute_t *array_double = NULL; + cudaErrcheck(cudaMemcpy(&(ilist), 4 + mesh_tensor.flat().data(), sizeof(int *), cudaMemcpyDeviceToHost)); + cudaErrcheck(cudaMemcpy(&(jrange), 8 + mesh_tensor.flat().data(), sizeof(int *), cudaMemcpyDeviceToHost)); + cudaErrcheck(cudaMemcpy(&(jlist), 12 + mesh_tensor.flat().data(), sizeof(int *), cudaMemcpyDeviceToHost)); + cudaErrcheck(cudaMemcpy(&(array_int), 16 + mesh_tensor.flat().data(), sizeof(int *), cudaMemcpyDeviceToHost)); + cudaErrcheck(cudaMemcpy(&(array_longlong), 20 + mesh_tensor.flat().data(), sizeof(unsigned long long *), cudaMemcpyDeviceToHost)); + cudaErrcheck(cudaMemcpy(&(array_double), 24 + mesh_tensor.flat().data(), sizeof(compute_t *), cudaMemcpyDeviceToHost)); + + // cudaErrcheck(cudaMemcpy(jlist, host_jlist, sizeof(int) * nloc * MAGIC_NUMBER, cudaMemcpyHostToDevice)); + // Launch computation + for (int II = 0; II < nsamples; II++) { + DescrptSeRLauncher( coord_tensor.matrix().data() + II * (nall * 3), + type_tensor.matrix().data() + II * nall, + ilist, + jrange, + jlist, + array_int, + array_longlong, + array_double, + avg_tensor.matrix().data(), + std_tensor.matrix().data(), + descrpt_tensor->matrix().data() + II * (nloc * ndescrpt), + descrpt_deriv_tensor->matrix().data() + II * (nloc * ndescrpt * 3), + rij_tensor->matrix().data() + II * (nloc * nnei * 3), + nlist_tensor->matrix().data() + II * (nloc * nnei), + ntypes, + nloc, + nall, + nnei, + rcut, + rcut_smth, + ndescrpt, + sec, + fill_nei_a + ); + } + // std::cout << "done" << std::endl; + delete[] natoms; + } + +///////////////////////////////////////////////////////////////////////////////////////////// + +private: + float rcut; + float rcut_smth; + std::vector sel; + std::vector sel_null; + std::vector sec; + std::vector sec_null; + int nnei, ndescrpt, nloc, nall; + bool fill_nei_a; + + //private func + void cum_sum (std::vector & sec, const std::vector & n_sel) const { + sec.resize (n_sel.size() + 1); + sec[0] = 0; + for (int ii = 1; ii < sec.size(); ++ii) { + sec[ii] = sec[ii-1] + n_sel[ii-1]; + } + } +}; + +REGISTER_KERNEL_BUILDER(Name("DescrptSeR").Device(DEVICE_GPU), DescrptSeROp); \ No newline at end of file diff --git a/source/op/prod_force_se_a_gpu.cc b/source/op/prod_force_se_a_gpu.cc new file mode 100644 index 0000000000..2d159a8505 --- /dev/null +++ b/source/op/prod_force_se_a_gpu.cc @@ -0,0 +1,139 @@ +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/shape_inference.h" +#include +#include + +using namespace tensorflow; + +#define cudaErrcheck(res) { cudaAssert((res), __FILE__, __LINE__); } +inline void cudaAssert(cudaError_t code, const char *file, int line, bool abort=true) +{ + if (code != cudaSuccess) + { + fprintf(stderr,"cuda assert: %s %s %d\n", cudaGetErrorString(code), file, line); + if (abort) exit(code); + } +} + +#ifdef HIGH_PREC +typedef double VALUETYPE; +#else +typedef float VALUETYPE; +#endif + +#ifdef HIGH_PREC +REGISTER_OP("ProdForceSeA") + 
.Input("net_deriv: double") + .Input("in_deriv: double") + .Input("nlist: int32") + .Input("natoms: int32") + .Attr("n_a_sel: int") + .Attr("n_r_sel: int") + .Output("force: double"); +#else +REGISTER_OP("ProdForceSeA") + .Input("net_deriv: float") + .Input("in_deriv: float") + .Input("nlist: int32") + .Input("natoms: int32") + .Attr("n_a_sel: int") + .Attr("n_r_sel: int") + .Output("force: float"); +#endif + +void ProdForceSeALauncher(VALUETYPE * force, + const VALUETYPE * net_deriv, + const VALUETYPE * in_deriv, + const int * nlist, + const int nloc, + const int nall, + const int ndescrpt, + const int nnei, + const int n_a_sel, + const int n_a_shift); + +class ProdForceSeAOp : public OpKernel { +public: + explicit ProdForceSeAOp(OpKernelConstruction* context) : OpKernel(context) { + OP_REQUIRES_OK(context, context->GetAttr("n_a_sel", &n_a_sel)); + OP_REQUIRES_OK(context, context->GetAttr("n_r_sel", &n_r_sel)); + n_a_shift = n_a_sel * 4; + } + + void Compute(OpKernelContext* context) override { + // Grab the input tensor + int context_input_index = 0; + const Tensor& net_deriv_tensor = context->input(context_input_index++); + const Tensor& in_deriv_tensor = context->input(context_input_index++); + const Tensor& nlist_tensor = context->input(context_input_index++); + const Tensor& natoms_tensor = context->input(context_input_index++); + + // set size of the sample + OP_REQUIRES (context, (net_deriv_tensor.shape().dims() == 2), errors::InvalidArgument ("Dim of net deriv should be 2")); + OP_REQUIRES (context, (in_deriv_tensor.shape().dims() == 2), errors::InvalidArgument ("Dim of input deriv should be 2")); + OP_REQUIRES (context, (nlist_tensor.shape().dims() == 2), errors::InvalidArgument ("Dim of nlist should be 2")); + OP_REQUIRES (context, (natoms_tensor.shape().dims() == 1), errors::InvalidArgument ("Dim of natoms should be 1")); + + OP_REQUIRES (context, (natoms_tensor.shape().dim_size(0) >= 3), errors::InvalidArgument ("number of atoms should be larger than (or equal to) 3")); + int * natoms = new int[natoms_tensor.shape().dim_size(0)]; + cudaErrcheck(cudaMemcpy(natoms, natoms_tensor.flat().data(), sizeof(int) * natoms_tensor.shape().dim_size(0), cudaMemcpyDeviceToHost)); + int nloc = natoms[0]; + int nall = natoms[1]; + int nframes = net_deriv_tensor.shape().dim_size(0); + int ndescrpt = net_deriv_tensor.shape().dim_size(1) / nloc; + int nnei = nlist_tensor.shape().dim_size(1) / nloc; + + // check the sizes + OP_REQUIRES (context, (nframes == in_deriv_tensor.shape().dim_size(0)), errors::InvalidArgument ("number of samples should match")); + OP_REQUIRES (context, (nframes == nlist_tensor.shape().dim_size(0)), errors::InvalidArgument ("number of samples should match")); + + OP_REQUIRES (context, (nloc * ndescrpt * 3 == in_deriv_tensor.shape().dim_size(1)), errors::InvalidArgument ("number of descriptors should match")); + OP_REQUIRES (context, (nnei == n_a_sel + n_r_sel), errors::InvalidArgument ("number of neighbors should match")); + OP_REQUIRES (context, (0 == n_r_sel), errors::InvalidArgument ("Rotational free only support all-angular information")); + + // Create an output tensor + TensorShape force_shape ; + force_shape.AddDim (nframes); + force_shape.AddDim (3 * nall); + Tensor* force_tensor = NULL; + int context_output_index = 0; + OP_REQUIRES_OK(context, context->allocate_output(context_output_index++, + force_shape, &force_tensor)); + + // flat the tensors + auto net_deriv = net_deriv_tensor.flat(); + auto in_deriv = in_deriv_tensor.flat(); + auto nlist = 
nlist_tensor.flat(); + auto force = force_tensor->flat(); + + assert (nframes == force_shape.dim_size(0)); + assert (nframes == net_deriv_tensor.shape().dim_size(0)); + assert (nframes == in_deriv_tensor.shape().dim_size(0)); + assert (nframes == nlist_tensor.shape().dim_size(0)); + assert (nall * 3 == force_shape.dim_size(1)); + assert (nloc * ndescrpt == net_deriv_tensor.shape().dim_size(1)); + assert (nloc * ndescrpt * 3 == in_deriv_tensor.shape().dim_size(1)); + assert (nloc * nnei == nlist_tensor.shape().dim_size(1)); + assert (nnei * 4 == ndescrpt); + + for (int II = 0; II < nframes; II++) { + ProdForceSeALauncher(force_tensor->flat().data() + II * (nall * 3), + net_deriv_tensor.flat().data() + II * (nloc * ndescrpt), + in_deriv_tensor.flat().data() + II * (nloc * ndescrpt * 3), + nlist_tensor.flat().data() + II * (nloc * nnei), + nloc, + nall, + ndescrpt, + nnei, + n_a_sel, + n_a_shift + ); + } + delete[] natoms; + } +private: + int n_r_sel, n_a_sel, n_a_shift; +}; + +REGISTER_KERNEL_BUILDER(Name("ProdForceSeA").Device(DEVICE_GPU), ProdForceSeAOp); \ No newline at end of file diff --git a/source/op/prod_force_se_r_gpu.cc b/source/op/prod_force_se_r_gpu.cc new file mode 100644 index 0000000000..3dc0bc7853 --- /dev/null +++ b/source/op/prod_force_se_r_gpu.cc @@ -0,0 +1,133 @@ +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/shape_inference.h" +#include +#include + +using namespace tensorflow; + +#define cudaErrcheck(res) { cudaAssert((res), __FILE__, __LINE__); } +inline void cudaAssert(cudaError_t code, const char *file, int line, bool abort=true) +{ + if (code != cudaSuccess) + { + fprintf(stderr,"cuda assert: %s %s %d\n", cudaGetErrorString(code), file, line); + if (abort) exit(code); + } +} + +#ifdef HIGH_PREC + typedef double VALUETYPE; +#else + typedef float VALUETYPE; +#endif + +#ifdef HIGH_PREC +REGISTER_OP("ProdForceSeR") + .Input("net_deriv: double") + .Input("in_deriv: double") + .Input("nlist: int32") + .Input("natoms: int32") + .Output("force: double"); +#else +REGISTER_OP("ProdForceSeR") + .Input("net_deriv: float") + .Input("in_deriv: float") + .Input("nlist: int32") + .Input("natoms: int32") + .Output("force: float"); +#endif + +void ProdForceSeRLauncher(VALUETYPE * force, + const VALUETYPE * net_deriv, + const VALUETYPE * in_deriv, + const int * nlist, + const int nloc, + const int nall, + const int ndescrpt, + const int nnei, + const int n_a_sel, + const int n_a_shift); + +class ProdForceSeROp : public OpKernel { +public: + explicit ProdForceSeROp(OpKernelConstruction* context) : OpKernel(context) { + // std::cout << "I'm in prod_force_se_r_gpu.cc" << std::endl; + } + + void Compute(OpKernelContext* context) override { + // Grab the input tensor + int context_input_index = 0; + const Tensor& net_deriv_tensor = context->input(context_input_index++); + const Tensor& in_deriv_tensor = context->input(context_input_index++); + const Tensor& nlist_tensor = context->input(context_input_index++); + const Tensor& natoms_tensor = context->input(context_input_index++); + + // set size of the sample + OP_REQUIRES (context, (net_deriv_tensor.shape().dims() == 2), errors::InvalidArgument ("Dim of net deriv should be 2")); + OP_REQUIRES (context, (in_deriv_tensor.shape().dims() == 2), errors::InvalidArgument ("Dim of input deriv should be 2")); + OP_REQUIRES (context, (nlist_tensor.shape().dims() == 2), errors::InvalidArgument ("Dim of nlist should be 2")); + OP_REQUIRES (context, 
(natoms_tensor.shape().dims() == 1), errors::InvalidArgument ("Dim of natoms should be 1")); + + OP_REQUIRES (context, (natoms_tensor.shape().dim_size(0) >= 3), errors::InvalidArgument ("number of atoms should be larger than (or equal to) 3")); + int * natoms = new int[natoms_tensor.shape().dim_size(0)]; + cudaErrcheck(cudaMemcpy(natoms, natoms_tensor.flat().data(), sizeof(int) * natoms_tensor.shape().dim_size(0), cudaMemcpyDeviceToHost)); + int nloc = natoms[0]; + int nall = natoms[1]; + int nframes = net_deriv_tensor.shape().dim_size(0); + int ndescrpt = net_deriv_tensor.shape().dim_size(1) / nloc; + int nnei = nlist_tensor.shape().dim_size(1) / nloc; + + // check the sizes + OP_REQUIRES (context, (nframes == in_deriv_tensor.shape().dim_size(0)), errors::InvalidArgument ("number of samples should match")); + OP_REQUIRES (context, (nframes == nlist_tensor.shape().dim_size(0)), errors::InvalidArgument ("number of samples should match")); + + OP_REQUIRES (context, (nloc * ndescrpt * 3 == in_deriv_tensor.shape().dim_size(1)), errors::InvalidArgument ("number of descriptors should match")); + // OP_REQUIRES (context, (nnei == n_a_sel + n_r_sel), errors::InvalidArgument ("number of neighbors should match")); + // OP_REQUIRES (context, (0 == n_r_sel), errors::InvalidArgument ("Rotational free only support all-angular information")); + + // Create an output tensor + TensorShape force_shape; + force_shape.AddDim (nframes); + force_shape.AddDim (3 * nall); + Tensor* force_tensor = NULL; + int context_output_index = 0; + OP_REQUIRES_OK(context, context->allocate_output(context_output_index++, + force_shape, &force_tensor)); + + // flat the tensors + auto net_deriv = net_deriv_tensor.flat(); + auto in_deriv = in_deriv_tensor.flat(); + auto nlist = nlist_tensor.flat(); + auto force = force_tensor->flat(); + + assert (nframes == force_shape.dim_size(0)); + assert (nframes == net_deriv_tensor.shape().dim_size(0)); + assert (nframes == in_deriv_tensor.shape().dim_size(0)); + assert (nframes == nlist_tensor.shape().dim_size(0)); + assert (nall * 3 == force_shape.dim_size(1)); + assert (nloc * ndescrpt == net_deriv_tensor.shape().dim_size(1)); + assert (nloc * ndescrpt * 3 == in_deriv_tensor.shape().dim_size(1)); + assert (nloc * nnei == nlist_tensor.shape().dim_size(1)); + assert (nnei * 4 == ndescrpt); + + for (int II = 0; II < nframes; II++) { + ProdForceSeRLauncher(force_tensor->flat().data() + II * (nall * 3), + net_deriv_tensor.flat().data() + II * (nloc * ndescrpt), + in_deriv_tensor.flat().data() + II * (nloc * ndescrpt * 3), + nlist_tensor.flat().data() + II * (nloc * nnei), + nloc, + nall, + ndescrpt, + nnei, + n_a_sel, + n_a_shift + ); + } + delete[] natoms; + } +private: + int n_r_sel, n_a_sel, n_a_shift; +}; + +REGISTER_KERNEL_BUILDER(Name("ProdForceSeR").Device(DEVICE_GPU), ProdForceSeROp); \ No newline at end of file diff --git a/source/op/prod_virial_se_a_gpu.cc b/source/op/prod_virial_se_a_gpu.cc new file mode 100644 index 0000000000..42f70d06d2 --- /dev/null +++ b/source/op/prod_virial_se_a_gpu.cc @@ -0,0 +1,145 @@ +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/shape_inference.h" +#include +#include + +#ifdef HIGH_PREC +typedef double VALUETYPE; +#else +typedef float VALUETYPE; +#endif + +#ifdef HIGH_PREC +REGISTER_OP("ProdVirialSeA") + .Input("net_deriv: double") + .Input("in_deriv: double") + .Input("rij: double") + .Input("nlist: int32") + .Input("natoms: int32") + .Attr("n_a_sel: int") + 
.Attr("n_r_sel: int") + .Output("virial: double") + .Output("atom_virial: double"); +#else +REGISTER_OP("ProdVirialSeA") + .Input("net_deriv: float") + .Input("in_deriv: float") + .Input("rij: float") + .Input("nlist: int32") + .Input("natoms: int32") + .Attr("n_a_sel: int") + .Attr("n_r_sel: int") + .Output("virial: float") + .Output("atom_virial: float"); +#endif + +using namespace tensorflow; + +#define cudaErrcheck(res) { cudaAssert((res), __FILE__, __LINE__); } +inline void cudaAssert(cudaError_t code, const char *file, int line, bool abort=true) +{ + if (code != cudaSuccess) + { + fprintf(stderr,"cuda assert: %s %s %d\n", cudaGetErrorString(code), file, line); + if (abort) exit(code); + } +} + +void ProdVirialSeALauncher(VALUETYPE * virial, + VALUETYPE * atom_virial, + const VALUETYPE * net_deriv, + const VALUETYPE * in_deriv, + const VALUETYPE * rij, + const int * nlist, + const int nloc, + const int nall, + const int nnei, + const int ndescrpt, + const int n_a_sel, + const int n_a_shift); + +class ProdVirialSeAOp : public OpKernel { + public: + explicit ProdVirialSeAOp(OpKernelConstruction* context) : OpKernel(context) { + OP_REQUIRES_OK(context, context->GetAttr("n_a_sel", &n_a_sel)); + OP_REQUIRES_OK(context, context->GetAttr("n_r_sel", &n_r_sel)); + n_a_shift = n_a_sel * 4; + } + + void Compute(OpKernelContext* context) override { + // Grab the input tensor + int context_input_index = 0; + const Tensor& net_deriv_tensor = context->input(context_input_index++); + const Tensor& in_deriv_tensor = context->input(context_input_index++); + const Tensor& rij_tensor = context->input(context_input_index++); + const Tensor& nlist_tensor = context->input(context_input_index++); + const Tensor& natoms_tensor = context->input(context_input_index++); + + // set size of the sample + OP_REQUIRES (context, (net_deriv_tensor.shape().dims() == 2), errors::InvalidArgument ("Dim of net deriv should be 2")); + OP_REQUIRES (context, (in_deriv_tensor.shape().dims() == 2), errors::InvalidArgument ("Dim of input deriv should be 2")); + OP_REQUIRES (context, (rij_tensor.shape().dims() == 2), errors::InvalidArgument ("Dim of rij should be 2")); + OP_REQUIRES (context, (nlist_tensor.shape().dims() == 2), errors::InvalidArgument ("Dim of nlist should be 2")); + OP_REQUIRES (context, (natoms_tensor.shape().dims() == 1), errors::InvalidArgument ("Dim of natoms should be 1")); + + OP_REQUIRES (context, (natoms_tensor.shape().dim_size(0) >= 3), errors::InvalidArgument ("number of atoms should be larger than (or equal to) 3")); + int * natoms = new int[natoms_tensor.shape().dim_size(0)]; + cudaErrcheck(cudaMemcpy(natoms, natoms_tensor.flat().data(), sizeof(int) * natoms_tensor.shape().dim_size(0), cudaMemcpyDeviceToHost)); + int nloc = natoms[0]; + int nall = natoms[1]; + int nnei = nlist_tensor.shape().dim_size(1) / nloc; + int nframes = net_deriv_tensor.shape().dim_size(0); + int ndescrpt = net_deriv_tensor.shape().dim_size(1) / nloc; + + // check the sizes + OP_REQUIRES (context, (nframes == in_deriv_tensor.shape().dim_size(0)), errors::InvalidArgument ("number of samples should match")); + OP_REQUIRES (context, (nframes == rij_tensor.shape().dim_size(0)), errors::InvalidArgument ("number of samples should match")); + OP_REQUIRES (context, (nframes == nlist_tensor.shape().dim_size(0)), errors::InvalidArgument ("number of samples should match")); + + OP_REQUIRES (context, (nloc * ndescrpt * 3 == in_deriv_tensor.shape().dim_size(1)), errors::InvalidArgument ("number of descriptors should match")); + OP_REQUIRES 
(context, (nloc * nnei * 3 == rij_tensor.shape().dim_size(1)), errors::InvalidArgument ("dim of rij should be nnei * 3")); + OP_REQUIRES (context, (nnei == n_a_sel + n_r_sel), errors::InvalidArgument ("number of neighbors should match")); + + // Create an output tensor + TensorShape virial_shape ; + virial_shape.AddDim (nframes); + virial_shape.AddDim (9); + Tensor* virial_tensor = NULL; + OP_REQUIRES_OK(context, context->allocate_output(0, virial_shape, &virial_tensor)); + TensorShape atom_virial_shape; + atom_virial_shape.AddDim (nframes); + atom_virial_shape.AddDim (9 * nall); + Tensor* atom_virial_tensor = NULL; + OP_REQUIRES_OK(context, context->allocate_output(1, atom_virial_shape, &atom_virial_tensor)); + + // flat the tensors + auto net_deriv = net_deriv_tensor.flat(); + auto in_deriv = in_deriv_tensor.flat(); + auto rij = rij_tensor.flat(); + auto nlist = nlist_tensor.flat(); + auto virial = virial_tensor->flat(); + auto atom_virial = atom_virial_tensor->flat(); + + for (int II = 0; II < nframes; II++) { + ProdVirialSeALauncher(virial_tensor->flat().data() + II * 9, + atom_virial_tensor->flat().data() + II * (nall * 9), + net_deriv_tensor.flat().data() + II * (nloc * ndescrpt), + in_deriv_tensor.flat().data() + II * (nloc * ndescrpt * 3), + rij_tensor.flat().data() + II * (nloc * nnei * 3), + nlist_tensor.flat().data() + II * (nloc * nnei), + nloc, + nall, + nnei, + ndescrpt, + n_a_sel, + n_a_shift + ); + } + delete[] natoms; + } +private: + int n_r_sel, n_a_sel, n_a_shift; +}; + +REGISTER_KERNEL_BUILDER(Name("ProdVirialSeA").Device(DEVICE_GPU), ProdVirialSeAOp); \ No newline at end of file diff --git a/source/op/prod_virial_se_r_gpu.cc b/source/op/prod_virial_se_r_gpu.cc new file mode 100644 index 0000000000..91f965b72c --- /dev/null +++ b/source/op/prod_virial_se_r_gpu.cc @@ -0,0 +1,137 @@ +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/shape_inference.h" +#include +#include + +#ifdef HIGH_PREC + typedef double VALUETYPE; +#else + typedef float VALUETYPE; +#endif + +#ifdef HIGH_PREC +REGISTER_OP("ProdVirialSeR") + .Input("net_deriv: double") + .Input("in_deriv: double") + .Input("rij: double") + .Input("nlist: int32") + .Input("natoms: int32") + .Output("virial: double") + .Output("atom_virial: double"); +#else +REGISTER_OP("ProdVirialSeR") + .Input("net_deriv: float") + .Input("in_deriv: float") + .Input("rij: float") + .Input("nlist: int32") + .Input("natoms: int32") + .Output("virial: float") + .Output("atom_virial: float"); +#endif + +using namespace tensorflow; + +#define cudaErrcheck(res) { cudaAssert((res), __FILE__, __LINE__); } +inline void cudaAssert(cudaError_t code, const char *file, int line, bool abort=true) +{ + if (code != cudaSuccess) + { + fprintf(stderr,"cuda assert: %s %s %d\n", cudaGetErrorString(code), file, line); + if (abort) exit(code); + } +} + +void ProdVirialSeRLauncher(VALUETYPE * virial, + VALUETYPE * atom_virial, + const VALUETYPE * net_deriv, + const VALUETYPE * in_deriv, + const VALUETYPE * rij, + const int * nlist, + const int nloc, + const int nall, + const int nnei, + const int ndescrpt, + const int n_a_sel, + const int n_a_shift); + +class ProdVirialSeROp : public OpKernel { + public: + explicit ProdVirialSeROp(OpKernelConstruction* context) : OpKernel(context) { + // std::cout << "I'm in prod_virial_se_r_gpu.cc" << std::endl; + } + + void Compute(OpKernelContext* context) override { + // Grab the input tensor + int context_input_index = 0; + const Tensor& 
net_deriv_tensor = context->input(context_input_index++); + const Tensor& in_deriv_tensor = context->input(context_input_index++); + const Tensor& rij_tensor = context->input(context_input_index++); + const Tensor& nlist_tensor = context->input(context_input_index++); + const Tensor& natoms_tensor = context->input(context_input_index++); + // set size of the sample + OP_REQUIRES (context, (net_deriv_tensor.shape().dims() == 2), errors::InvalidArgument ("Dim of net deriv should be 2")); + OP_REQUIRES (context, (in_deriv_tensor.shape().dims() == 2), errors::InvalidArgument ("Dim of input deriv should be 2")); + OP_REQUIRES (context, (rij_tensor.shape().dims() == 2), errors::InvalidArgument ("Dim of rij should be 2")); + OP_REQUIRES (context, (nlist_tensor.shape().dims() == 2), errors::InvalidArgument ("Dim of nlist should be 2")); + OP_REQUIRES (context, (natoms_tensor.shape().dims() == 1), errors::InvalidArgument ("Dim of natoms should be 1")); + cudaErrcheck(cudaDeviceSynchronize()); + OP_REQUIRES (context, (natoms_tensor.shape().dim_size(0) >= 3), errors::InvalidArgument ("number of atoms should be larger than (or equal to) 3")); + int * natoms = new int[natoms_tensor.shape().dim_size(0)]; + cudaErrcheck(cudaMemcpy(natoms, natoms_tensor.flat().data(), sizeof(int) * natoms_tensor.shape().dim_size(0), cudaMemcpyDeviceToHost)); + int nloc = natoms[0]; + int nall = natoms[1]; + int nnei = nlist_tensor.shape().dim_size(1) / nloc; + int nframes = net_deriv_tensor.shape().dim_size(0); + int ndescrpt = net_deriv_tensor.shape().dim_size(1) / nloc; + + // check the sizes + OP_REQUIRES (context, (nframes == in_deriv_tensor.shape().dim_size(0)), errors::InvalidArgument ("number of samples should match")); + OP_REQUIRES (context, (nframes == rij_tensor.shape().dim_size(0)), errors::InvalidArgument ("number of samples should match")); + OP_REQUIRES (context, (nframes == nlist_tensor.shape().dim_size(0)), errors::InvalidArgument ("number of samples should match")); + + OP_REQUIRES (context, (nloc * ndescrpt * 3 == in_deriv_tensor.shape().dim_size(1)), errors::InvalidArgument ("number of descriptors should match")); + OP_REQUIRES (context, (nloc * nnei * 3 == rij_tensor.shape().dim_size(1)), errors::InvalidArgument ("dim of rij should be nnei * 3")); + + // Create an output tensor + TensorShape virial_shape; + virial_shape.AddDim (nframes); + virial_shape.AddDim (9); + Tensor* virial_tensor = NULL; + OP_REQUIRES_OK(context, context->allocate_output(0, virial_shape, &virial_tensor)); + TensorShape atom_virial_shape ; + atom_virial_shape.AddDim (nframes); + atom_virial_shape.AddDim (9 * nall); + Tensor* atom_virial_tensor = NULL; + OP_REQUIRES_OK(context, context->allocate_output(1, atom_virial_shape, &atom_virial_tensor)); + + // flat the tensors + auto net_deriv = net_deriv_tensor.flat(); + auto in_deriv = in_deriv_tensor.flat(); + auto rij = rij_tensor.flat(); + auto nlist = nlist_tensor.flat(); + auto virial = virial_tensor->flat(); + auto atom_virial = atom_virial_tensor->flat(); + + for (int II = 0; II < nframes; II++) { + ProdVirialSeRLauncher(virial_tensor->flat().data() + II * 9, + atom_virial_tensor->flat().data() + II * (nall * 9), + net_deriv_tensor.flat().data() + II * (nloc * ndescrpt), + in_deriv_tensor.flat().data() + II * (nloc * ndescrpt * 3), + rij_tensor.flat().data() + II * (nloc * nnei * 3), + nlist_tensor.flat().data() + II * (nloc * nnei), + nloc, + nall, + nnei, + ndescrpt, + n_a_sel, + n_a_shift + ); + } + delete[] natoms; + } +private: + int n_r_sel, n_a_sel, n_a_shift; +}; + 
+REGISTER_KERNEL_BUILDER(Name("ProdVirialSeR").Device(DEVICE_GPU), ProdVirialSeROp); \ No newline at end of file diff --git a/source/scripts/freeze.py b/source/scripts/freeze.py index 62a7f4559f..2420251744 100755 --- a/source/scripts/freeze.py +++ b/source/scripts/freeze.py @@ -44,6 +44,8 @@ def _make_node_names(model_type = None) : nodes = "o_dipole,descrpt_attr/rcut,descrpt_attr/ntypes,model_attr/tmap,model_attr/sel_type,model_attr/model_type" elif model_type == 'polar': nodes = "o_polar,descrpt_attr/rcut,descrpt_attr/ntypes,model_attr/tmap,model_attr/sel_type,model_attr/model_type" + elif model_type == 'global_polar': + nodes = "o_global_polar,descrpt_attr/rcut,descrpt_attr/ntypes,model_attr/tmap,model_attr/sel_type,model_attr/model_type" else: raise RuntimeError('unknow model type ' + model_type) return nodes diff --git a/source/train/CMakeLists.txt b/source/train/CMakeLists.txt index 83f15df4c3..48164f8a9c 100644 --- a/source/train/CMakeLists.txt +++ b/source/train/CMakeLists.txt @@ -2,7 +2,7 @@ configure_file("RunOptions.py.in" "${CMAKE_CURRENT_BINARY_DIR}/RunOptions.py" @ONLY) -file(GLOB LIB_PY main.py common.py env.py compat.py Network.py Deep*.py Data.py DataSystem.py Model*.py Descrpt*.py Fitting.py Loss.py LearningRate.py Trainer.py TabInter.py ${CMAKE_CURRENT_BINARY_DIR}/RunOptions.py) +file(GLOB LIB_PY main.py common.py env.py compat.py calculator.py Network.py Deep*.py Data.py DataSystem.py Model*.py Descrpt*.py Fitting.py Loss.py LearningRate.py Trainer.py TabInter.py ${CMAKE_CURRENT_BINARY_DIR}/RunOptions.py) file(GLOB CLS_PY Local.py Slurm.py) diff --git a/source/train/DeepEval.py b/source/train/DeepEval.py index f78699b6f8..3868f6e258 100644 --- a/source/train/DeepEval.py +++ b/source/train/DeepEval.py @@ -149,7 +149,8 @@ def get_sel_type(self): def eval(self, coords, cells, - atom_types) : + atom_types, + atomic = True) : # standarize the shape of inputs coords = np.array(coords) cells = np.array(cells) @@ -183,10 +184,13 @@ def eval(self, tensor.append(v_out[0]) # reverse map of the outputs - tensor = np.array(tensor) - tensor = self.reverse_map(np.reshape(tensor, [nframes,-1,self.variable_dof]), sel_imap) - - tensor = np.reshape(tensor, [nframes, len(sel_at), self.variable_dof]) + if atomic: + tensor = np.array(tensor) + tensor = self.reverse_map(np.reshape(tensor, [nframes,-1,self.variable_dof]), sel_imap) + tensor = np.reshape(tensor, [nframes, len(sel_at), self.variable_dof]) + else: + tensor = np.reshape(tensor, [nframes, self.variable_dof]) + return tensor diff --git a/source/train/DeepPolar.py b/source/train/DeepPolar.py index 0ec37876ca..3af499dd07 100644 --- a/source/train/DeepPolar.py +++ b/source/train/DeepPolar.py @@ -9,3 +9,14 @@ def __init__(self, model_file) : DeepTensor.__init__(self, model_file, 'polar', 9) + +class DeepGlobalPolar (DeepTensor) : + def __init__(self, + model_file) : + DeepTensor.__init__(self, model_file, 'global_polar', 9) + + def eval(self, + coords, + cells, + atom_types) : + return DeepTensor.eval(self, coords, cells, atom_types, atomic = False) diff --git a/source/train/DescrptSeA.py b/source/train/DescrptSeA.py index aa5221d7c7..527dff2cce 100644 --- a/source/train/DescrptSeA.py +++ b/source/train/DescrptSeA.py @@ -28,6 +28,7 @@ def __init__ (self, jdata): .add('neuron', list, default = [10, 20, 40]) \ .add('axis_neuron', int, default = 4, alias = 'n_axis_neuron') \ .add('resnet_dt',bool, default = False) \ + .add('trainable',bool, default = True) \ .add('seed', int) class_data = args.parse(jdata) self.sel_a = class_data['sel'] 
@@ -37,6 +38,7 @@ def __init__ (self, jdata): self.n_axis_neuron = class_data['axis_neuron'] self.filter_resnet_dt = class_data['resnet_dt'] self.seed = class_data['seed'] + self.trainable = class_data['trainable'] # descrpt config self.sel_r = [ 0 for ii in range(len(self.sel_a)) ] @@ -167,7 +169,7 @@ def build (self, self.descrpt_reshape = tf.reshape(self.descrpt, [-1, self.ndescrpt]) - self.dout, self.qmat = self._pass_filter(self.descrpt_reshape, natoms, suffix = suffix, reuse = reuse) + self.dout, self.qmat = self._pass_filter(self.descrpt_reshape, natoms, suffix = suffix, reuse = reuse, trainable = self.trainable) return self.dout @@ -201,7 +203,8 @@ def _pass_filter(self, inputs, natoms, reuse = None, - suffix = '') : + suffix = '', + trainable = True) : start_index = 0 inputs = tf.reshape(inputs, [-1, self.ndescrpt * natoms[0]]) shape = inputs.get_shape().as_list() @@ -212,7 +215,7 @@ def _pass_filter(self, [ 0, start_index* self.ndescrpt], [-1, natoms[2+type_i]* self.ndescrpt] ) inputs_i = tf.reshape(inputs_i, [-1, self.ndescrpt]) - layer, qmat = self._filter(inputs_i, name='filter_type_'+str(type_i)+suffix, natoms=natoms, reuse=reuse, seed = self.seed) + layer, qmat = self._filter(inputs_i, name='filter_type_'+str(type_i)+suffix, natoms=natoms, reuse=reuse, seed = self.seed, trainable = trainable) layer = tf.reshape(layer, [tf.shape(inputs)[0], natoms[2+type_i] * self.get_dim_out()]) qmat = tf.reshape(qmat, [tf.shape(inputs)[0], natoms[2+type_i] * self.get_dim_rot_mat_1() * 3]) output.append(layer) @@ -297,7 +300,8 @@ def _filter(self, bavg=0.0, name='linear', reuse=None, - seed=None): + seed=None, + trainable = True): # natom x (nei x 4) shape = inputs.get_shape().as_list() outputs_size = [1] + self.filter_neuron @@ -320,16 +324,19 @@ def _filter(self, w = tf.get_variable('matrix_'+str(ii)+'_'+str(type_i), [outputs_size[ii - 1], outputs_size[ii]], global_tf_float_precision, - tf.random_normal_initializer(stddev=stddev/np.sqrt(outputs_size[ii]+outputs_size[ii-1]), seed = seed)) + tf.random_normal_initializer(stddev=stddev/np.sqrt(outputs_size[ii]+outputs_size[ii-1]), seed = seed), + trainable = trainable) b = tf.get_variable('bias_'+str(ii)+'_'+str(type_i), [1, outputs_size[ii]], global_tf_float_precision, - tf.random_normal_initializer(stddev=stddev, mean = bavg, seed = seed)) + tf.random_normal_initializer(stddev=stddev, mean = bavg, seed = seed), + trainable = trainable) if self.filter_resnet_dt : idt = tf.get_variable('idt_'+str(ii)+'_'+str(type_i), [1, outputs_size[ii]], global_tf_float_precision, - tf.random_normal_initializer(stddev=0.001, mean = 1.0, seed = seed)) + tf.random_normal_initializer(stddev=0.001, mean = 1.0, seed = seed), + trainable = trainable) if outputs_size[ii] == outputs_size[ii-1]: if self.filter_resnet_dt : xyz_scatter += activation_fn(tf.matmul(xyz_scatter, w) + b) * idt @@ -376,7 +383,8 @@ def _filter_type_ext(self, bavg=0.0, name='linear', reuse=None, - seed=None): + seed=None, + trainable = True): # natom x (nei x 4) shape = inputs.get_shape().as_list() outputs_size = [1] + self.filter_neuron @@ -401,16 +409,19 @@ def _filter_type_ext(self, w = tf.get_variable('matrix_'+str(ii)+'_'+str(type_i), [outputs_size[ii - 1], outputs_size[ii]], global_tf_float_precision, - tf.random_normal_initializer(stddev=stddev/np.sqrt(outputs_size[ii]+outputs_size[ii-1]), seed = seed)) + tf.random_normal_initializer(stddev=stddev/np.sqrt(outputs_size[ii]+outputs_size[ii-1]), seed = seed), + trainable = trainable) b = tf.get_variable('bias_'+str(ii)+'_'+str(type_i), [1, 
outputs_size[ii]], global_tf_float_precision, - tf.random_normal_initializer(stddev=stddev, mean = bavg, seed = seed)) + tf.random_normal_initializer(stddev=stddev, mean = bavg, seed = seed), + trainable = trainable) if self.filter_resnet_dt : idt = tf.get_variable('idt_'+str(ii)+'_'+str(type_i), [1, outputs_size[ii]], global_tf_float_precision, - tf.random_normal_initializer(stddev=0.001, mean = 1.0, seed = seed)) + tf.random_normal_initializer(stddev=0.001, mean = 1.0, seed = seed), + trainable = trainable) if outputs_size[ii] == outputs_size[ii-1]: if self.filter_resnet_dt : xyz_scatter += activation_fn(tf.matmul(xyz_scatter, w) + b) * idt diff --git a/source/train/DescrptSeAR.py b/source/train/DescrptSeAR.py index 929b660248..138e9222f6 100644 --- a/source/train/DescrptSeAR.py +++ b/source/train/DescrptSeAR.py @@ -72,7 +72,7 @@ def build (self, reuse = None): # dout self.dout_a = self.descrpt_a.build(coord_, atype_, natoms, box, mesh, davg[0], dstd[0], suffix=suffix+'_a', reuse=reuse) - self.dout_r = self.descrpt_r.build(coord_, atype_, natoms, box, mesh, davg[1], dstd[1], suffix=suffix+'_r', reuse=reuse) + self.dout_r = self.descrpt_r.build(coord_, atype_, natoms, box, mesh, davg[1], dstd[1], suffix=suffix, reuse=reuse) self.dout_a = tf.reshape(self.dout_a, [-1, self.descrpt_a.get_dim_out()]) self.dout_r = tf.reshape(self.dout_r, [-1, self.descrpt_r.get_dim_out()]) self.dout = tf.concat([self.dout_a, self.dout_r], axis = 1) diff --git a/source/train/DescrptSeR.py b/source/train/DescrptSeR.py index b057a0afd3..aaa91ca220 100644 --- a/source/train/DescrptSeR.py +++ b/source/train/DescrptSeR.py @@ -27,6 +27,7 @@ def __init__ (self, jdata): .add('rcut_smth',float, default = 5.5) \ .add('neuron', list, default = [10, 20, 40]) \ .add('resnet_dt',bool, default = False) \ + .add('trainable',bool, default = True) \ .add('seed', int) class_data = args.parse(jdata) self.sel_r = class_data['sel'] @@ -35,6 +36,7 @@ def __init__ (self, jdata): self.filter_neuron = class_data['neuron'] self.filter_resnet_dt = class_data['resnet_dt'] self.seed = class_data['seed'] + self.trainable = class_data['trainable'] # descrpt config self.sel_a = [ 0 for ii in range(len(self.sel_r)) ] @@ -145,7 +147,7 @@ def build (self, self.descrpt_reshape = tf.reshape(self.descrpt, [-1, self.ndescrpt]) - self.dout = self._pass_filter(self.descrpt_reshape, natoms, suffix = suffix, reuse = reuse) + self.dout = self._pass_filter(self.descrpt_reshape, natoms, suffix = suffix, reuse = reuse, trainable = self.trainable) return self.dout @@ -171,7 +173,8 @@ def _pass_filter(self, inputs, natoms, reuse = None, - suffix = '') : + suffix = '', + trainable = True) : start_index = 0 inputs = tf.reshape(inputs, [-1, self.ndescrpt * natoms[0]]) shape = inputs.get_shape().as_list() @@ -181,7 +184,7 @@ def _pass_filter(self, [ 0, start_index* self.ndescrpt], [-1, natoms[2+type_i]* self.ndescrpt] ) inputs_i = tf.reshape(inputs_i, [-1, self.ndescrpt]) - layer = self._filter_r(inputs_i, name='filter_type_'+str(type_i)+suffix, natoms=natoms, reuse=reuse, seed = self.seed) + layer = self._filter_r(inputs_i, name='filter_type_'+str(type_i)+suffix, natoms=natoms, reuse=reuse, seed = self.seed, trainable = trainable) layer = tf.reshape(layer, [tf.shape(inputs)[0], natoms[2+type_i] * self.get_dim_out()]) output.append(layer) start_index += natoms[2+type_i] @@ -253,7 +256,8 @@ def _filter_r(self, bavg=0.0, name='linear', reuse=None, - seed=None): + seed=None, + trainable = True): # natom x nei shape = inputs.get_shape().as_list() outputs_size = [1] + 
self.filter_neuron @@ -274,16 +278,19 @@ def _filter_r(self, w = tf.get_variable('matrix_'+str(ii)+'_'+str(type_i), [outputs_size[ii - 1], outputs_size[ii]], global_tf_float_precision, - tf.random_normal_initializer(stddev=stddev/np.sqrt(outputs_size[ii]+outputs_size[ii-1]), seed = seed)) + tf.random_normal_initializer(stddev=stddev/np.sqrt(outputs_size[ii]+outputs_size[ii-1]), seed = seed), + trainable = trainable) b = tf.get_variable('bias_'+str(ii)+'_'+str(type_i), [1, outputs_size[ii]], global_tf_float_precision, - tf.random_normal_initializer(stddev=stddev, mean = bavg, seed = seed)) + tf.random_normal_initializer(stddev=stddev, mean = bavg, seed = seed), + trainable = trainable) if self.filter_resnet_dt : idt = tf.get_variable('idt_'+str(ii)+'_'+str(type_i), [1, outputs_size[ii]], global_tf_float_precision, - tf.random_normal_initializer(stddev=0.001, mean = 1.0, seed = seed)) + tf.random_normal_initializer(stddev=0.001, mean = 1.0, seed = seed), + trainable = trainable) if outputs_size[ii] == outputs_size[ii-1]: if self.filter_resnet_dt : xyz_scatter += activation_fn(tf.matmul(xyz_scatter, w) + b) * idt diff --git a/source/train/Trainer.py b/source/train/Trainer.py index 10472d828b..fb986399e7 100644 --- a/source/train/Trainer.py +++ b/source/train/Trainer.py @@ -386,6 +386,7 @@ def train (self, fp = open(self.disp_file, "a") cur_batch = self.sess.run(self.global_step) + is_first_step = True self.cur_batch = cur_batch self.run_opt.message("start training at lr %.2e (== %.2e), final lr will be %.2e" % (self.sess.run(self.learning_rate), @@ -417,8 +418,9 @@ def train (self, feed_dict_batch[self.place_holders[ii]] = batch_data[ii] feed_dict_batch[self.place_holders['is_training']] = True - if self.display_in_training and cur_batch == 0 : + if self.display_in_training and is_first_step : self.test_on_the_fly(fp, data, feed_dict_batch) + is_first_step = False if self.timing_in_training : tic = time.time() self.sess.run([self.train_op], feed_dict = feed_dict_batch, options=prf_options, run_metadata=prf_run_metadata) if self.timing_in_training : toc = time.time() diff --git a/source/train/calculator.py b/source/train/calculator.py new file mode 100644 index 0000000000..1179eef891 --- /dev/null +++ b/source/train/calculator.py @@ -0,0 +1,49 @@ +"""An ASE calculator interface. 
+
+Example:
+```python
+from ase import Atoms
+from deepmd.calculator import DP
+
+water = Atoms('H2O',
+              positions=[(0.7601, 1.9270, 1),
+                         (1.9575, 1, 1),
+                         (1., 1., 1.)],
+              cell=[100, 100, 100],
+              calculator=DP(model="frozen_model.pb"))
+print(water.get_potential_energy())
+print(water.get_forces())
+```
+
+Optimization is also available:
+```python
+from ase.optimize import BFGS
+dyn = BFGS(water)
+dyn.run(fmax=1e-6)
+print(water.get_positions())
+```
+"""
+
+from ase.calculators.calculator import Calculator, all_changes
+from deepmd import DeepPot
+
+
+class DP(Calculator):
+    name = "DP"
+    implemented_properties = ["energy", "forces", "stress"]
+
+    def __init__(self, model, label="DP", **kwargs):
+        Calculator.__init__(self, label=label, **kwargs)
+        self.dp = DeepPot(model)
+        # map the model's type map (chemical symbols) onto integer atom types
+        self.type_dict = dict(zip(self.dp.get_type_map(), range(self.dp.get_ntypes())))
+
+    def calculate(self, atoms=None, properties=["energy", "forces", "stress"], system_changes=all_changes):
+        coord = atoms.get_positions().reshape([1, -1])
+        cell = atoms.get_cell().reshape([1, -1])
+        symbols = atoms.get_chemical_symbols()
+        atype = [self.type_dict[k] for k in symbols]
+        e, f, v = self.dp.eval(coord, cell, atype)
+        self.results['energy'] = e[0]
+        self.results['forces'] = f[0]
+        # v is the 9-component virial returned by dp.eval; it is stored under
+        # "stress" as-is (no conversion to ASE's Voigt-form stress is done here)
+        self.results['stress'] = v[0]
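With the `global_polar` freeze target and the `DeepGlobalPolar` wrapper added in this patch, a frozen global-polarizability model can be evaluated directly from Python. The sketch below is illustrative only: the model file name, coordinates, cell, and atom types are placeholders, and the array shapes follow the `DeepTensor.eval` convention used above (`atomic=False` returns one 9-component tensor per frame).
```python
import numpy as np
from deepmd import DeepGlobalPolar

# "frozen_global_polar.pb" is a placeholder for a model frozen with
# model_type 'global_polar' (see the freeze.py change above).
gp = DeepGlobalPolar("frozen_global_polar.pb")

# one frame of three atoms; coords: nframes x (natoms * 3), cells: nframes x 9
coords = np.array([[0.0, 0.0, 0.0,
                    0.0, 0.0, 1.0,
                    0.0, 1.0, 0.0]])
cells = (100.0 * np.eye(3)).reshape([1, 9])
atom_types = [0, 1, 1]        # integer types following the model's type map

polar = gp.eval(coords, cells, atom_types)   # shape (nframes, 9)
print(polar[0].reshape(3, 3))                # global polarizability tensor of frame 0
```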