diff --git a/.github/workflows/build_wheel.yml b/.github/workflows/build_wheel.yml
index f0134b175c..873a25ae71 100644
--- a/.github/workflows/build_wheel.yml
+++ b/.github/workflows/build_wheel.yml
@@ -28,7 +28,7 @@ jobs:
       - name: Build wheels
         env:
           CIBW_BUILD: "cp36-* cp37-* cp38-* cp39-* cp310-*"
-          CIBW_MANYLINUX_X86_64_IMAGE: ghcr.io/deepmodeling/manylinux2014_x86_64_tensorflow
+          CIBW_MANYLINUX_X86_64_IMAGE: ghcr.io/deepmodeling/manylinux_2_24_x86_64_tensorflow
           CIBW_BEFORE_BUILD: pip install tensorflow
           CIBW_SKIP: "*-win32 *-manylinux_i686 *-musllinux*"
         run: |
diff --git a/.github/workflows/lint_python.yml b/.github/workflows/lint_python.yml
index 6b6dd695d3..91905cc258 100644
--- a/.github/workflows/lint_python.yml
+++ b/.github/workflows/lint_python.yml
@@ -21,7 +21,7 @@ jobs:
       run: pip install -r requirements.txt
     - uses: marian-code/python-lint-annotate@v2.5.0
       with:
-        python-root-list: "./deepmd/*.py ./deepmd/*/*.py ./source/train/*.py ./source/tests/*.py ./source/op/*.py"
+        python-root-list: "./deepmd/*.py ./deepmd/*/*.py ./deepmd/*/*/*.py ./source/train/*.py ./source/tests/*.py ./source/op/*.py"
         use-black: true
         use-isort: true
         use-mypy: true
diff --git a/README.md b/README.md
index 8afb65e2c7..357ff43fba 100644
--- a/README.md
+++ b/README.md
@@ -124,6 +124,7 @@ A full [document](doc/train/train-input-auto.rst) on options in the training inp
     - [Run path-integral MD with i-PI](doc/third-party/ipi.md)
     - [Run MD with GROMACS](doc/third-party/gromacs.md)
     - [Interfaces out of DeePMD-kit](doc/third-party/out-of-deepmd-kit.md)
+- [Use NVNMD](doc/nvnmd/index.md)
 
 # Code structure
 
diff --git a/deepmd/__init__.py b/deepmd/__init__.py
index 9c3af2a6c7..9fe29b6d9c 100644
--- a/deepmd/__init__.py
+++ b/deepmd/__init__.py
@@ -6,7 +6,7 @@
     import importlib_metadata as metadata
 import deepmd.utils.network as network
 
-from . import cluster, descriptor, fit, loss, utils
+from . import cluster, descriptor, fit, loss, utils, nvnmd
 from .env import set_mkl
 from .infer import DeepEval, DeepPotential
 from .infer.data_modifier import DipoleChargeModifier
@@ -32,4 +32,5 @@
     "DeepEval",
     "DeepPotential",
     "DipoleChargeModifier",
+    "nvnmd",
 ]
diff --git a/deepmd/common.py b/deepmd/common.py
index 6a18cda677..1146f291d5 100644
--- a/deepmd/common.py
+++ b/deepmd/common.py
@@ -34,7 +34,7 @@
         from typing import Literal  # python >3.6
     except ImportError:
         from typing_extensions import Literal  # type: ignore
-    _ACTIVATION = Literal["relu", "relu6", "softplus", "sigmoid", "tanh", "gelu"]
+    _ACTIVATION = Literal["relu", "relu6", "softplus", "sigmoid", "tanh", "gelu", "gelu_tf"]
     _PRECISION = Literal["default", "float16", "float32", "float64"]
 
 # define constants
@@ -49,7 +49,7 @@
 def gelu(x: tf.Tensor) -> tf.Tensor:
     """Gaussian Error Linear Unit.
 
-    This is a smoother version of the RELU.
+    This is a smoother version of the RELU, implemented by custom operator.
 
     Parameters
     ----------
@@ -58,7 +58,31 @@ def gelu(x: tf.Tensor) -> tf.Tensor:
 
     Returns
     -------
-    `x` with the GELU activation applied
+    tf.Tensor
+        `x` with the GELU activation applied
+
+    References
+    ----------
+    Original paper
+    https://arxiv.org/abs/1606.08415
+    """
+    return op_module.gelu(x)
+
+
+def gelu_tf(x: tf.Tensor) -> tf.Tensor:
+    """Gaussian Error Linear Unit.
+
+    This is a smoother version of the RELU, implemented by TF.
+
+    Parameters
+    ----------
+    x : tf.Tensor
+        float Tensor to perform activation
+
+    Returns
+    -------
+    tf.Tensor
+        `x` with the GELU activation applied
 
     References
     ----------
@@ -69,10 +93,10 @@ def gelu_wrapper(x):
         try:
             return tensorflow.nn.gelu(x, approximate=True)
         except AttributeError:
+            warnings.warn("TensorFlow does not provide an implementation of gelu, please upgrade your TensorFlow version. Fallback to the custom gelu operator.")
             return op_module.gelu(x)
     return (lambda x: gelu_wrapper(x))(x)
 
-
 # TODO this is not a good way to do things. This is some global variable to which
 # TODO anyone can write and there is no good way to keep track of the changes
 data_requirement = {}
@@ -84,6 +108,7 @@ def gelu_wrapper(x):
     "sigmoid": tf.sigmoid,
     "tanh": tf.nn.tanh,
     "gelu": gelu,
+    "gelu_tf": gelu_tf,
 }
 
 
diff --git a/deepmd/descriptor/se_a.py b/deepmd/descriptor/se_a.py
index ff7549b124..b568d8be71 100644
--- a/deepmd/descriptor/se_a.py
+++ b/deepmd/descriptor/se_a.py
@@ -17,6 +17,9 @@
 from .descriptor import Descriptor
 from .se import DescrptSe
 
+from deepmd.nvnmd.descriptor.se_a import descrpt2r4, build_davg_dstd, build_op_descriptor, filter_lower_R42GR, filter_GR2D
+from deepmd.nvnmd.utils.config import nvnmd_cfg 
+
 @Descriptor.register("se_e2_a")
 @Descriptor.register("se_a")
 class DescrptSeA (DescrptSe):
@@ -412,6 +415,7 @@ def build (self,
         """
         davg = self.davg
         dstd = self.dstd
+        if nvnmd_cfg.enable and nvnmd_cfg.restore_descriptor: davg, dstd = build_davg_dstd()
         with tf.variable_scope('descrpt_attr' + suffix, reuse = reuse) :
             if davg is None:
                 davg = np.zeros([self.ntypes, self.ndescrpt]) 
@@ -448,8 +452,9 @@ def build (self,
         box   = tf.reshape (box_, [-1, 9])
         atype = tf.reshape (atype_, [-1, natoms[1]])
 
+        op_descriptor = build_op_descriptor() if nvnmd_cfg.enable else op_module.prod_env_mat_a
         self.descrpt, self.descrpt_deriv, self.rij, self.nlist \
-            = op_module.prod_env_mat_a (coord,
+            = op_descriptor           (coord,
                                        atype,
                                        natoms,
                                        box,
@@ -576,6 +581,8 @@ def _pass_filter(self,
             inputs_i = inputs
             inputs_i = tf.reshape(inputs_i, [-1, self.ndescrpt])
             type_i = -1
+            if nvnmd_cfg.enable and nvnmd_cfg.quantize_descriptor: 
+                inputs_i = descrpt2r4(inputs_i, natoms)
             layer, qmat = self._filter(inputs_i, type_i, name='filter_type_all'+suffix, natoms=natoms, reuse=reuse, trainable = trainable, activation_fn = self.filter_activation_fn, type_embedding=type_embedding)
             layer = tf.reshape(layer, [tf.shape(inputs)[0], natoms[0], self.get_dim_out()])
             qmat  = tf.reshape(qmat,  [tf.shape(inputs)[0], natoms[0], self.get_dim_rot_mat_1() * 3])
@@ -717,6 +724,14 @@ def _filter_lower(
             if self.compress:
                 raise RuntimeError('compression of type embedded descriptor is not supported at the moment')
         # natom x 4 x outputs_size
+        if nvnmd_cfg.enable:
+          return filter_lower_R42GR(
+            type_i, type_input, inputs_i, is_exclude,
+            activation_fn, bavg, stddev, trainable, 
+            suffix, self.seed, self.seed_shift, self.uniform_seed,
+            self.filter_neuron, self.filter_precision, self.filter_resnet_dt,
+            self.embedding_net_variables
+          )
         if self.compress and (not is_exclude):
             if self.type_one_side:
                 net = 'filter_-1_net_' + str(type_i)
@@ -825,6 +840,7 @@ def _filter(
                   stddev = stddev,
                   bavg = bavg,
                   trainable = trainable)
+          if nvnmd_cfg.enable: return filter_GR2D(xyz_scatter_1)
           # natom x nei x outputs_size
           # xyz_scatter = tf.concat(xyz_scatter_total, axis=1)
           # natom x nei x 4
diff --git a/deepmd/entrypoints/freeze.py b/deepmd/entrypoints/freeze.py
index 1f816d083e..e13c4c778b 100755
--- a/deepmd/entrypoints/freeze.py
+++ b/deepmd/entrypoints/freeze.py
@@ -19,6 +19,8 @@
 
 from typing import List, Optional
 
+from deepmd.nvnmd.entrypoints.freeze import save_weight
+
 __all__ = ["freeze"]
 
 log = logging.getLogger(__name__)
@@ -160,7 +162,7 @@ def _make_node_names(model_type: str, modifier_type: Optional[str] = None) -> Li
 
 
 def freeze(
-    *, checkpoint_folder: str, output: str, node_names: Optional[str] = None, **kwargs
+    *, checkpoint_folder: str, output: str, node_names: Optional[str] = None, nvnmd_weight: Optional[str] = None, **kwargs
 ):
     """Freeze the graph in supplied folder.
 
@@ -237,6 +239,9 @@ def freeze(
             output_node_list = node_names.split(",")
         log.info(f"The following nodes will be frozen: {output_node_list}")
 
+        if nvnmd_weight is not None:
+            save_weight(sess, nvnmd_weight) # nvnmd
+
         # We use a built-in TF helper to export variables to constants
         output_graph_def = tf.graph_util.convert_variables_to_constants(
             sess,  # The session is used to retrieve the weights
diff --git a/deepmd/entrypoints/main.py b/deepmd/entrypoints/main.py
index 949797ea8b..5546ca15cd 100644
--- a/deepmd/entrypoints/main.py
+++ b/deepmd/entrypoints/main.py
@@ -20,6 +20,8 @@
 )
 from deepmd.loggers import set_log_handles
 
+from deepmd.nvnmd.entrypoints.train import train_nvnmd
+
 __all__ = ["main", "parse_args", "get_ll", "main_parser"]
 
 
@@ -204,6 +206,13 @@ def main_parser() -> argparse.ArgumentParser:
         default=None,
         help="the frozen nodes, if not set, determined from the model type",
     )
+    parser_frz.add_argument(
+        "-w",
+        "--nvnmd-weight",
+        type=str,
+        default=None,
+        help="the name of weight file (.npy), if set, save the model's weight into the file",
+    )
 
     # * test script ********************************************************************
     parser_tst = subparsers.add_parser(
@@ -436,9 +445,28 @@ def main_parser() -> argparse.ArgumentParser:
         required=True,
         help="type map",
     )
-        
+
     # --version
     parser.add_argument('--version', action='version', version='DeePMD-kit v%s' % __version__)
+
+    # * train nvnmd script ******************************************************************
+    parser_train_nvnmd = subparsers.add_parser(
+        "train-nvnmd",
+        parents=[parser_log],
+        help="train nvnmd model",
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+    )
+    parser_train_nvnmd.add_argument(
+        "INPUT", help="the input parameter file in json format"
+    )
+    parser_train_nvnmd.add_argument(
+        "-s",
+        "--step",
+        default="s1",
+        type=str,
+        choices=['s1', 's2'],
+        help="steps to train model of NVNMD: s1 (train CNN), s2 (train QNN)"
+    )
     return parser
 
 
@@ -504,6 +532,8 @@ def main():
         convert(**dict_args)
     elif args.command == "neighbor-stat":
         neighbor_stat(**dict_args)
+    elif args.command == "train-nvnmd":  # nvnmd
+        train_nvnmd(**dict_args)
     elif args.command is None:
         pass
     else:
diff --git a/deepmd/env.py b/deepmd/env.py
index db19fd86c6..5415ff75a5 100644
--- a/deepmd/env.py
+++ b/deepmd/env.py
@@ -281,13 +281,21 @@ def get_module(module_name: str) -> "ModuleType":
                         TF_VERSION,
                         tf_py_version,
                     )) from e
-            raise RuntimeError(
+            error_message = (
                 "This deepmd-kit package is inconsitent with TensorFlow "
                 "Runtime, thus an error is raised when loading %s. "
                 "You need to rebuild deepmd-kit against this TensorFlow "
                 "runtime." % (
                     module_name,
-                )) from e
+                )
+            )
+            if TF_CXX11_ABI_FLAG == 1:
+                # #1791
+                error_message += (
+                    "\nWARNING: devtoolset on RHEL6 and RHEL7 does not support _GLIBCXX_USE_CXX11_ABI=1. "
+                    "See https://bugzilla.redhat.com/show_bug.cgi?id=1546704"
+                )
+            raise RuntimeError(error_message) from e
         return module
 
 
diff --git a/deepmd/fit/ener.py b/deepmd/fit/ener.py
index 4084281865..61d70045d8 100644
--- a/deepmd/fit/ener.py
+++ b/deepmd/fit/ener.py
@@ -5,7 +5,8 @@
 
 from deepmd.env import tf
 from deepmd.common import add_data_requirement, get_activation_func, get_precision, cast_precision
-from deepmd.utils.network import one_layer, one_layer_rand_seed_shift
+from deepmd.utils.network import one_layer_rand_seed_shift
+from deepmd.utils.network import one_layer as one_layer_deepmd
 from deepmd.utils.type_embed import embed_atom_type
 from deepmd.utils.graph import get_fitting_net_variables_from_graph_def, load_graph_def, get_tensor_by_name_from_graph
 from deepmd.fit.fitting import Fitting
@@ -13,6 +14,9 @@
 from deepmd.env import global_cvt_2_tf_float
 from deepmd.env import GLOBAL_TF_FLOAT_PRECISION, TF_VERSION
 
+from deepmd.nvnmd.utils.config import nvnmd_cfg
+from deepmd.nvnmd.fit.ener import one_layer_nvnmd
+
 class EnerFitting (Fitting):
     r"""Fitting the energy of the system. The force and the virial can also be trained.
 
@@ -291,8 +295,12 @@ def _build_lower(
             ext_aparam = tf.cast(ext_aparam,self.fitting_precision)
             layer = tf.concat([layer, ext_aparam], axis = 1)
 
+        if nvnmd_cfg.enable: 
+            one_layer = one_layer_nvnmd
+        else:
+            one_layer = one_layer_deepmd
         for ii in range(0,len(self.n_neuron)) :
-            if ii >= 1 and self.n_neuron[ii] == self.n_neuron[ii-1] :
+            if ii >= 1 and self.n_neuron[ii] == self.n_neuron[ii-1] and (not nvnmd_cfg.enable):
                 layer+= one_layer(
                     layer,
                     self.n_neuron[ii],
diff --git a/deepmd/infer/deep_eval.py b/deepmd/infer/deep_eval.py
index 0d7355c89b..4da6578159 100644
--- a/deepmd/infer/deep_eval.py
+++ b/deepmd/infer/deep_eval.py
@@ -1,5 +1,6 @@
 import os
 from typing import List, Optional, TYPE_CHECKING, Union
+from functools import lru_cache
 
 import numpy as np
 from deepmd.common import make_default_mesh
@@ -27,8 +28,6 @@ class DeepEval:
         as the initial batch size.
     """
 
-    _model_type: Optional[str] = None
-    _model_version: Optional[str] = None
     load_prefix: str  # set by subclass
 
     def __init__(
@@ -64,19 +63,19 @@ def __init__(
             raise TypeError("auto_batch_size should be bool, int, or AutoBatchSize")
 
     @property
+    @lru_cache(maxsize=None)
     def model_type(self) -> str:
         """Get type of model.
 
         :type:str
         """
-        if not self._model_type:
-            t_mt = self._get_tensor("model_attr/model_type:0")
-            sess = tf.Session(graph=self.graph, config=default_tf_session_config)
-            [mt] = run_sess(sess, [t_mt], feed_dict={})
-            self._model_type = mt.decode("utf-8")
-        return self._model_type
+        t_mt = self._get_tensor("model_attr/model_type:0")
+        sess = tf.Session(graph=self.graph, config=default_tf_session_config)
+        [mt] = run_sess(sess, [t_mt], feed_dict={})
+        return mt.decode("utf-8")
 
     @property
+    @lru_cache(maxsize=None)
     def model_version(self) -> str:
         """Get version of model.
 
@@ -85,17 +84,15 @@ def model_version(self) -> str:
         str
             version of model
         """
-        if not self._model_version:
-            try:
-                t_mt = self._get_tensor("model_attr/model_version:0")
-            except KeyError:
-                # For deepmd-kit version 0.x - 1.x, set model version to 0.0
-                self._model_version = "0.0"
-            else:
-                sess = tf.Session(graph=self.graph, config=default_tf_session_config)
-                [mt] = run_sess(sess, [t_mt], feed_dict={})
-                self._model_version = mt.decode("utf-8")
-        return self._model_version    
+        try:
+            t_mt = self._get_tensor("model_attr/model_version:0")
+        except KeyError:
+            # For deepmd-kit version 0.x - 1.x, set model version to 0.0
+            return "0.0"
+        else:
+            sess = tf.Session(graph=self.graph, config=default_tf_session_config)
+            [mt] = run_sess(sess, [t_mt], feed_dict={})
+            return mt.decode("utf-8")
 
     def _graph_compatable(
         self
diff --git a/deepmd/nvnmd/__init__.py b/deepmd/nvnmd/__init__.py
new file mode 100644
index 0000000000..f3cdaf13e5
--- /dev/null
+++ b/deepmd/nvnmd/__init__.py
@@ -0,0 +1,10 @@
+
+from . import data, descriptor, entrypoints, fit, utils
+
+__all__ = [
+    "data",
+    "descriptor",
+    "entrypoints",
+    "fit",
+    "utils",
+]
diff --git a/deepmd/nvnmd/data/__init__.py b/deepmd/nvnmd/data/__init__.py
new file mode 100644
index 0000000000..21c208e404
--- /dev/null
+++ b/deepmd/nvnmd/data/__init__.py
@@ -0,0 +1,44 @@
+"""
+nvnmd.data
+==========
+
+Provides
+    1. hardware configuration
+    2. default input script
+    3. title and citation
+
+Data
+----
+
+jdata_sys 
+    action configuration
+jdata_config 
+    hardware configuration
+
+    dscp 
+        descriptor configuration
+    fitn 
+        fitting network configuration
+    size 
+        ram capacity
+    ctrl 
+        control flag, such as Time Division Multiplexing (TDM)
+    nbit 
+        number of bits of fixed-point number
+jdata_config_16 (disable)
+    differences from jdata_config when the fitting network size is 16
+jdata_config_32 (disable)
+    differences from jdata_config when the fitting network size is 32
+jdata_config_64 (disable)
+    differences from jdata_config when the fitting network size is 64
+jdata_config_128 (default)
+    differences from jdata_config when the fitting network size is 128
+jdata_configs
+    dictionary collecting all the jdata_config_{nfit_node} variants
+jdata_deepmd_input 
+    default input script for nvnmd training
+NVNMD_WELCOME 
+    nvnmd title when logging
+NVNMD_CITATION 
+    citation of nvnmd
+"""
diff --git a/deepmd/nvnmd/data/data.py b/deepmd/nvnmd/data/data.py
new file mode 100644
index 0000000000..0013c7f226
--- /dev/null
+++ b/deepmd/nvnmd/data/data.py
@@ -0,0 +1,283 @@
+
+jdata_sys = {  # "action configuration" per the deepmd.nvnmd.data package docstring
+    "debug": False
+}
+
+jdata_config = {  # "hardware configuration" per the deepmd.nvnmd.data package docstring
+    "dscp": {  # descriptor configuration
+        "sel": [60, 60],
+        "rcut": 6.0,
+        "rcut_smth": 0.5,
+        "neuron": [8, 16, 32],
+        "resnet_dt": False,
+        "axis_neuron": 4,
+        "type_one_side": True,
+
+        "NI": 128,
+        "rc_lim": 0.5,
+        "M1": "neuron[-1]",  # string values like this look like formulas over sibling entries; presumably resolved elsewhere — TODO confirm
+        "M2": "axis_neuron",
+        "SEL": [60, 60, 0, 0],
+        "NNODE_FEAS": "(1, neuron)",
+        "nlayer_fea": "len(neuron)",
+        "same_net": "type_one_side",
+        "NIDP": "sum(sel)",
+        "NIX": "2^ceil(ln2(NIDP/1.5))",
+        "ntype": "len(sel)",
+        "ntypex": "same_net ? 1: ntype",
+        "ntypex_max": 1,
+        "ntype_max": 4
+    },
+
+    "fitn": {  # fitting network configuration
+        "neuron": [32, 32, 32],
+        "resnet_dt": False,
+
+        "NNODE_FITS": "(M1*M2, neuron, 1)",
+        "nlayer_fit": "len(neuron)+1",
+        "NLAYER": "nlayer_fit"
+    },
+
+    "size": {  # ram capacity
+        "NTYPE_MAX": 4,
+        "NSPU": 4096,
+        "MSPU": 32768,
+        "Na": "NSPU",
+        "NaX": "MSPU"
+    },
+
+    "ctrl": {  # control flags, such as Time Division Multiplexing (TDM)
+        "NSTDM": 16,
+        "NSTDM_M1": 16,
+        "NSTDM_M2": 1,
+        "NSADV": "NSTDM+1",
+        "NSEL": "NSTDM*ntype_max",
+        "NSTDM_M1X": 4,
+        "NSTEP_DELAY": 20,
+        "MAX_FANOUT": 30
+    },
+
+    "nbit": {  # number of bits of the fixed-point numbers
+        "NBIT_DATA": 21,
+        "NBIT_DATA_FL": 13,
+        "NBIT_LONG_DATA": 32,
+        "NBIT_LONG_DATA_FL": 24,
+        "NBIT_DIFF_DATA": 24,
+
+        "NBIT_SPE": 2,
+        "NBIT_CRD": "NBIT_DATA*3",
+        "NBIT_LST": "ln2(NaX)",
+
+        "NBIT_SPE_MAX": 8,
+        "NBIT_LST_MAX": 16,
+
+        "NBIT_ATOM": "NBIT_SPE+NBIT_CRD",
+        "NBIT_LONG_ATOM": "NBIT_SPE+NBIT_LONG_DATA*3",
+
+        "NBIT_RIJ": "NBIT_DATA_FL+5",
+        "NBIT_FEA_X": 10,
+        "NBIT_FEA_X_FL": 4,
+        "NBIT_FEA_X2_FL": 6,
+        "NBIT_FEA": 18,
+        "NBIT_FEA_FL": 10,
+        "NBIT_SHIFT": 4,
+
+        "NBIT_DATA2": "NBIT_DATA+NBIT_DATA_FL",
+        "NBIT_DATA2_FL": "2*NBIT_DATA_FL",
+        "NBIT_DATA_FEA": "NBIT_DATA+NBIT_FEA_FL",
+        "NBIT_DATA_FEA_FL": "NBIT_DATA_FL+NBIT_FEA_FL",
+
+        "NBIT_FORCE": 32,
+        "NBIT_FORCE_FL": "2*NBIT_DATA_FL-1",
+
+        "NBIT_SUM": "NBIT_DATA_FL+8",
+        "NBIT_WEIGHT": 18,
+        "NBIT_WEIGHT_FL": 13,
+
+        "NBIT_RAM": 72,
+        "NBIT_ADDR": 32,
+
+        "NBTI_MODEL_HEAD": 32,  # NOTE(review): spelled "NBTI" while sibling keys use "NBIT"; check consumers before renaming
+
+        "NBIT_TH_LONG_ADD": 30,
+        "NBIT_ADD": 15,
+
+        "RANGE_B": [-100, 100],
+        "RANGE_W": [-20, 20],
+
+        "NCFG": 35,
+        "NNET": 4920,
+        "NFEA": 8192
+    },
+
+    "end": ""
+}
+
+jdata_config_16 = {  # variant with fitting-net "neuron" [16, 16, 16]; marked "(disable)" in the package docstring
+    "dscp": {
+        "neuron": [8, 16, 32],
+        "axis_neuron": 4,
+        "NI": 128
+    },
+
+    "fitn": {
+        "neuron": [16, 16, 16]
+    },
+
+    "ctrl": {
+        "NSTDM": 16,
+        "NSTDM_M1": 16,
+        "NSTDM_M2": 1,
+        "NSTDM_M1X": 4
+    }
+}
+
+jdata_config_32 = {  # variant with fitting-net "neuron" [32, 32, 32]; marked "(disable)" in the package docstring
+    "dscp": {
+        "neuron": [8, 16, 32],
+        "axis_neuron": 4,
+        "NI": 128
+    },
+
+    "fitn": {
+        "neuron": [32, 32, 32]
+    },
+
+    "ctrl": {
+        "NSTDM": 16,
+        "NSTDM_M1": 16,
+        "NSTDM_M2": 1,
+        "NSTDM_M1X": 4
+    }
+}
+
+jdata_config_64 = {  # variant with fitting-net "neuron" [64, 64, 64]; marked "(disable)" in the package docstring
+    "dscp": {
+        "neuron": [8, 16, 32],
+        "axis_neuron": 4,
+        "NI": 128
+    },
+
+    "fitn": {
+        "neuron": [64, 64, 64]
+    },
+
+    "ctrl": {
+        "NSTDM": 32,  # this variant uses 32 where the 16/32 variants use 16
+        "NSTDM_M1": 32,
+        "NSTDM_M2": 1,
+        "NSTDM_M1X": 4
+    }
+}
+
+jdata_config_128 = {  # variant with fitting-net "neuron" [128, 128, 128]; marked "(default)" in the package docstring
+    "dscp": {
+        "neuron": [8, 16, 32],
+        "axis_neuron": 4,
+        "NI": 128
+    },
+
+    "fitn": {
+        "neuron": [128, 128, 128]
+    },
+
+    "ctrl": {
+        "NSTDM": 32,  # this variant uses 32 where the 16/32 variants use 16
+        "NSTDM_M1": 32,
+        "NSTDM_M2": 1,
+        "NSTDM_M1X": 4
+    }
+}
+
+jdata_configs = {  # lookup of the size variants; every key is three characters long
+    "_16": jdata_config_16,  # two-digit sizes carry a leading underscore
+    "_32": jdata_config_32,
+    "_64": jdata_config_64,
+    "128": jdata_config_128  # no underscore for the three-digit size
+}
+
+jdata_deepmd_input = {  # "default input script for nvnmd training" per the package docstring
+    "model": {
+        "descriptor": {
+            "seed": 1,
+            "type": "se_a",
+            "sel": [
+                60,
+                60
+            ],
+            "rcut": 7.0,  # NOTE(review): jdata_config["dscp"]["rcut"] is 6.0 — confirm the difference is intended
+            "rcut_smth": 0.5,
+            "neuron": [
+                8,
+                16,
+                32
+            ],
+            "type_one_side": False,  # NOTE(review): jdata_config["dscp"]["type_one_side"] is True — confirm
+            "axis_neuron": 4,
+            "resnet_dt": False
+        },
+        "fitting_net": {
+            "seed": 1,
+            "neuron": [
+                128,
+                128,
+                128
+            ],
+            "resnet_dt": False
+        }
+    },
+    "nvnmd": {  # every nvnmd switch defaults to off; the file entries default to the string "none"
+        "net_size": 128,
+        "config_file": "none",
+        "weight_file": "none",
+        "map_file": "none",
+        "enable": False,
+        "restore_descriptor": False,
+        "restore_fitting_net": False,
+        "quantize_descriptor": False,
+        "quantize_fitting_net": False
+    },
+    "learning_rate": {
+        "type": "exp",
+        "decay_steps": 5000,
+        "start_lr": 0.005,
+        "stop_lr": 8.257687192506788e-05
+    },
+    "loss": {
+        "start_pref_e": 0.02,
+        "limit_pref_e": 1,
+        "start_pref_f": 1000,
+        "limit_pref_f": 1,
+        "start_pref_v": 0,
+        "limit_pref_v": 0
+    },
+    "training": {
+        "seed": 1,
+        "stop_batch": 10000,
+        "disp_file": "lcurve.out",
+        "disp_freq": 100,
+        "numb_test": 10,
+        "save_freq": 1000,
+        "save_ckpt": "model.ckpt",
+        "disp_training": True,
+        "time_training": True,
+        "profiling": False,
+        "training_data": {
+            "systems": "dataset",
+            "set_prefix": "set",
+            "batch_size": 1
+        }
+    }
+}
+NVNMD_WELCOME = (  # ASCII-art banner; raw strings keep each backslash literal without invalid-escape warnings
+    r" _   _  __     __  _   _   __  __   ____  ",
+    r"| \ | | \ \   / / | \ | | |  \/  | |  _ \ ",
+    r"|  \| |  \ \ / /  |  \| | | |\/| | | | | |",
+    r"| |\  |   \ V /   | |\  | | |  | | | |_| |",
+    r"|_| \_|    \_/    |_| \_| |_|  |_| |____/ ",
+)
+
+NVNMD_CITATION = (  # citation lines for NVNMD, one string per line of text
+    "Please read and cite:",
+    "Mo et al., npj Comput Mater 8, 107 (2022)",
+)
diff --git a/deepmd/nvnmd/descriptor/__init__.py b/deepmd/nvnmd/descriptor/__init__.py
new file mode 100644
index 0000000000..f01cd22222
--- /dev/null
+++ b/deepmd/nvnmd/descriptor/__init__.py
@@ -0,0 +1,9 @@
+"""
+nvnmd.descriptor
+================
+
+Provides
+    1. building descriptor with continuous embedding network
+    2. building descriptor with quantized embedding network
+
+"""
diff --git a/deepmd/nvnmd/descriptor/se_a.py b/deepmd/nvnmd/descriptor/se_a.py
new file mode 100644
index 0000000000..92f3332868
--- /dev/null
+++ b/deepmd/nvnmd/descriptor/se_a.py
@@ -0,0 +1,280 @@
+import numpy as np
+
+from deepmd.env import tf
+from deepmd.env import GLOBAL_TF_FLOAT_PRECISION
+from deepmd.env import GLOBAL_NP_FLOAT_PRECISION
+from deepmd.env import op_module
+from deepmd.utils.network import embedding_net
+
+
+#
+from deepmd.nvnmd.utils.config import nvnmd_cfg
+from deepmd.nvnmd.utils.network import matmul3_qq
+from deepmd.nvnmd.utils.weight import get_normalize, get_rng_s
+
+
+def build_davg_dstd():
+    r"""Return ``(davg, dstd)`` read from ``nvnmd_cfg.weight`` via ``get_normalize``
+    (values described upstream as obtained by training the CNN)."""
+    weights = nvnmd_cfg.weight
+    davg, dstd = get_normalize(weights)
+    return davg, dstd
+
+
+def build_op_descriptor():
+    r"""Pick the environment-matrix op used in place of se_a.py/DescrptSeA/build's default:
+    the quantized variant when ``nvnmd_cfg.quantize_descriptor`` is truthy.
+    """
+    if not nvnmd_cfg.quantize_descriptor:
+        return op_module.prod_env_mat_a
+    return op_module.prod_env_mat_a_nvnmd_quantize
+
+
+def descrpt2r4(inputs, natoms):
+    r"""Replace :math:`r_{ji} \rightarrow r'_{ji}` through the NVNMD quantize and map-table operators,
+    where :math:`r_{ji} = (x_{ji}, y_{ji}, z_{ji})` and
+    :math:`r'_{ji} = (s_{ji}, \frac{s_{ji} x_{ji}}{r_{ji}}, \frac{s_{ji} y_{ji}}{r_{ji}}, \frac{s_{ji} z_{ji}}{r_{ji}})`
+    Returns a tensor reshaped to ``[-1, NIDP * 4]``; ``natoms[0]`` and ``natoms[2 + type_i]`` size the per-type slices."""
+    NBIT_DATA_FL = nvnmd_cfg.nbit['NBIT_DATA_FL']
+    NBIT_FEA_X_FL = nvnmd_cfg.nbit['NBIT_FEA_X_FL']
+    NBIT_FEA_FL = nvnmd_cfg.nbit['NBIT_FEA_FL']
+    prec = 1.0 / (2 ** NBIT_FEA_X_FL)  # step size passed to the map op and used to rescale the table parameters
+
+    ntypes = nvnmd_cfg.dscp['ntype']
+    NIDP = nvnmd_cfg.dscp['NIDP']
+    ndescrpt = NIDP * 4
+    start_index = 0  # running atom offset, advanced by natoms[2 + type_i] after each type
+
+    # (nf, na*nd)
+    shape = inputs.get_shape().as_list()  # NOTE(review): `shape` is not used anywhere below in this function
+    # (nf*na*ni, 4)
+    inputs_reshape = tf.reshape(inputs, [-1, 4])
+
+    with tf.variable_scope('filter_type_all_x', reuse=True):
+        # u (i.e., r^2)
+        u = tf.reshape(tf.slice(inputs_reshape, [0, 0], [-1, 1]), [-1, 1])
+        with tf.variable_scope('u', reuse=True):
+            u = op_module.quantize_nvnmd(u, 0, -1, NBIT_DATA_FL, -1)
+        # regroup u so that each row holds natoms[0] * NIDP entries, ready for per-type slicing
+        u = tf.reshape(u, [-1, natoms[0] * NIDP])
+        # rij
+        rij = tf.reshape(tf.slice(inputs_reshape, [0, 1], [-1, 3]), [-1, 3])
+        with tf.variable_scope('rij', reuse=True):
+            rij = op_module.quantize_nvnmd(rij, 0, NBIT_DATA_FL, -1, -1)
+        # per-type outputs of the map op are collected in s / sr and concatenated after the loop
+        s = []
+        sr = []
+        for type_i in range(ntypes):
+            type_input = 0  # fixed, so the table postfix is always "_t0_t{type_i}"
+            postfix = f"_t{type_input}_t{type_i}"
+            u_i = tf.slice(
+                u,
+                [0, start_index * NIDP],
+                [-1, natoms[2 + type_i] * NIDP])
+            u_i = tf.reshape(u_i, [-1, 1])
+            # look up the tables "s"/"sr" and "ds_dr2"/"dsr_dr2" (plus postfix) from nvnmd_cfg.map
+            keys = 's,sr'.split(',')
+            map_tables = [nvnmd_cfg.map[key + postfix] for key in keys]
+            map_tables2 = [nvnmd_cfg.map[f"d{key}_dr2" + postfix] for key in keys]
+            map_outs = []
+            for ii in range(len(keys)):
+                map_outs.append(op_module.map_nvnmd(
+                    u_i,
+                    map_tables[ii][0],
+                    map_tables[ii][1] / prec,
+                    map_tables2[ii][0],
+                    map_tables2[ii][1] / prec,
+                    prec, NBIT_FEA_FL))
+
+            s_i, sr_i = map_outs  # same order as `keys`: first "s", then "sr"
+            s_i = tf.reshape(s_i, [-1, natoms[2 + type_i] * NIDP])
+            sr_i = tf.reshape(sr_i, [-1, natoms[2 + type_i] * NIDP])
+            s.append(s_i)
+            sr.append(sr_i)
+            start_index += natoms[2 + type_i]
+
+        s = tf.concat(s, axis=1)
+        sr = tf.concat(sr, axis=1)
+
+        with tf.variable_scope('s', reuse=True):
+            s = op_module.quantize_nvnmd(s, 0, NBIT_FEA_FL, NBIT_DATA_FL, -1)
+
+        with tf.variable_scope('sr', reuse=True):
+            sr = op_module.quantize_nvnmd(sr, 0, NBIT_FEA_FL, NBIT_DATA_FL, -1)
+
+        s = tf.reshape(s, [-1, 1])
+        sr = tf.reshape(sr, [-1, 1])
+
+        # R2R4
+        Rs = s
+        Rxyz = sr * rij  # [-1, 1] times [-1, 3]: sr scales each of the three rij columns
+        with tf.variable_scope('Rxyz', reuse=True):
+            Rxyz = op_module.quantize_nvnmd(Rxyz, 0, NBIT_DATA_FL, NBIT_DATA_FL, -1)
+        R4 = tf.concat([Rs, Rxyz], axis=1)
+        R4 = tf.reshape(R4, [-1, NIDP, 4])
+        inputs_reshape = R4
+        inputs_reshape = tf.reshape(inputs_reshape, [-1, ndescrpt])
+    return inputs_reshape
+
+
+def filter_lower_R42GR(
+        type_i,
+        type_input,
+        inputs_i,
+        is_exclude,
+        activation_fn,
+        bavg,
+        stddev,
+        trainable,
+        suffix,
+        seed,
+        seed_shift,
+        uniform_seed,
+        filter_neuron,
+        filter_precision,
+        filter_resnet_dt,
+        embedding_net_variables):
+    r"""Replace se_a.py/DescrptSeA/_filter_lower: with ``nvnmd_cfg.quantize_descriptor`` set, G comes from map tables
+    and the result is reshaped to ``[-1, 4 * M1]``; otherwise G comes from ``embedding_net`` (zeros when ``is_exclude``)."""
+    shape_i = inputs_i.get_shape().as_list()  # static shape; shape_i[1] // 4 is used as the middle dimension below
+    inputs_reshape = tf.reshape(inputs_i, [-1, 4])
+    natom = tf.shape(inputs_i)[0]
+    M1 = nvnmd_cfg.dscp['M1']
+
+    NBIT_DATA_FL = nvnmd_cfg.nbit['NBIT_DATA_FL']
+    NBIT_FEA_X_FL = nvnmd_cfg.nbit['NBIT_FEA_X_FL']  # NOTE(review): read but not used below in this function
+    NBIT_FEA_X2_FL = nvnmd_cfg.nbit['NBIT_FEA_X2_FL']
+    NBIT_FEA_FL = nvnmd_cfg.nbit['NBIT_FEA_FL']
+    prec = 1.0 / (2 ** NBIT_FEA_X2_FL)
+    type_input = 0 if (type_input < 0) else type_input  # negative type_input is mapped to 0 for the table postfix
+    postfix = f"_t{type_input}_t{type_i}"
+
+    if (nvnmd_cfg.quantize_descriptor):
+        s_min, smax = get_rng_s(nvnmd_cfg.weight)  # NOTE(review): smax is unused and s_min is overwritten on the next line
+        s_min = -2.0
+        # s_min = np.floor(s_min)
+        s = tf.reshape(tf.slice(inputs_reshape, [0, 0], [-1, 1]), [-1, 1])
+        s = op_module.quantize_nvnmd(s, 0, NBIT_FEA_FL, NBIT_DATA_FL, -1)
+        # G: looked up from the map tables keyed "G" + postfix and "dG_ds" + postfix
+        keys = 'G'.split(',')
+        map_tables = [nvnmd_cfg.map[key + postfix] for key in keys]
+        map_tables2 = [nvnmd_cfg.map[f"d{key}_ds" + postfix] for key in keys]
+        map_outs = []
+        for ii in range(len(keys)):
+            with tf.variable_scope(keys[ii], reuse=True):
+                map_outs.append(op_module.map_nvnmd(
+                    s - s_min,
+                    map_tables[ii][0], map_tables[ii][1] / prec,
+                    map_tables2[ii][0], map_tables2[ii][1] / prec,
+                    prec, NBIT_FEA_FL))
+                map_outs[ii] = op_module.quantize_nvnmd(map_outs[ii], 0, NBIT_FEA_FL, NBIT_DATA_FL, -1)
+        G = map_outs
+        # map_outs is a one-element list here because keys == ['G']; the reshape below consumes it
+        xyz_scatter = G
+        xyz_scatter = tf.reshape(xyz_scatter, (-1, shape_i[1] // 4, M1))
+        # GR: the transposed 4-column input combined with G through matmul3_qq
+        inputs_reshape = tf.reshape(inputs_reshape, [-1, shape_i[1] // 4, 4])
+        GR = matmul3_qq(tf.transpose(inputs_reshape, [0, 2, 1]), xyz_scatter, -1)
+        GR = tf.reshape(GR, [-1, 4 * M1])
+        return GR
+
+    else:
+        xyz_scatter = tf.reshape(tf.slice(inputs_reshape, [0, 0], [-1, 1]), [-1, 1])
+        if nvnmd_cfg.restore_descriptor:
+            trainable = False  # restored weights are passed as initial variables with training switched off
+            embedding_net_variables = {}
+            for key in nvnmd_cfg.weight.keys():
+                if 'filter_type' in key:
+                    key2 = key.replace('.', '/')  # stored keys use '.', the initial-variable names use '/'
+                    embedding_net_variables[key2] = nvnmd_cfg.weight[key]
+
+        if (not is_exclude):
+            xyz_scatter = embedding_net(
+                xyz_scatter,
+                filter_neuron,
+                filter_precision,
+                activation_fn=activation_fn,
+                resnet_dt=filter_resnet_dt,
+                name_suffix=suffix,
+                stddev=stddev,
+                bavg=bavg,
+                seed=seed,
+                trainable=trainable,
+                uniform_seed=uniform_seed,
+                initial_variables=embedding_net_variables)
+            if (not uniform_seed) and (seed is not None):
+                seed += seed_shift  # NOTE(review): only the local `seed` changes; it is not read again or returned here
+        else:
+            # we can safely return the final xyz_scatter filled with zero directly
+            return tf.cast(tf.fill((natom, 4, M1), 0.), GLOBAL_TF_FLOAT_PRECISION)
+        # natom x nei_type_i x out_size
+        xyz_scatter = tf.reshape(xyz_scatter, (-1, shape_i[1] // 4, M1))
+        # When using tf.reshape(inputs_i, [-1, shape_i[1]//4, 4]) below
+        # [588 24] -> [588 6 4] correct
+        # but if sel is zero
+        # [588 0] -> [147 0 4] incorrect; the correct one is [588 0 4]
+        # So we need to explicitly assign the shape to tf.shape(inputs_i)[0] instead of -1
+        return tf.matmul(tf.reshape(inputs_i, [natom, shape_i[1] // 4, 4]), xyz_scatter, transpose_a=True)
+
+
+def filter_GR2D(xyz_scatter_1):
+    r"""Replace se_a.py/_filter
+
+    Scales the input by ``1 / NIX``, multiplies it with its own transpose and
+    keeps ``M2`` entries from every row of the ``M1 x M1`` product. When
+    ``nvnmd_cfg.quantize_descriptor`` is set, the intermediate tensors are
+    additionally passed through ``op_module.quantize_nvnmd``.
+
+    Parameters
+    ----------
+    xyz_scatter_1
+        tensor that is treated as ``[natom, 4, M1]``
+        (the quantized branch reshapes it to that shape explicitly)
+
+    Returns
+    -------
+    result
+        tensor of shape ``[natom, M1 * M2]``
+    qmat
+        rows 1..3 of the scaled input, transposed to ``[natom, M1, 3]``
+    """
+    NIX = nvnmd_cfg.dscp['NIX']
+    NBIT_DATA_FL = nvnmd_cfg.nbit['NBIT_DATA_FL']
+    M1 = nvnmd_cfg.dscp['M1']
+    M2 = nvnmd_cfg.dscp['M2']
+
+    if (nvnmd_cfg.quantize_descriptor):
+        # flatten to natom x (4 * M1) for the quantization ops
+        xyz_scatter_1 = tf.reshape(xyz_scatter_1, [-1, 4 * M1])
+        # fix the number of bits of gradient
+        xyz_scatter_1 = op_module.quantize_nvnmd(xyz_scatter_1, 0, -1, NBIT_DATA_FL, -1)
+        xyz_scatter_1 = xyz_scatter_1 * (1.0 / NIX)
+        with tf.variable_scope('GR', reuse=True):
+            xyz_scatter_1 = op_module.quantize_nvnmd(xyz_scatter_1, 0, NBIT_DATA_FL, NBIT_DATA_FL, -1)
+        xyz_scatter_1 = tf.reshape(xyz_scatter_1, [-1, 4, M1])
+
+        # natom x 4 x M1 (the same tensor is used for both factors of the product)
+        xyz_scatter_2 = xyz_scatter_1
+        # natom x 3 x M1 (rows 1..3 of the second axis)
+        qmat = tf.slice(xyz_scatter_1, [0, 1, 0], [-1, 3, -1])
+        # natom x M1 x 3
+        qmat = tf.transpose(qmat, perm=[0, 2, 1])
+        # D': natom x M1 x M1
+        result = tf.matmul(xyz_scatter_1, xyz_scatter_2, transpose_a=True)
+        # D': natom x (M1 x M1)
+        result = tf.reshape(result, [-1, M1 * M1])
+        # for every row ii keep the M2 columns ii .. ii + M2 - 1, wrapping modulo M1
+        index_subset = []
+        for ii in range(M1):
+            for jj in range(ii, ii + M2):
+                index_subset.append((ii * M1) + (jj % M1))
+        index_subset = tf.constant(np.int32(np.array(index_subset)))
+        result = tf.gather(result, index_subset, axis=1)
+
+        with tf.variable_scope('d', reuse=True):
+            result = op_module.quantize_nvnmd(result, 0, NBIT_DATA_FL, NBIT_DATA_FL, -1)
+    else:
+        # natom x 4 x M1
+        xyz_scatter_1 = xyz_scatter_1 * (1.0 / NIX)
+        # natom x 4 x M1 (the same tensor is used for both factors of the product)
+        # xyz_scatter_2 = tf.slice(xyz_scatter_1, [0,0,0],[-1,-1,outputs_size_2])
+        xyz_scatter_2 = xyz_scatter_1
+        # natom x 3 x M1 (rows 1..3 of the second axis)
+        qmat = tf.slice(xyz_scatter_1, [0, 1, 0], [-1, 3, -1])
+        # natom x M1 x 3
+        qmat = tf.transpose(qmat, perm=[0, 2, 1])
+        # natom x M1 x M1
+        result = tf.matmul(xyz_scatter_1, xyz_scatter_2, transpose_a=True)
+        # natom x (M1 x M1)
+        # result = tf.reshape(result, [-1, outputs_size_2 * outputs_size[-1]])
+        result = tf.reshape(result, [-1, M1 * M1])
+        # for every row ii keep the M2 columns ii .. ii + M2 - 1, wrapping modulo M1
+        index_subset = []
+        for ii in range(M1):
+            for jj in range(ii, ii + M2):
+                index_subset.append((ii * M1) + (jj % M1))
+        index_subset = tf.constant(np.int32(np.array(index_subset)))
+        result = tf.gather(result, index_subset, axis=1)
+
+    return result, qmat
diff --git a/deepmd/nvnmd/entrypoints/__init__.py b/deepmd/nvnmd/entrypoints/__init__.py
new file mode 100644
index 0000000000..037c74d76a
--- /dev/null
+++ b/deepmd/nvnmd/entrypoints/__init__.py
@@ -0,0 +1,9 @@
+from .freeze import save_weight
+from .mapt import MapTable
+from .wrap import Wrap
+
+__all__ = [
+    "save_weight",
+    "MapTable",
+    "Wrap"
+]
diff --git a/deepmd/nvnmd/entrypoints/freeze.py b/deepmd/nvnmd/entrypoints/freeze.py
new file mode 100644
index 0000000000..81d987dd5b
--- /dev/null
+++ b/deepmd/nvnmd/entrypoints/freeze.py
@@ -0,0 +1,48 @@
+
+#!/usr/bin/env python3
+
+from deepmd.env import tf
+from deepmd.nvnmd.utils.fio import FioDic
+
+
+def filter_tensorVariableList(tensorVariableList) -> dict:
+    r"""Select the variables needed by NVNMD, keyed by their normalized name
+
+    A name is normalized by removing ``:0`` and replacing ``/`` with ``.``.
+    A variable is kept when its normalized name starts with one of the
+    prefixes of the patterns below and contains neither ``Adam`` nor ``XXX``.
+
+    | :code:`descrpt_attr/t_avg:0`
+    | :code:`descrpt_attr/t_std:0`
+    | :code:`filter_type_{atom i}/matrix_{layer l}_{atomj}:0`
+    | :code:`filter_type_{atom i}/bias_{layer l}_{atomj}:0`
+    | :code:`layer_{layer l}_type_{atom i}/matrix:0`
+    | :code:`layer_{layer l}_type_{atom i}/bias:0`
+    | :code:`final_layer_type_{atom i}/matrix:0`
+    | :code:`final_layer_type_{atom i}/bias:0`
+
+    Parameters
+    ----------
+    tensorVariableList
+        iterable of objects exposing a ``name`` attribute
+
+    Returns
+    -------
+    dict
+        normalized name -> variable object, in input order
+    """
+    # str.startswith accepts a tuple, so one call covers every prefix
+    prefixes = ('descrpt_attr', 'filter_type_', 'layer_', 'final_layer_type_')
+    excluded = ('Adam', 'XXX')
+
+    dic_name_tv = {}
+    for tv in tensorVariableList:
+        name = tv.name.replace(':0', '').replace('/', '.')
+        if not name.startswith(prefixes):
+            continue
+        if any(word in name for word in excluded):
+            continue
+        dic_name_tv[name] = tv
+    return dic_name_tv
+
+
+def save_weight(sess, file_name: str = 'nvnmd/weight.npy'):
+    r"""Evaluate the NVNMD-relevant global variables and save them to a npy file
+
+    Every variable kept by ``filter_tensorVariableList`` is run in ``sess``
+    and stored under its normalized name in the dictionary that is written
+    to ``file_name``.
+    """
+    selected = filter_tensorVariableList(tf.global_variables())
+    dic_key_value = {
+        key: sess.run(tensor_var) for key, tensor_var in selected.items()
+    }
+    FioDic().save(file_name, dic_key_value)
diff --git a/deepmd/nvnmd/entrypoints/mapt.py b/deepmd/nvnmd/entrypoints/mapt.py
new file mode 100644
index 0000000000..bbd73c79a6
--- /dev/null
+++ b/deepmd/nvnmd/entrypoints/mapt.py
@@ -0,0 +1,336 @@
+
+import numpy as np
+import logging
+
+from deepmd.env import tf
+from deepmd.utils.sess import run_sess
+
+from deepmd.nvnmd.utils.fio import FioDic
+from deepmd.nvnmd.utils.config import nvnmd_cfg
+from deepmd.nvnmd.utils.weight import get_normalize, get_rng_s, get_filter_weight
+from deepmd.nvnmd.utils.network import get_sess
+
+from deepmd.nvnmd.data.data import jdata_deepmd_input
+
+from typing import List, Optional
+
+log = logging.getLogger(__name__)
+
+
+class MapTable:
+    r"""Generate the mapping table describing the relationship of
+    atomic distance, cutoff function, and embedding matrix.
+
+    Three mapping tables will be built:
+
+    | :math:`r^2_{ji} \rightarrow s_{ji}`
+    | :math:`r^2_{ji} \rightarrow sr_{ji}`
+    | :math:`r^2_{ji} \rightarrow \mathcal{G}_{ji}`
+
+    where :math:`s_{ji}` is cut-off function,
+    :math:`sr_{ji} = \frac{s(r_{ji})}{r_{ji}}`, and
+    :math:`\mathcal{G}_{ji}` is embedding matrix.
+
+    The mapping function can be defined as:
+
+    | :math:`y = f(x) = y_{k} + (x - x_{k}) * dy_{k}`
+    | :math:`y_{k} = f(x_{k})`
+    | :math:`dy_{k} = \frac{f(x_{k+1}) - f(x_{k})}{dx}`
+    | :math:`x_{k} \leq x < x_{k+1}`
+    | :math:`x_{k} = k * dx`
+
+    where :math:`dx` is interpolation interval.
+
+    Parameters
+    ----------
+    config_file
+        input file name
+        an .npy file containing the configuration information of NVNMD model
+    weight_file
+        input file name
+        an .npy file containing the weights of NVNMD model
+    map_file
+        output file name
+        an .npy file containing the mapping tables of NVNMD model
+
+    References
+    ----------
+    DOI: 10.1038/s41524-022-00773-z
+    """
+
+    def __init__(
+            self,
+            config_file: str,
+            weight_file: str,
+            map_file: str
+    ):
+        r"""Store the file names and initialize ``nvnmd_cfg`` from them.
+
+        The mapping table itself is not built here; call ``build_map``.
+        """
+        self.config_file = config_file
+        self.weight_file = weight_file
+        self.map_file = map_file
+
+        # NOTE(review): this is the entry of the imported template itself, not a
+        # copy, so the assignments below also change jdata_deepmd_input['nvnmd']
+        # for later users -- confirm this is intended
+        jdata = jdata_deepmd_input['nvnmd']
+        jdata['config_file'] = config_file
+        jdata['weight_file'] = weight_file
+        jdata['enable'] = True
+
+        nvnmd_cfg.init_from_jdata(jdata)
+        # map_table = self.build_map()
+
+    def qqq(self, dat, NBIT_FEA_FL, NBIT_FEA_X, is_set_zero=False):
+        r"""Quantize sampled curves into value and forward-difference tables.
+
+        Parameters
+        ----------
+        dat
+            one array or a list of arrays; each is expected to hold
+            ``2**NBIT_FEA_X + 1`` samples along its first axis
+        NBIT_FEA_FL
+            samples are rounded to multiples of ``2**-NBIT_FEA_FL``
+        NBIT_FEA_X
+            every output table is reshaped to ``[2**NBIT_FEA_X, -1]``
+        is_set_zero
+            when true, the first entry of the difference table is set to 0
+
+        Returns
+        -------
+        list
+            one ``[v, dv]`` pair per input array, where ``v`` holds the rounded
+            samples ``0 .. N-1`` and ``dv[k] = v[k+1] - v[k]``
+        """
+        curves = dat if isinstance(dat, list) else [dat]
+        prec = 2 ** NBIT_FEA_FL
+        N = int(2 ** NBIT_FEA_X)
+
+        dat2 = []
+        for curve in curves:
+            # round both end points of every interval first, so that
+            # v[k] + dv[k] reproduces the rounded v[k+1] exactly
+            vi = np.round(curve[:-1] * prec) / prec
+            vi1 = np.round(curve[1:] * prec) / prec
+            # plain difference between neighbouring samples (not divided by the step)
+            dvi = vi1 - vi
+            if is_set_zero:
+                dvi[0] = 0
+            dat2.append([np.reshape(vp, [N, -1]) for vp in (vi, dvi)])
+        return dat2
+
+    def build_map(self):
+        r"""Build all mapping tables, save them to ``self.map_file`` and return them.
+
+        The tables produced by ``run_u2s`` and ``run_s2G`` are converted by
+        ``qqq`` into ``[value, difference]`` pairs and stored under keys of the
+        form ``{name}_t{tt}_t{tt2}``. The keys ``r2`` and ``s2`` hold the first
+        ``2**NBIT_FEA_X`` sample points. The result is also kept in ``self.map``.
+        """
+        ntypex = nvnmd_cfg.dscp['ntypex']
+        ntype = nvnmd_cfg.dscp['ntype']
+        NBIT_FEA_FL = nvnmd_cfg.nbit['NBIT_FEA_FL']
+        NBIT_FEA_X = nvnmd_cfg.nbit['NBIT_FEA_X']
+
+        dic = self.run_u2s()
+        dic.update(self.run_s2G(dic))
+
+        # quantize s and G
+        # NOTE(review): the trailing [0] selects only the first sample of each
+        # table here; qqq() below rounds every sample again
+        prec = 2**NBIT_FEA_FL
+        for tt in range(ntypex):
+            dic['s'][tt][0] = np.round(dic['s'][tt][0] * prec) / prec
+            dic['sr'][tt][0] = np.round(dic['sr'][tt][0] * prec) / prec
+            for tt2 in range(ntype):
+                v = np.round(dic['G'][tt * ntype + tt2][0] * prec) / prec
+                dic['G'][tt * ntype + tt2][0] = v
+
+        maps = {}
+        keys = 's,sr,ds_dr2,dsr_dr2,G,dG_ds'.split(',')
+        keys2 = 'G,dG_ds'.split(',')
+        for key in keys:
+            # the first difference entry is zeroed for every table except G and dG_ds
+            val = self.qqq(dic[key], NBIT_FEA_FL, NBIT_FEA_X, key not in keys2)
+            maps[key] = val
+
+        N = int(2**NBIT_FEA_X)
+        maps2 = {}
+        # drop the extra (N+1)-th sample point
+        maps2['r2'] = dic['r2'][0:N]
+        maps2['s2'] = dic['s2'][0:N]
+        # NOTE(review): build_r2s creates `ntype` tables for s/sr/ds_dr2/dsr_dr2
+        # while build_s2G creates `ntypex * ntype` for G/dG_ds; the shared index
+        # tt * ntype + tt2 stays in range for the former only when ntypex == 1 -- confirm
+        for tt in range(ntypex):
+            for tt2 in range(ntype):
+                postfix = f'_t{tt}_t{tt2}'
+                for key in keys:
+                    # entry 0: value table, entry 1: difference table
+                    maps2[key + postfix] = []
+                    maps2[key + postfix].append(maps[key][tt * ntype + tt2][0].reshape([N, -1]))
+                    maps2[key + postfix].append(maps[key][tt * ntype + tt2][1].reshape([N, -1]))
+        self.map = maps2
+
+        FioDic().save(self.map_file, self.map)
+        log.info("NVNMD: finish building mapping table")
+        return self.map
+
+# =====================================================================
+# build r2s
+# =====================================================================
+
+    def build_r2s(self, r2):
+        r"""Build the graph mapping squared distance ``r2`` to normalized ``s`` and ``sr``.
+
+        Returns two lists with one ``[-1, 1]`` tensor per type:
+        ``(vv / r - avg[tt, 0]) / std[tt, 0]`` and ``vv / r**2 / std[tt, 1]``,
+        where ``vv`` is the switching polynomial defined below.
+        """
+        # limit = nvnmd_cfg.dscp['rc_lim']
+        rmin = nvnmd_cfg.dscp['rcut_smth']
+        rmax = nvnmd_cfg.dscp['rcut']
+        # ntypex = nvnmd_cfg.dscp['ntypex']
+        ntype = nvnmd_cfg.dscp['ntype']
+        avg, std = get_normalize(nvnmd_cfg.weight)
+        avg, std = np.float32(avg), np.float32(std)
+        r = tf.sqrt(r2)
+        # r_ feeds the switching polynomial, r__ is the divisor (only capped at rmax)
+        r_ = tf.clip_by_value(r, rmin, rmax)
+        r__ = tf.clip_by_value(r, 0, rmax)
+        # uu runs from 0 at r <= rcut_smth to 1 at r >= rcut;
+        # the polynomial vv equals 1 at uu = 0 and 0 at uu = 1
+        uu = (r_ - rmin) / (rmax - rmin)
+        vv = uu * uu * uu * (-6 * uu * uu + 15 * uu - 10) + 1
+
+        sl = []
+        srl = []
+
+        for tt in range(ntype):
+            # r__ is 0 when r2 == 0, so these divisions are not finite there
+            s = vv / r__
+            sr = s / r__
+            s = tf.reshape(s, [-1, 1])
+            sr = tf.reshape(sr, [-1, 1])
+            # normalize with the per-type statistics; sr is only scaled, not shifted
+            s = (s - avg[tt, 0]) / std[tt, 0]
+            sr = sr / std[tt, 1]
+            sl.append(s)
+            srl.append(sr)
+        return sl, srl
+
+    def build_ds_dr(self, r2, s, sr):
+        r"""Build the gradients of every ``s[tt]`` and ``sr[tt]`` with respect to ``r2``.
+
+        Despite the ``_dr`` in the names, the derivatives are taken with
+        respect to the squared distance tensor ``r2``.
+        Returns two lists with one gradient tensor per type.
+        """
+        # ntypex = nvnmd_cfg.dscp['ntypex']
+        ntype = nvnmd_cfg.dscp['ntype']
+
+        ds_drl = []
+        dsr_drl = []
+        for tt in range(ntype):
+            si = s[tt]
+            sri = sr[tt]
+            # tf.gradients returns a list with one entry per differentiation variable
+            ds_dr = tf.gradients(si, r2)
+            dsr_dr = tf.gradients(sri, r2)
+            ds_drl.append(ds_dr[0])
+            dsr_drl.append(dsr_dr[0])
+        return ds_drl, dsr_drl
+
+    def build_r2s_r2ds(self):
+        r"""Create the ``r2`` placeholder and the graph for s, sr and their gradients.
+
+        Returns a dict with the keys ``r2``, ``s``, ``sr``, ``ds_dr2`` and
+        ``dsr_dr2`` (in that order).
+        """
+        r2 = tf.placeholder(tf.float32, [None, 1], 't_r2')
+        s, sr = self.build_r2s(r2)
+        ds_dr2, dsr_dr2 = self.build_ds_dr(r2, s, sr)
+        return {
+            'r2': r2,
+            's': s,
+            'sr': sr,
+            'ds_dr2': ds_dr2,
+            'dsr_dr2': dsr_dr2,
+        }
+
+    def run_u2s(self):
+        r"""Evaluate the tables of s, sr and their derivatives over ``r2``.
+
+        ``2**NBIT_FEA_X + 1`` values ``r2 = k / 2**NBIT_FEA_X_FL`` are fed to the
+        graph from ``build_r2s_r2ds``. The entry at ``r2 = 0``, where the graph
+        divides by zero, is overwritten with fixed values.
+
+        Returns
+        -------
+        dict
+            keys ``r2``, ``s``, ``sr``, ``ds_dr2`` and ``dsr_dr2``; all but
+            ``r2`` are lists with one array per type
+        """
+        ntype = nvnmd_cfg.dscp['ntype']
+        avg, std = get_normalize(nvnmd_cfg.weight)
+        avg, std = np.float32(avg), np.float32(std)
+        NBIT_FEA_X = nvnmd_cfg.nbit['NBIT_FEA_X']
+        NBIT_FEA_X_FL = nvnmd_cfg.nbit['NBIT_FEA_X_FL']
+
+        dic_ph = self.build_r2s_r2ds()
+
+        N = 2 ** NBIT_FEA_X
+        N2 = 2 ** NBIT_FEA_X_FL
+        # N+1 rather than N samples, for calculating the differences
+        r2 = 1.0 * np.arange(0, N + 1) / N2
+        r2 = np.reshape(r2, [-1, 1])
+        keys = 'r2,s,sr,ds_dr2,dsr_dr2'.split(',')
+        tlst = [dic_ph[k] for k in keys]
+
+        sess = get_sess()
+        try:
+            res = run_sess(sess, tlst, feed_dict={dic_ph['r2']: r2})
+        finally:
+            # release the session even when the evaluation fails
+            sess.close()
+        res2 = dict(zip(keys, res))
+
+        # fix the entry at r2 = 0: s becomes the normalized value of a raw 0,
+        # sr and both derivatives become 0
+        for tt in range(ntype):
+            res2['s'][tt][0] = -avg[tt, 0] / std[tt, 0]
+            res2['sr'][tt][0] = 0
+            res2['ds_dr2'][tt][0] = 0
+            res2['dsr_dr2'][tt][0] = 0
+
+        return res2
+# =====================================================================
+# build s2G
+# =====================================================================
+
+    def build_s2G(self, s):
+        r"""Build the embedding-net graph mapping ``s`` to G for every type pair.
+
+        The layer weights are read from ``nvnmd_cfg.weight`` and used as
+        ``float32`` constants. Returns a list with ``ntypex * ntype`` tensors,
+        ordered by ``tt * ntype + tt2``.
+        """
+        ntypex = nvnmd_cfg.dscp['ntypex']
+        ntype = nvnmd_cfg.dscp['ntype']
+
+        activation_fn = tf.tanh
+        outputs_size = nvnmd_cfg.dscp['NNODE_FEAS']
+
+        xyz_scatters = []
+        for tt in range(ntypex):
+            for tt2 in range(ntype):
+                xyz_scatter = s
+                # outputs_size[ll - 1] is the input width of layer ll
+                for ll in range(1, len(outputs_size)):
+                    w, b = get_filter_weight(nvnmd_cfg.weight, tt, tt2, ll)
+                    w, b = np.float32(w), np.float32(b)
+                    if outputs_size[ll] == outputs_size[ll - 1]:
+                        # same width: add the layer output to its input
+                        xyz_scatter += activation_fn(tf.matmul(xyz_scatter, w) + b)
+                    elif outputs_size[ll] == outputs_size[ll - 1] * 2:
+                        # doubled width: add the layer output to the input concatenated with itself
+                        xyz_scatter = tf.concat([xyz_scatter, xyz_scatter], 1) + activation_fn(tf.matmul(xyz_scatter, w) + b)
+                    else:
+                        # any other width: plain layer without shortcut
+                        xyz_scatter = activation_fn(tf.matmul(xyz_scatter, w) + b)
+                xyz_scatters.append(xyz_scatter)
+        return xyz_scatters
+
+    def build_dG_ds(self, G, s):
+        r"""Build the derivative of every column of every G tensor with respect to ``s``.
+
+        Returns a list ordered like ``G``; each entry concatenates the ``M1``
+        per-column gradients (each reshaped to ``[-1, 1]``) along axis 1.
+        """
+        ntypex = nvnmd_cfg.dscp['ntypex']
+        ntype = nvnmd_cfg.dscp['ntype']
+        M1 = nvnmd_cfg.dscp['M1']
+
+        dG_ds = []
+        for tt in range(ntypex):
+            for tt2 in range(ntype):
+                Gi = G[tt * ntype + tt2]
+                si = s
+
+                dG_ds_i = []
+                # one tf.gradients call per output column
+                for ii in range(M1):
+                    dG_ds_ii = tf.reshape(tf.gradients(Gi[:, ii], si), [-1, 1])
+                    dG_ds_i.append(dG_ds_ii)
+                dG_ds_i = tf.concat(dG_ds_i, axis=1)
+                dG_ds.append(dG_ds_i)
+        return dG_ds
+
+    def build_s2G_s2dG(self):
+        r"""Create the ``s2`` placeholder and the graph for G and its derivative.
+
+        Returns a dict with the keys ``s2``, ``G`` and ``dG_ds`` (in that order).
+        """
+        s2 = tf.placeholder(tf.float32, [None, 1], 't_s')
+        G = self.build_s2G(s2)
+        dG_ds = self.build_dG_ds(G, s2)
+        return {'s2': s2, 'G': G, 'dG_ds': dG_ds}
+
+    def run_s2G(self, dat):
+        r"""Evaluate the tables of G and its derivative over ``s``.
+
+        ``2**NBIT_FEA_X + 1`` values ``s = -2.0 + k / 2**NBIT_FEA_X2_FL`` are fed
+        to the graph from ``build_s2G_s2dG``. A warning is logged when the range
+        reported by ``get_rng_s`` leaves ``[-2.0, 14.0]``.
+
+        Parameters
+        ----------
+        dat
+            not used by this method; kept so that existing callers keep working
+
+        Returns
+        -------
+        dict
+            keys ``s2`` (the sample points), ``G`` and ``dG_ds``
+            (lists with one array per type pair)
+        """
+        NBIT_FEA_X = nvnmd_cfg.nbit['NBIT_FEA_X']
+        NBIT_FEA_X2_FL = nvnmd_cfg.nbit['NBIT_FEA_X2_FL']
+
+        dic_ph = self.build_s2G_s2dG()
+
+        N = 2 ** NBIT_FEA_X
+        N2 = 2 ** NBIT_FEA_X2_FL
+        s_min, s_max = get_rng_s(nvnmd_cfg.weight)
+        if (s_min < -2.0) or (s_max > 14.0):
+            log.warning(f"the range of s [{s_min}, {s_max}] is over the limit [-2.0, 14.0]")
+        # the table always starts at -2.0, whatever range was found above
+        s_min = -2.0
+        # N+1 rather than N samples, for calculating the differences
+        s = s_min + np.arange(0, N + 1) / N2
+        s = np.reshape(s, [-1, 1])
+
+        keys = 's2,G,dG_ds'.split(',')
+        tlst = [dic_ph[k] for k in keys]
+        sess = get_sess()
+        try:
+            res = run_sess(sess, tlst, feed_dict={dic_ph['s2']: s})
+        finally:
+            # release the session even when the evaluation fails
+            sess.close()
+        return dict(zip(keys, res))
+
+
+def mapt(
+    *,
+    nvnmd_config: Optional[str] = 'nvnmd/config.npy',
+    nvnmd_weight: Optional[str] = 'nvnmd/weight.npy',
+    nvnmd_map: Optional[str] = 'nvnmd/map.npy',
+    **kwargs
+):
+    r"""Build the NVNMD mapping tables and write them to ``nvnmd_map``.
+
+    ``nvnmd_config`` and ``nvnmd_weight`` name the input files handed to
+    ``MapTable``; extra keyword arguments are accepted and ignored.
+    """
+    MapTable(nvnmd_config, nvnmd_weight, nvnmd_map).build_map()
diff --git a/deepmd/nvnmd/entrypoints/train.py b/deepmd/nvnmd/entrypoints/train.py
new file mode 100644
index 0000000000..8c9ffbf2f0
--- /dev/null
+++ b/deepmd/nvnmd/entrypoints/train.py
@@ -0,0 +1,181 @@
+
+import os
+import logging
+
+from deepmd.env import tf
+from deepmd.entrypoints.train import train
+from deepmd.entrypoints.freeze import freeze
+from deepmd.nvnmd.entrypoints.mapt import mapt
+from deepmd.nvnmd.entrypoints.wrap import wrap
+
+from deepmd.nvnmd.utils.fio import FioDic
+from deepmd.nvnmd.utils.config import nvnmd_cfg
+from deepmd.nvnmd.data.data import jdata_deepmd_input
+
+log = logging.getLogger(__name__)
+
+# Template of the keyword arguments passed to ``train``; train_nvnmd copies it
+# and overrides "INPUT" before each call.
+jdata_cmd_train = {
+    "INPUT": "train.json",
+    "init_model": None,
+    "restart": None,
+    "output": "out.json",
+    "init_frz_model": None,
+    "mpi_log": "master",
+    "log_level": 2,
+    "log_path": None,
+    "is_compress": False
+}
+
+# Template of the keyword arguments passed to ``freeze``; train_nvnmd copies it
+# and overrides "checkpoint_folder", "output" and "nvnmd_weight".
+jdata_cmd_freeze = {
+    "checkpoint_folder": '.',
+    "output": 'frozen_model.pb',
+    "node_names": None,
+    "nvnmd_weight": "nvnmd/weight.npy"
+}
+
+
+def replace_path(p, p2):
+    r"""Replace the directory that directly contains the last component of ``p`` by ``p2``.
+
+    For example ``nvnmd_cnn/lcurve.out`` becomes ``{p2}/lcurve.out``.
+    Absolute paths stay absolute.
+
+    Raises
+    ------
+    IndexError
+        if ``p`` has no directory component to replace
+    """
+    head, tail = os.path.split(p)
+    if not head:
+        raise IndexError(f"path {p!r} has no directory component to replace")
+    # os.path.split keeps the root in `parent`, so a leading separator survives
+    parent = os.path.dirname(head)
+    return os.path.join(parent, p2, tail)
+
+
+def add_path(p, p2):
+    r"""Insert the directory ``p2`` directly before the last component of ``p``.
+
+    For example ``lcurve.out`` becomes ``{p2}/lcurve.out`` and ``out/lcurve.out``
+    becomes ``out/{p2}/lcurve.out``. Absolute paths stay absolute.
+    """
+    # os.path.split handles the platform separator and keeps a leading root in `head`
+    head, tail = os.path.split(p)
+    return os.path.join(head, p2, tail)
+
+
+def normalized_input(fn, PATH_CNN):
+    r"""Normalize an input script file for the continuous neural network (CNN) step
+
+    Loads ``fn``, initializes ``nvnmd_cfg`` in ``'cnn'`` train mode, redirects
+    ``disp_file`` and ``save_ckpt`` of the training section into ``PATH_CNN``
+    and returns the loaded data with its ``model`` and ``nvnmd`` sections
+    replaced by the ones generated by ``nvnmd_cfg``.
+    """
+    f = FioDic()
+    jdata = f.load(fn, jdata_deepmd_input)
+    # nvnmd
+    # NOTE(review): jdata_nvnmd is the entry of the imported template itself,
+    # so setting 'enable' also changes jdata_deepmd_input -- confirm intended
+    jdata_nvnmd = jdata_deepmd_input['nvnmd']
+    jdata_nvnmd['enable'] = True
+    jdata_nvnmd_ = f.get(jdata, 'nvnmd', jdata_nvnmd)
+    jdata_nvnmd = f.update(jdata_nvnmd_, jdata_nvnmd)
+    # model
+    # only sel, rcut and rcut_smth are taken from the nvnmd section; seeds are fixed to 1
+    jdata_model = {
+        "descriptor": {
+            "seed": 1,
+            "sel": jdata_nvnmd_["sel"],
+            "rcut": jdata_nvnmd_['rcut'],
+            "rcut_smth": jdata_nvnmd_['rcut_smth']
+        },
+        "fitting_net": {
+            "seed": 1
+        }}
+    nvnmd_cfg.init_from_jdata(jdata_nvnmd)
+    nvnmd_cfg.init_from_deepmd_input(jdata_model)
+    nvnmd_cfg.init_train_mode('cnn')
+    # training
+    # NOTE(review): 'disp_file' and 'save_ckpt' are read unconditionally below;
+    # presumably f.load fills them in from the defaults -- verify
+    jdata_train = f.get(jdata, 'training', {})
+    jdata_train['disp_training'] = True
+    jdata_train['time_training'] = True
+    jdata_train['profiling'] = False
+    jdata_train['disp_file'] = add_path(jdata_train['disp_file'], PATH_CNN)
+    jdata_train['save_ckpt'] = add_path(jdata_train['save_ckpt'], PATH_CNN)
+    # replace the model and nvnmd sections by the ones derived from nvnmd_cfg
+    jdata['model'] = nvnmd_cfg.get_model_jdata()
+    jdata['nvnmd'] = nvnmd_cfg.get_nvnmd_jdata()
+    return jdata
+
+
+def normalized_input_qnn(jdata, PATH_QNN, CONFIG_CNN, WEIGHT_CNN, MAP_CNN):
+    r"""Normalize an input script file for the quantized neural network (QNN) step
+
+    Re-initializes ``nvnmd_cfg`` in ``'qnn'`` train mode from the config,
+    weight and map files of the CNN step, stores the resulting ``nvnmd``
+    section in ``jdata`` and redirects ``disp_file`` and ``save_ckpt`` of the
+    training section to ``PATH_QNN``. ``jdata`` is modified in place and returned.
+    """
+    # NOTE(review): this is the entry of the imported template itself, not a
+    # copy, so the assignments below also change jdata_deepmd_input -- confirm intended
+    jdata_nvnmd = jdata_deepmd_input['nvnmd']
+    jdata_nvnmd['enable'] = True
+    jdata_nvnmd['config_file'] = CONFIG_CNN
+    jdata_nvnmd['weight_file'] = WEIGHT_CNN
+    jdata_nvnmd['map_file'] = MAP_CNN
+    nvnmd_cfg.init_from_jdata(jdata_nvnmd)
+    nvnmd_cfg.init_train_mode('qnn')
+    jdata['nvnmd'] = nvnmd_cfg.get_nvnmd_jdata()
+    # training
+    # replace_path swaps the directory that contains each file for PATH_QNN,
+    # so both paths must already include a directory component
+    jdata2 = jdata['training']
+    jdata2['disp_file'] = replace_path(jdata2['disp_file'], PATH_QNN)
+    jdata2['save_ckpt'] = replace_path(jdata2['save_ckpt'], PATH_QNN)
+    jdata['training'] = jdata2
+    return jdata
+
+
+def _train_and_freeze(input_file, checkpoint_folder, frozen_model, weight_file):
+    r"""Train with ``input_file``, then freeze the model found in ``checkpoint_folder``.
+
+    The default graph is reset after each of the two stages.
+    """
+    jdata = jdata_cmd_train.copy()
+    jdata['INPUT'] = input_file
+    train(**jdata)
+    tf.reset_default_graph()
+
+    jdata = jdata_cmd_freeze.copy()
+    jdata['checkpoint_folder'] = checkpoint_folder
+    jdata['output'] = frozen_model
+    jdata['nvnmd_weight'] = weight_file
+    freeze(**jdata)
+    tf.reset_default_graph()
+
+
+def train_nvnmd(
+    *,
+    INPUT: str,
+    step: str,
+    **kwargs,
+):
+    r"""Run one step of the two-step NVNMD training.
+
+    Parameters
+    ----------
+    INPUT
+        file name of the input script; a warning is logged if it does not exist
+    step
+        ``"s1"``: train and freeze the continuous network in ``nvnmd_cnn`` and
+        build the mapping table;
+        ``"s2"``: train and freeze the quantized network in ``nvnmd_qnn`` using
+        the files produced by ``"s1"`` and wrap the model file;
+        any other value only logs a warning
+    **kwargs
+        accepted and ignored
+    """
+    # test input
+    if not os.path.exists(INPUT):
+        log.warning("The input script %s does not exist", INPUT)
+    # files of STEP1 (the config, weight and map files are also inputs of STEP2)
+    PATH_CNN = 'nvnmd_cnn'
+    CONFIG_CNN = os.path.join(PATH_CNN, 'config.npy')
+    INPUT_CNN = os.path.join(PATH_CNN, 'train.json')
+    WEIGHT_CNN = os.path.join(PATH_CNN, 'weight.npy')
+    FRZ_MODEL_CNN = os.path.join(PATH_CNN, 'frozen_model.pb')
+    MAP_CNN = os.path.join(PATH_CNN, 'map.npy')
+    # files of STEP2
+    PATH_QNN = 'nvnmd_qnn'
+    CONFIG_QNN = os.path.join(PATH_QNN, 'config.npy')
+    INPUT_QNN = os.path.join(PATH_QNN, 'train.json')
+    WEIGHT_QNN = os.path.join(PATH_QNN, 'weight.npy')
+    FRZ_MODEL_QNN = os.path.join(PATH_QNN, 'frozen_model.pb')
+    MODEL_QNN = os.path.join(PATH_QNN, 'model.pb')
+
+    if step == "s1":
+        # normalize input file
+        jdata = normalized_input(INPUT, PATH_CNN)
+        FioDic().save(INPUT_CNN, jdata)
+        nvnmd_cfg.save(CONFIG_CNN)
+        # train and freeze cnn
+        _train_and_freeze(INPUT_CNN, PATH_CNN, FRZ_MODEL_CNN, WEIGHT_CNN)
+        # map table
+        mapt(
+            nvnmd_config=CONFIG_CNN,
+            nvnmd_weight=WEIGHT_CNN,
+            nvnmd_map=MAP_CNN)
+        tf.reset_default_graph()
+    elif step == "s2":
+        # normalize input file
+        jdata = normalized_input(INPUT, PATH_CNN)
+        jdata = normalized_input_qnn(jdata, PATH_QNN, CONFIG_CNN, WEIGHT_CNN, MAP_CNN)
+        FioDic().save(INPUT_QNN, jdata)
+        nvnmd_cfg.save(CONFIG_QNN)
+        # train and freeze qnn
+        _train_and_freeze(INPUT_QNN, PATH_QNN, FRZ_MODEL_QNN, WEIGHT_QNN)
+        # wrap
+        wrap(
+            nvnmd_config=CONFIG_QNN,
+            nvnmd_weight=WEIGHT_QNN,
+            nvnmd_map=MAP_CNN,
+            nvnmd_model=MODEL_QNN)
+        tf.reset_default_graph()
+    else:
+        # previously an unrecognised step was silently ignored
+        log.warning("NVNMD: unknown training step %r, expected 's1' or 's2'", step)
diff --git a/deepmd/nvnmd/entrypoints/wrap.py b/deepmd/nvnmd/entrypoints/wrap.py
new file mode 100644
index 0000000000..2e38c0d16d
--- /dev/null
+++ b/deepmd/nvnmd/entrypoints/wrap.py
@@ -0,0 +1,390 @@
+
+import numpy as np
+import logging
+
+from deepmd.nvnmd.utils.fio import FioBin, FioTxt
+from deepmd.nvnmd.utils.config import nvnmd_cfg
+from deepmd.nvnmd.utils.weight import get_fitnet_weight
+from deepmd.nvnmd.utils.encode import Encode
+from deepmd.nvnmd.utils.op import map_nvnmd
+
+from deepmd.nvnmd.data.data import jdata_deepmd_input, jdata_sys
+from typing import List, Optional
+
+log = logging.getLogger(__name__)
+
+
+class Wrap():
+    r"""Generate the binary model file (model.pb).
+    The model file can be used to run NVNMD with LAMMPS;
+    the pair style needs to be set as:
+
+    | :code:`pair_style nvnmd model.pb`
+    | :code:`pair_coeff * *`
+
+    Parameters
+    ----------
+    config_file
+        input file name
+        an .npy file containing the configuration information of NVNMD model
+    weight_file
+        input file name
+        an .npy file containing the weights of NVNMD model
+    map_file
+        input file name
+        an .npy file containing the mapping tables of NVNMD model
+    model_file
+        output file name
+        a .pb file containing the model used in NVNMD
+
+    References
+    ----------
+    DOI: 10.1038/s41524-022-00773-z
+    """
+
+    def __init__(
+        self,
+        config_file: str,
+        weight_file: str,
+        map_file: str,
+        model_file: str
+    ):
+        r"""Store the file names and initialize ``nvnmd_cfg`` from the input files.
+
+        The model file is not written here; call ``wrap``.
+        """
+        self.config_file = config_file
+        self.weight_file = weight_file
+        self.map_file = map_file
+        self.model_file = model_file
+
+        # NOTE(review): this is the entry of the imported template itself, not a
+        # copy, so the assignments below also change jdata_deepmd_input['nvnmd']
+        # for later users -- confirm this is intended
+        jdata = jdata_deepmd_input['nvnmd']
+        jdata['config_file'] = config_file
+        jdata['weight_file'] = weight_file
+        jdata['map_file'] = map_file
+        jdata['enable'] = True
+
+        nvnmd_cfg.init_from_jdata(jdata)
+
+    def wrap(self):
+        r"""Encode all model sections and write them to ``self.model_file``.
+
+        The bit strings from ``wrap_dscp``, ``wrap_fitn`` and ``wrap_map`` are
+        converted to hex rows. The row counts ``NCFG``, ``NNET`` and ``NFEA``
+        are stored in ``nvnmd_cfg.nbit`` and the configuration is saved again.
+        The file content is the head from ``wrap_head`` followed by the
+        sections in the order cfg, fps, bps, fea, gra.
+        """
+        dscp = nvnmd_cfg.dscp
+        ctrl = nvnmd_cfg.ctrl
+
+        # NOTE(review): M1 is read but not used in this method
+        M1 = dscp['M1']
+        ntype = dscp['ntype']
+        ntype_max = dscp['ntype_max']
+        NSTDM_M1X = ctrl['NSTDM_M1X']
+        e = Encode()
+
+        bcfg = self.wrap_dscp()
+        bfps, bbps = self.wrap_fitn()
+        bfea, bgra = self.wrap_map()
+
+        # split data with {nbit} bits per row
+        hcfg = e.bin2hex(e.split_bin(bcfg, 72))
+        # the length of hcfg needs to be a multiple of NSTDM_M1X
+        hcfg = e.extend_list(hcfg, int(np.ceil(len(hcfg) / NSTDM_M1X)) * NSTDM_M1X)
+
+        hfps = e.bin2hex(e.split_bin(bfps, 72))
+        # hfps = e.extend_list(hfps, (len(hfps) // ntype) * ntype_max)
+
+        hbps = e.bin2hex(e.split_bin(bbps, 72))
+        # hbps = e.extend_list(hbps, (len(hbps) // ntype) * ntype_max)
+
+        # split into multiple rows
+        bfea = e.split_bin(bfea, len(bfea[0]) // NSTDM_M1X)
+        # bfea = e.reverse_bin(bfea, NSTDM_M1X)
+        # extend the number of lines
+        hfea = e.bin2hex(bfea)
+        hfea = e.extend_list(hfea, (len(hfea) // ntype) * ntype_max)
+
+        # split into multiple rows
+        bgra = e.split_bin(bgra, len(bgra[0]) // NSTDM_M1X)
+        # bgra = e.reverse_bin(bgra, NSTDM_M1X)
+        # extend the number of lines
+        hgra = e.bin2hex(bgra)
+        hgra = e.extend_list(hgra, (len(hgra) // ntype) * ntype_max)
+
+        # extend data according to the number of bits per row of BRAM
+        nhex = 512
+        hcfg = e.extend_hex(hcfg, nhex)
+        hfps = e.extend_hex(hfps, nhex)
+        hbps = e.extend_hex(hbps, nhex)
+        hfea = e.extend_hex(hfea, nhex)
+        hgra = e.extend_hex(hgra, nhex)
+
+        # DEVELOP_DEBUG
+        # in debug mode, log the section lengths and dump every section as text
+        if jdata_sys['debug']:
+            log.info("len(hcfg): %d" % (len(hcfg)))
+            log.info("len(hfps): %d" % (len(hfps)))
+            log.info("len(hbps): %d" % (len(hbps)))
+            log.info("len(hfea): %d" % (len(hfea)))
+            log.info("len(hgra): %d" % (len(hgra)))
+            #
+            FioTxt().save('nvnmd/wrap/hcfg.txt', hcfg)
+            FioTxt().save('nvnmd/wrap/hfps.txt', hfps)
+            FioTxt().save('nvnmd/wrap/hbps.txt', hbps)
+            FioTxt().save('nvnmd/wrap/hfea.txt', hfea)
+            FioTxt().save('nvnmd/wrap/hgra.txt', hgra)
+        # record the row counts in the configuration and save it before building the head
+        NCFG = len(hcfg)
+        NNET = len(hfps)
+        NFEA = len(hfea)
+        nvnmd_cfg.nbit['NCFG'] = NCFG
+        nvnmd_cfg.nbit['NNET'] = NNET
+        nvnmd_cfg.nbit['NFEA'] = NFEA
+        nvnmd_cfg.save(nvnmd_cfg.config_file)
+        head = self.wrap_head(NCFG, NNET, NFEA)
+        # concatenate head and sections; `[] + head` creates a new list so head is not modified
+        hs = [] + head
+        hs.extend(hcfg)
+        hs.extend(hfps)
+        hs.extend(hbps)
+        hs.extend(hfea)
+        hs.extend(hgra)
+
+        FioBin().save(self.model_file, hs)
+        log.info("NVNMD: finish wrapping model file")
+
+    def wrap_head(self, NCFG, NNET, NFEA):
+        r"""Encode the model head: the three row counts and the cutoff radius.
+
+        Every field is encoded with ``NBTI_MODEL_HEAD`` bits and prepended to
+        the bit string, so the string reads RCUT, NFEA, NNET, NCFG from left
+        to right. Returns the result of ``bin2hex`` extended with
+        ``extend_hex(..., 512)``.
+        """
+        nbit = nvnmd_cfg.nbit
+        NBTI_MODEL_HEAD = nbit['NBTI_MODEL_HEAD']
+        NBIT_DATA_FL = nbit['NBIT_DATA_FL']
+        rcut = nvnmd_cfg.dscp['rcut']
+
+        bs = ''
+        e = Encode()
+        # nline
+        bs = e.dec2bin(NCFG, NBTI_MODEL_HEAD)[0] + bs
+        bs = e.dec2bin(NNET, NBTI_MODEL_HEAD)[0] + bs
+        bs = e.dec2bin(NFEA, NBTI_MODEL_HEAD)[0] + bs
+        # dscp
+        # presumably e.qr converts rcut to fixed point with NBIT_DATA_FL fractional bits -- verify in Encode
+        RCUT = e.qr(rcut, NBIT_DATA_FL)
+        bs = e.dec2bin(RCUT, NBTI_MODEL_HEAD)[0] + bs
+        # extend
+        hs = e.bin2hex(bs)
+        nhex = 512
+        hs = e.extend_hex(hs, nhex)
+        return hs
+
+    def wrap_dscp(self):
+        r"""Wrap the configuration of descriptor
+
+        Fields are prepended to the bit string in this order: ``SEL[0..3]``,
+        ``log2(NIX)``, then the ``s * G`` values for all ``ntype_max x ntype_max``
+        type pairs (zeros for pairs beyond ``ntype``). Returns the bit string.
+        """
+        dscp = nvnmd_cfg.dscp
+        nbit = nvnmd_cfg.nbit
+        maps = nvnmd_cfg.map
+        # NOTE(review): NBIT_FEA_X and NBIT_FEA_X_FL are read but not used below
+        NBIT_FEA_X = nbit['NBIT_FEA_X']
+        NBIT_FEA_X_FL = nbit['NBIT_FEA_X_FL']
+        NBIT_FEA_X2_FL = nbit['NBIT_FEA_X2_FL']
+        NBIT_FEA_FL = nbit['NBIT_FEA_FL']
+        NBIT_LST = nbit['NBIT_LST']
+        NBIT_SHIFT = nbit['NBIT_SHIFT']
+
+        bs = ''
+        e = Encode()
+        # sel
+        # exactly four entries of SEL are encoded
+        SEL = dscp['SEL']
+        bs = e.dec2bin(SEL[0], NBIT_LST)[0] + bs
+        bs = e.dec2bin(SEL[1], NBIT_LST)[0] + bs
+        bs = e.dec2bin(SEL[2], NBIT_LST)[0] + bs
+        bs = e.dec2bin(SEL[3], NBIT_LST)[0] + bs
+        # NIX is stored as its base-2 logarithm, truncated to an integer
+        NIX = dscp['NIX']
+        ln2_NIX = int(np.log2(NIX))
+        bs = e.dec2bin(ln2_NIX, NBIT_SHIFT)[0] + bs
+        # G*s
+        # ntypex = dscp['ntypex']
+        ntype = dscp['ntype']
+        # ntypex_max = dscp['ntypex_max']
+        ntype_max = dscp['ntype_max']
+        M1 = dscp['M1']
+        GSs = []
+        for tt in range(ntype_max):
+            for tt2 in range(ntype_max):
+                if (tt < ntype) and (tt2 < ntype):
+                    # first row of the value table of s for type tt
+                    s = maps[f's_t{0}_t{tt}'][0][0]
+                    s = e.qf(s, NBIT_FEA_FL) / (2**NBIT_FEA_FL)
+                    # same table origin as used when the s -> G table was built
+                    s_min = -2.0
+                    yk, dyk = maps[f'G_t{0}_t{tt2}']
+                    prec = 1 / (2 ** NBIT_FEA_X2_FL)
+                    # look up G at that s in the value/difference table for type tt2
+                    G = map_nvnmd(s - s_min, yk, dyk / prec, prec)
+                    G = e.qf(G, NBIT_FEA_FL) / (2**NBIT_FEA_FL)
+                    v = s * G
+                else:
+                    # type pairs beyond ntype are filled with zeros
+                    v = np.zeros(M1)
+                for ii in range(M1):
+                    GSs.extend(e.dec2bin(e.qr(v[ii], 2 * NBIT_FEA_FL), 27, True))
+        # reverse so that the first encoded value ends up rightmost in the bit string
+        sGSs = ''.join(GSs[::-1])
+        bs = sGSs + bs
+        return bs
+
+    def wrap_fitn(self):
+        r"""Wrap the weights and biases of fitting net as binary strings
+        returns (bfps, bbps): two lists holding one string per value of range(NSEL)"""
+        dscp = nvnmd_cfg.dscp
+        fitn = nvnmd_cfg.fitn
+        weight = nvnmd_cfg.weight
+        nbit = nvnmd_cfg.nbit
+        ctrl = nvnmd_cfg.ctrl
+
+        ntype = dscp['ntype']
+        ntype_max = dscp['ntype_max']
+        nlayer_fit = fitn['nlayer_fit']
+        NNODE_FITS = fitn['NNODE_FITS']
+        NBIT_SUM = nbit['NBIT_SUM']
+        NBIT_DATA_FL = nbit['NBIT_DATA_FL']
+        NBIT_WEIGHT = nbit['NBIT_WEIGHT']
+        NBIT_WEIGHT_FL = nbit['NBIT_WEIGHT_FL']
+        NBIT_SPE = nbit['NBIT_SPE']  # read but not used below
+        NSTDM = ctrl['NSTDM']
+        NSEL = ctrl['NSEL']
+
+        # encode all parameters: bb[ll][tt] / bw[ll][tt] hold the binary strings of layer ll, type tt
+        bb, bw = [], []
+        for ll in range(nlayer_fit):
+            bbt, bwt = [], []
+            for tt in range(ntype_max):
+                # get parameters: weight and bias
+                if (tt < ntype):
+                    w, b = get_fitnet_weight(weight, tt, ll, nlayer_fit)
+                else:
+                    w, b = get_fitnet_weight(weight, 0, ll, nlayer_fit)  # unused type slot: type-0 parameters ...
+                    w = w * 0  # ... multiplied by zero
+                    b = b * 0
+                # restrict the shift value of energy: the bias of the last layer is zeroed
+                if (ll == (nlayer_fit - 1)):
+                    b = b * 0
+                bbi = self.wrap_bias(b, NBIT_SUM, NBIT_DATA_FL)
+                bwi = self.wrap_weight(w, NBIT_WEIGHT, NBIT_WEIGHT_FL)
+                bbt.append(bbi)
+                bwt.append(bwi)
+            bb.append(bbt)
+            bw.append(bwt)
+        # build one `fp` string and one `bp` string for every index ss in range(NSEL)
+        bfps, bbps = [], []
+        for ss in range(NSEL):
+            tt = ss // NSTDM  # type index into bb / bw
+            sc = ss % NSTDM  # slice index applied to columns and biases
+            sr = ss % NSTDM  # slice index applied to rows (same value as sc)
+            bfp, bbp = '', ''
+            for ll in range(nlayer_fit):
+                nr = NNODE_FITS[ll]
+                nc = NNODE_FITS[ll + 1]
+                nrs = int(np.ceil(nr / NSTDM))  # rows per slice
+                ncs = int(np.ceil(nc / NSTDM))  # columns per slice
+                if (nc == 1):
+                    # final layer: the fp and bp parts below are built by identical code
+                    # fp #
+                    bi = [bw[ll][tt][sr * nrs + rr][cc] for rr in range(nrs) for cc in range(nc)]
+                    bi.reverse()
+                    bfp = ''.join(bi) + bfp  # prepended: later pieces end up at the front
+                    #
+                    bi = [bb[ll][tt][sc * ncs * 0 + cc] for cc in range(ncs)]  # `* 0` cancels the offset: index is cc
+                    bi.reverse()
+                    bfp = ''.join(bi) + bfp
+                    # bp #
+                    bi = [bw[ll][tt][sr * nrs + rr][cc] for rr in range(nrs) for cc in range(nc)]
+                    bi.reverse()
+                    bbp = ''.join(bi) + bbp
+                    #
+                    bi = [bb[ll][tt][sc * ncs * 0 + cc] for cc in range(ncs)]  # `* 0` cancels the offset: index is cc
+                    bi.reverse()
+                    bbp = ''.join(bi) + bbp
+                else:
+                    # fp: all rows of the columns belonging to slice sc #
+                    bi = [bw[ll][tt][rr][sc * ncs + cc] for cc in range(ncs) for rr in range(nr)]
+                    bi.reverse()
+                    bfp = ''.join(bi) + bfp
+                    #
+                    bi = [bb[ll][tt][sc * ncs + cc] for cc in range(ncs)]
+                    bi.reverse()
+                    bfp = ''.join(bi) + bfp
+                    # bp: all columns of the rows belonging to slice sr #
+                    bi = [bw[ll][tt][sr * nrs + rr][cc] for rr in range(nrs) for cc in range(nc)]
+                    bi.reverse()
+                    bbp = ''.join(bi) + bbp
+                    #
+                    bi = [bb[ll][tt][sc * ncs + cc] for cc in range(ncs)]
+                    bi.reverse()
+                    bbp = ''.join(bi) + bbp
+            bfps.append(bfp)
+            bbps.append(bbp)
+        return bfps, bbps
+
+    def wrap_bias(self, bias, NBIT_SUM, NBIT_DATA_FL):
+        r"""Quantize `bias` (round, NBIT_DATA_FL bits) and encode it as signed NBIT_SUM-bit strings
+        """
+        encoder = Encode()
+        return encoder.dec2bin(encoder.qr(bias, NBIT_DATA_FL), NBIT_SUM, True)
+
+    def wrap_weight(self, weight, NBIT_WEIGHT, NBIT_WEIGHT_FL):
+        r"""Quantize a 2D `weight` and encode it as rows of signed NBIT_WEIGHT-bit strings
+        """
+        n_row, n_col = weight.shape[0], weight.shape[1]
+        encoder = Encode()
+        flat = encoder.dec2bin(encoder.qr(weight, NBIT_WEIGHT_FL), NBIT_WEIGHT, True)
+        # dec2bin returns one flat list; regroup it into n_row lists of n_col strings
+        return [flat[n_col * row:n_col * (row + 1)] for row in range(n_row)]
+
+    def wrap_map(self):
+        r"""Wrap the mapping table of embedding network
+        returns (bfea, bgra): the merged binary strings of the 'fea' and 'gra' tables"""
+        dscp = nvnmd_cfg.dscp
+        maps = nvnmd_cfg.map
+        nbit = nvnmd_cfg.nbit
+
+        M1 = dscp['M1']
+        ntype = dscp['ntype']
+        NBIT_FEA = nbit['NBIT_FEA']
+        NBIT_FEA_FL = nbit['NBIT_FEA_FL']
+
+        keys = 's,sr,G'.split(',')  # tables encoded into bfea
+        keys2 = 'ds_dr2,dsr_dr2,dG_ds'.split(',')  # tables encoded into bgra
+
+        e = Encode()
+
+        datas = {}
+        datas2 = {}
+        idxs = [[0, tt] for tt in range(ntype)]  # only the tables named `<key>_t0_t<tt>` are read
+        for ii in range(len(idxs)):
+            tt, tt2 = idxs[ii]
+            postfix = f'_t{tt}_t{tt2}'
+            for key in (keys + keys2):
+                if ii == 0:  # first pair: create the per-key lists
+                    datas[key] = []
+                    datas2[key] = []
+                datas[key].append(maps[key + postfix][0])  # v
+                datas2[key].append(maps[key + postfix][1])  # dv
+
+        for key in (keys + keys2):  # stack the per-type tables, then quantize (round) with NBIT_FEA_FL
+            datas[key] = np.vstack(datas[key])
+            datas[key] = e.qr(datas[key], NBIT_FEA_FL)
+
+            datas2[key] = np.vstack(datas2[key])
+            datas2[key] = e.qr(datas2[key], NBIT_FEA_FL)
+        # fea: values then delta values of s, sr, G; idx = [0, M1+2, 1, M1+3, ...] interleaves the two column halves
+        dat = [datas[key] for key in keys] + [datas2[key] for key in keys]
+        idx = np.int32(np.arange(0, int((M1 + 2) * 2)).reshape([2, -1]).transpose().reshape(-1))
+        dat = np.hstack(dat)
+        dat = dat[:, ::-1]
+        dat = dat[:, idx]  # data consists of value and delta_value
+        bs = e.dec2bin(dat, NBIT_FEA, True, 'fea')
+        bs = e.merge_bin(bs, (M1 + 2) * 2)
+        bfea = bs
+        # gra: same column reversal and interleaving, applied to ds_dr2, dsr_dr2, dG_ds
+        dat = [datas[key] for key in keys2] + [datas2[key] for key in keys2]
+        dat = np.hstack(dat)
+        dat = dat[:, ::-1]
+        dat = dat[:, idx]
+        bs = e.dec2bin(dat, NBIT_FEA, True, 'gra')
+        bs = e.merge_bin(bs, (M1 + 2) * 2)
+        bgra = bs
+        return bfea, bgra
+
+
+def wrap(
+    *,
+    nvnmd_config: Optional[str] = 'nvnmd/config.npy',
+    nvnmd_weight: Optional[str] = 'nvnmd/weight.npy',
+    nvnmd_map: Optional[str] = 'nvnmd/map.npy',
+    nvnmd_model: Optional[str] = 'nvnmd/model.pb',
+    **kwargs
+):
+    r"""Build a `Wrap` object from the four arguments and run it; extra keyword arguments are ignored"""
+    Wrap(nvnmd_config, nvnmd_weight, nvnmd_map, nvnmd_model).wrap()
diff --git a/deepmd/nvnmd/fit/__init__.py b/deepmd/nvnmd/fit/__init__.py
new file mode 100644
index 0000000000..4d7e88e30d
--- /dev/null
+++ b/deepmd/nvnmd/fit/__init__.py
@@ -0,0 +1,9 @@
+"""
+nvnmd.fit
+=========
+
+Provides
+    1. continuous fitting network
+    2. quantized fitting network
+
+"""
\ No newline at end of file
diff --git a/deepmd/nvnmd/fit/ener.py b/deepmd/nvnmd/fit/ener.py
new file mode 100644
index 0000000000..31bcab7588
--- /dev/null
+++ b/deepmd/nvnmd/fit/ener.py
@@ -0,0 +1,5 @@
+
+from deepmd.env import tf
+from deepmd.env import GLOBAL_TF_FLOAT_PRECISION
+from deepmd.nvnmd.utils.config import nvnmd_cfg
+from deepmd.nvnmd.utils.network import one_layer as one_layer_nvnmd
diff --git a/deepmd/nvnmd/utils/__init__.py b/deepmd/nvnmd/utils/__init__.py
new file mode 100644
index 0000000000..f888413ad1
--- /dev/null
+++ b/deepmd/nvnmd/utils/__init__.py
@@ -0,0 +1,21 @@
+
+from .argcheck import nvnmd_args
+from .config import nvnmd_cfg
+from .encode import Encode
+from .fio import FioBin, FioDic, FioTxt
+from .network import one_layer
+from .op import map_nvnmd
+from .weight import get_filter_weight, get_fitnet_weight
+
+__all__ = [
+    "nvnmd_args",
+    "nvnmd_cfg",
+    "Encode",
+    "FioBin",
+    "FioDic",
+    "FioTxt",
+    "one_layer",
+    "map_nvnmd",
+    "get_filter_weight",
+    "get_fitnet_weight",
+]
diff --git a/deepmd/nvnmd/utils/argcheck.py b/deepmd/nvnmd/utils/argcheck.py
new file mode 100644
index 0000000000..7903ffd361
--- /dev/null
+++ b/deepmd/nvnmd/utils/argcheck.py
@@ -0,0 +1,29 @@
+
+
+from dargs import Argument
+
+
+def nvnmd_args():
+    r"""Build the dargs `Argument` that describes the `nvnmd` section of the input script"""
+    doc_net_size_file = "configures the number of nodes of fitting_net; it can only be set to 128"
+    doc_map_file = "A file containing the mapping tables to replace the calculation of embedding nets"
+    doc_config_file = "A file containing the parameters about how to implement the model in certain hardware"
+    doc_weight_file = "a *.npy file containing the weights of the model"
+    doc_enable = "enable the nvnmd training"
+    doc_restore_descriptor = "enable to restore the parameter of embedding_net from weight.npy"
+    doc_restore_fitting_net = "enable to restore the parameter of fitting_net from weight.npy"
+    doc_quantize_descriptor = "enable the quantization of descriptor"
+    doc_quantize_fitting_net = "enable the quantization of fitting_net"
+    args = [
+        Argument("net_size", int, optional=False, default=128, doc=doc_net_size_file),
+        Argument("map_file", str, optional=False, default='none', doc=doc_map_file),
+        Argument("config_file", str, optional=False, default='none', doc=doc_config_file),
+        Argument("weight_file", str, optional=False, default='none', doc=doc_weight_file),
+        Argument("enable", bool, optional=False, default=False, doc=doc_enable),
+        Argument("restore_descriptor", bool, optional=False, default=False, doc=doc_restore_descriptor),
+        Argument("restore_fitting_net", bool, optional=False, default=False, doc=doc_restore_fitting_net),
+        Argument("quantize_descriptor", bool, optional=False, default=False, doc=doc_quantize_descriptor),
+        Argument("quantize_fitting_net", bool, optional=False, default=False, doc=doc_quantize_fitting_net),
+    ]
+    doc_nvnmd = 'The nvnmd options.'
+    return Argument("nvnmd", dict, args, [], optional=True, doc=doc_nvnmd)
\ No newline at end of file
diff --git a/deepmd/nvnmd/utils/config.py b/deepmd/nvnmd/utils/config.py
new file mode 100644
index 0000000000..0e839ac244
--- /dev/null
+++ b/deepmd/nvnmd/utils/config.py
@@ -0,0 +1,283 @@
+
+import numpy as np
+import logging
+
+from deepmd.nvnmd.data.data import jdata_config, jdata_configs, jdata_deepmd_input
+from deepmd.nvnmd.data.data import NVNMD_WELCOME, NVNMD_CITATION
+from deepmd.nvnmd.utils.fio import FioDic
+
+log = logging.getLogger(__name__)
+
+
+class NvnmdConfig():
+    r"""Configuration for NVNMD
+    records information about the model, such as its size and whether nvnmd is used
+
+    Parameters
+    ----------
+    jdata
+        the `nvnmd` dictionary of the input script (see `init_from_jdata` for the keys)
+
+    References
+    ----------
+    DOI: 10.1038/s41524-022-00773-z
+    """
+
+    def __init__(
+        self,
+        jdata: dict
+    ):
+        self.map = {}
+        self.config = jdata_config  # NOTE: the module-level dict itself, not a copy
+        self.save_path = 'nvnmd/config.npy'
+        self.weight = {}
+        self.init_from_jdata(jdata)
+
+    def init_from_jdata(self, jdata: dict = {}):
+        r"""Initialize this class with `jdata` loaded from input script; an empty dict is a no-op
+        """
+        if jdata == {}:
+            return None
+
+        self.net_size = jdata['net_size']
+        self.map_file = jdata['map_file']
+        self.config_file = jdata['config_file']
+        self.enable = jdata['enable']
+        self.weight_file = jdata['weight_file']
+        self.restore_descriptor = jdata['restore_descriptor']
+        self.restore_fitting_net = jdata['restore_fitting_net']
+        self.quantize_descriptor = jdata['quantize_descriptor']
+        self.quantize_fitting_net = jdata['quantize_fitting_net']
+
+        # load data (map, weight and config files) only when nvnmd is enabled
+        if self.enable:
+            self.map = FioDic().load(self.map_file, {})
+            self.weight = FioDic().load(self.weight_file, {})
+
+            jdata_config_ = jdata_config.copy()  # shallow copy: the nested dicts and lists stay shared
+            jdata_config_['fitn']['neuron'][0] = self.net_size  # NOTE(review): so this also writes into jdata_config
+            load_config = FioDic().load(self.config_file, jdata_config_)
+            self.init_from_config(load_config)
+            # net_size is re-read from the merged config, so a loaded file can override it
+            self.init_net_size()
+
+    def init_value(self):
+        r"""Expose the sections of `self.config` as attributes
+        """
+        self.dscp = self.config['dscp']
+        self.fitn = self.config['fitn']
+        self.size = self.config['size']
+        self.ctrl = self.config['ctrl']
+        self.nbit = self.config['nbit']
+
+    def init_train_mode(self, mod='cnn'):
+        r"""Set the restore/quantize flags for training cnn or qnn; any other `mod` changes nothing
+        """
+        if mod == 'cnn':
+            self.restore_descriptor = False
+            self.restore_fitting_net = False
+            self.quantize_descriptor = False
+            self.quantize_fitting_net = False
+        elif mod == 'qnn':
+            self.restore_descriptor = True
+            self.restore_fitting_net = True
+            self.quantize_descriptor = True
+            self.quantize_fitting_net = True
+
+    def init_from_config(self, jdata):
+        r"""Merge `jdata` into `self.config`, then initialize the sections one by one
+        """
+        self.config = FioDic().update(jdata, self.config)
+        self.config['dscp'] = self.init_dscp(self.config['dscp'], self.config)
+        self.config['fitn'] = self.init_fitn(self.config['fitn'], self.config)
+        self.config['size'] = self.init_size(self.config['size'], self.config)
+        self.config['ctrl'] = self.init_ctrl(self.config['ctrl'], self.config)
+        self.config['nbit'] = self.init_nbit(self.config['nbit'], self.config)
+        self.init_value()
+
+    def init_net_size(self):
+        r"""Read net_size from the config and, when enabled, apply the `jdata_configs` entry for it
+        """
+        # self.net_size = self.fitn['neuron'][0]
+        self.net_size = self.config['fitn']['neuron'][0]
+        if self.enable:
+            key = str(self.net_size)
+            if key in jdata_configs.keys():
+                # log.info(f"NVNMD: configure the net_size is {key}")
+                self.init_from_config(jdata_configs[key])
+            else:
+                log.error("NVNMD: don't have the configure of net_size")
+
+    def init_from_deepmd_input(self, jdata):
+        r"""Initialize members with the `descriptor` and `fitting_net` entries of `jdata`
+        """
+        self.config['dscp'] = FioDic().update(jdata['descriptor'], self.config['dscp'])
+        self.config['fitn'] = FioDic().update(jdata['fitting_net'], self.config['fitn'])
+        self.config['dscp'] = self.init_dscp(self.config['dscp'], self.config)
+        self.config['fitn'] = self.init_fitn(self.config['fitn'], self.config)
+        # net_size may have changed with the new fitting_net
+        self.init_net_size()
+        self.init_value()
+
+    def init_dscp(self, jdata: dict, jdata_parent: dict = {}) -> dict:
+        r"""Derive the members about descriptor; `jdata` is modified in place and returned
+        """
+        jdata['M1'] = jdata['neuron'][-1]
+        jdata['M2'] = jdata['axis_neuron']
+        jdata['NNODE_FEAS'] = [1] + jdata['neuron']
+        jdata['nlayer_fea'] = len(jdata['neuron'])
+        jdata['same_net'] = int(1) if jdata['type_one_side'] else int(0)
+        jdata['NIDP'] = int(np.sum(jdata['sel']))
+        jdata['NIX'] = 2 ** int(np.ceil(np.log2(jdata['NIDP'] / 1.5)))  # smallest power of two >= NIDP / 1.5
+        jdata['SEL'] = (jdata['sel'] + [0, 0, 0, 0])[0:4]  # `sel` zero-padded or truncated to 4 entries
+        jdata['ntype'] = len(jdata['sel'])
+        jdata['ntypex'] = 1 if(jdata['same_net']) else jdata['ntype']
+
+        return jdata
+
+    def init_fitn(self, jdata: dict, jdata_parent: dict = {}) -> dict:
+        r"""Derive the members about fitting network; needs M1 and M2 from `jdata_parent['dscp']`
+        """
+        M1 = jdata_parent['dscp']['M1']
+        M2 = jdata_parent['dscp']['M2']
+
+        jdata['NNODE_FITS'] = [int(M1 * M2)] + jdata['neuron'] + [1]
+        jdata['nlayer_fit'] = len(jdata['neuron']) + 1
+        jdata['NLAYER'] = jdata['nlayer_fit']
+
+        return jdata
+
+    def init_size(self, jdata: dict, jdata_parent: dict = {}) -> dict:
+        r"""Derive the members about ram capacity (Na and NaX are copies of NSPU and MSPU)
+        """
+        jdata['Na'] = jdata['NSPU']
+        jdata['NaX'] = jdata['MSPU']
+        return jdata
+
+    def init_ctrl(self, jdata: dict, jdata_parent: dict = {}) -> dict:
+        r"""Derive the members about control signal; warns if NSTDM_M1X does not divide 32
+        """
+        ntype_max = jdata_parent['dscp']['ntype_max']
+        jdata['NSADV'] = jdata['NSTDM'] + 1
+        jdata['NSEL'] = jdata['NSTDM'] * ntype_max
+        if (32 % jdata['NSTDM_M1X'] > 0):
+            log.warning("NVNMD: NSTDM_M1X must be divisor of 32 for the right runing in data_merge module")
+        return jdata
+
+    def init_nbit(self, jdata: dict, jdata_parent: dict = {}) -> dict:
+        r"""Derive the members about quantification precision (bit widths)
+        """
+        Na = jdata_parent['size']['Na']
+        NaX = jdata_parent['size']['NaX']
+        jdata['NBIT_CRD'] = jdata['NBIT_DATA'] * 3
+        jdata['NBIT_LST'] = int(np.ceil(np.log2(NaX)))
+        jdata['NBIT_ATOM'] = jdata['NBIT_SPE'] + jdata['NBIT_CRD']
+        jdata['NBIT_LONG_ATOM'] = jdata['NBIT_SPE'] + jdata['NBIT_LONG_DATA'] * 3
+        jdata['NBIT_RIJ'] = jdata['NBIT_DATA_FL'] + 5
+        jdata['NBIT_SUM'] = jdata['NBIT_DATA_FL'] + 8
+        jdata['NBIT_DATA2'] = jdata['NBIT_DATA'] + jdata['NBIT_DATA_FL']
+        jdata['NBIT_DATA2_FL'] = 2 * jdata['NBIT_DATA_FL']
+        jdata['NBIT_DATA_FEA'] = jdata['NBIT_DATA'] + jdata['NBIT_FEA_FL']
+        jdata['NBIT_DATA_FEA_FL'] = jdata['NBIT_DATA_FL'] + jdata['NBIT_FEA_FL']
+        jdata['NBIT_FORCE_FL'] = 2 * jdata['NBIT_DATA_FL'] - 1
+        return jdata
+
+    def save(self, file_name=None):
+        r"""Save all configuration to file; a given `file_name` also becomes the new `save_path`
+        """
+        if file_name is None:
+            file_name = self.save_path
+        else:
+            self.save_path = file_name
+        FioDic().save(file_name, self.config)
+
+    def get_dscp_jdata(self):
+        r"""Generate `model/descriptor` in input script (fills the shared `jdata_deepmd_input` entry)
+        """
+        dscp = self.dscp
+        jdata = jdata_deepmd_input['model']['descriptor']
+        jdata['sel'] = dscp['sel']
+        jdata['rcut'] = dscp['rcut']
+        jdata['rcut_smth'] = dscp['rcut_smth']
+        jdata['neuron'] = dscp['neuron']
+        jdata['type_one_side'] = dscp['type_one_side']
+        jdata['axis_neuron'] = dscp['axis_neuron']
+        return jdata
+
+    def get_fitn_jdata(self):
+        r"""Generate `model/fitting_net` in input script (fills the shared `jdata_deepmd_input` entry)
+        """
+        fitn = self.fitn
+        jdata = jdata_deepmd_input['model']['fitting_net']
+        jdata['neuron'] = fitn['neuron']
+        return jdata
+
+    def get_model_jdata(self):
+        r"""Generate `model` in input script (fills the shared `jdata_deepmd_input` entry)
+        """
+        jdata = jdata_deepmd_input['model']
+        jdata['descriptor'] = self.get_dscp_jdata()
+        jdata['fitting_net'] = self.get_fitn_jdata()
+        return jdata
+
+    def get_nvnmd_jdata(self):
+        r"""Generate `nvnmd` in input script (fills the shared `jdata_deepmd_input` entry)
+        """
+        jdata = jdata_deepmd_input['nvnmd']
+        jdata['net_size'] = self.net_size
+        jdata['config_file'] = self.config_file
+        jdata['weight_file'] = self.weight_file
+        jdata['map_file'] = self.map_file
+        jdata['enable'] = self.enable
+        jdata['restore_descriptor'] = self.restore_descriptor
+        jdata['restore_fitting_net'] = self.restore_fitting_net
+        jdata['quantize_descriptor'] = self.quantize_descriptor
+        jdata['quantize_fitting_net'] = self.quantize_fitting_net
+        return jdata
+
+    def get_learning_rate_jdata(self):
+        r"""Return the `learning_rate` entry of `jdata_deepmd_input` unchanged
+        """
+        return jdata_deepmd_input['learning_rate']
+
+    def get_loss_jdata(self):
+        r"""Return the `loss` entry of `jdata_deepmd_input` unchanged
+        """
+        return jdata_deepmd_input['loss']
+
+    def get_training_jdata(self):
+        r"""Return the `training` entry of `jdata_deepmd_input` unchanged
+        """
+        return jdata_deepmd_input['training']
+
+    def get_deepmd_jdata(self):
+        r"""Generate input script with member element one by one
+        """
+        jdata = jdata_deepmd_input.copy()  # shallow copy: the sections are still the shared dicts
+        jdata['model'] = self.get_model_jdata()
+        jdata['nvnmd'] = self.get_nvnmd_jdata()
+        jdata['learning_rate'] = self.get_learning_rate_jdata()
+        jdata['loss'] = self.get_loss_jdata()
+        jdata['training'] = self.get_training_jdata()
+        return jdata
+
+    def disp_message(self):
+        r"""Log (at info level) the welcome text, the citation and the current nvnmd settings
+        """
+        NVNMD_CONFIG = (
+            f"enable: {self.enable}",
+            f"net_size: {self.net_size}",
+            f"map_file: {self.map_file}",
+            f"config_file: {self.config_file}",
+            f"weight_file: {self.weight_file}",
+            f"restore_descriptor: {self.restore_descriptor}",
+            f"restore_fitting_net: {self.restore_fitting_net}",
+            f"quantize_descriptor: {self.quantize_descriptor}",
+            f"quantize_fitting_net: {self.quantize_fitting_net}",
+        )
+        for message in NVNMD_WELCOME + NVNMD_CITATION + NVNMD_CONFIG:
+            log.info(message)
+
+
+# global configuration for nvnmd: a single shared instance, created when this module is imported
+nvnmd_cfg = NvnmdConfig(jdata_deepmd_input['nvnmd'])
diff --git a/deepmd/nvnmd/utils/encode.py b/deepmd/nvnmd/utils/encode.py
new file mode 100644
index 0000000000..187a9dcfa8
--- /dev/null
+++ b/deepmd/nvnmd/utils/encode.py
@@ -0,0 +1,192 @@
+
+import numpy as np
+import logging 
+
+from deepmd.nvnmd.data.data import jdata_sys
+
+log = logging.getLogger(__name__)
+
+
+class Encode():
+    r"""Helpers that quantize values and convert between dec, bin and hex representations
+    """
+
+    def __init__(self):
+        pass
+
+    def qr(self, v, nbit: int = 14):
+        r"""Quantize with rounding: return round(v * 2**nbit)"""
+        scale = 2**nbit
+        return np.round(v * scale)
+
+    def qf(self, v, nbit: int = 14):
+        r"""Quantize with floor: return floor(v * 2**nbit)"""
+        scale = 2**nbit
+        return np.floor(v * scale)
+
+    def qc(self, v, nbit: int = 14):
+        r"""Quantize with ceil: return ceil(v * 2**nbit)"""
+        scale = 2**nbit
+        return np.ceil(v * scale)
+
+    def check_dec(self, idec, nbit, signed=False, name=''):
+        r"""Warn (only when jdata_sys['debug'] is set) about data outside the range
+
+        the accepted range is :math:`[0, 2^{nbit}-1]` for unsigned data and
+        the symmetric :math:`[-(2^{nbit-1}-1), 2^{nbit-1}-1]` for signed data
+        """
+        prec = np.int64(2**nbit)
+        # limits: the signed range is symmetric, so -2**(nbit-1) itself is out of range
+        upper = prec // 2 - 1 if signed else prec - 1
+        lower = -upper if signed else 0
+        # the comparisons are always evaluated; only the reporting depends on the debug flag
+        too_small = idec < lower
+        too_big = idec > upper
+
+        if not jdata_sys['debug']:
+            return
+        if np.sum(too_small) > 0:
+            log.warning(f"NVNMD: there are data {name} smaller than the lower limit {lower}")
+        if np.sum(too_big) > 0:
+            log.warning(f"NVNMD: there are data {name} bigger than the upper limit {upper}")
+
+    def extend_list(self, slbin, nfull):
+        r"""Pad the list `slbin` with all-zero strings up to `nfull` elements
+
+        each appended element has the same length as `slbin[0]`;
+        nothing is appended when the list is already long enough
+
+        for example, with
+
+        | slbin = ['10010','10100'],
+        | nfull = 4
+
+        the result is
+
+        ['10010','10100','00000','00000']
+        """
+        n_missing = int(nfull) - len(slbin)
+        zero_item = '0' * len(slbin[0])
+        return slbin + [zero_item] * n_missing
+
+    def extend_bin(self, slbin, nfull):
+        r"""Left-pad every string of `slbin` with zeros
+
+        the pad width is `nfull` minus the length of `slbin[0]`
+        (the same pad is used for all elements)
+
+        for example, with
+
+        | slbin = ['10010','10100'],
+        | nfull = 6
+
+        the result is
+
+        ['010010','010100']
+        """
+        pad = '0' * (int(nfull) - len(slbin[0]))
+        return [pad + item for item in slbin]
+
+    def extend_hex(self, slhex, nfull):
+        r"""Left-pad every hex string of `slhex` with zeros
+
+        `nfull` is divided by 4 to get the target number of hex characters
+        """
+        n_digit = int(nfull) // 4
+        pad = '0' * (n_digit - len(slhex[0]))
+        return [pad + item for item in slhex]
+
+    def split_bin(self, sbin, nbit: int):
+        r"""Split `sbin` into segments of `nbit` characters
+
+        a string is zero-padded on the left to a multiple of `nbit` and its
+        segments are returned last-first; a list is split element by element
+        and the results are concatenated
+        """
+        if isinstance(sbin, list):
+            segments = []
+            for item in sbin:
+                segments += self.split_bin(item, nbit)
+            return segments
+
+        nseg = int(np.ceil(len(sbin) / nbit))
+        padded = '0' * int(nseg * nbit - len(sbin)) + sbin
+        # walk the segments from the tail so that the last one comes first
+        return [padded[ii * nbit:(ii + 1) * nbit] for ii in reversed(range(nseg))]
+
+    def reverse_bin(self, slbin, nreverse):
+        r"""Reverse the order of the strings inside each group of `nreverse` elements
+        """
+        width = int(nreverse)
+        # pad with zero strings first, in case len(slbin) is not a multiple of the group width
+        ngroup = int(np.ceil(len(slbin) / width))
+        padded = self.extend_list(slbin, ngroup * width)
+        return [padded[gg * width + kk] for gg in range(ngroup) for kk in reversed(range(width))]
+
+    def merge_bin(self, slbin, nmerge):
+        r"""Concatenate every `nmerge` consecutive strings of `slbin` into one string
+        """
+        width = int(nmerge)
+        # pad with zero strings first, in case len(slbin) is not a multiple of the group width
+        ngroup = int(np.ceil(len(slbin) / width))
+        padded = self.extend_list(slbin, ngroup * width)
+        return [''.join(padded[start:start + width]) for start in range(0, ngroup * width, width)]
+
+    def dec2bin(self, idec, nbit=10, signed=False, name=''):
+        r"""Convert integer data to a flat list of `nbit`-character binary strings
+
+        the input is flattened and cast to int64, checked with `check_dec`,
+        then clamped to the accepted range; negative values come out in
+        two's complement form because a multiple of 2**nbit is added before
+        the lowest `nbit` digits are taken
+        """
+        values = np.int64(np.reshape(np.array(idec), [-1]))
+        self.check_dec(values, nbit, signed, name)
+
+        prec = np.int64(2**nbit)
+        if signed:
+            upper = prec // 2 - 1
+            lower = -upper
+        else:
+            upper = prec - 1
+            lower = 0
+        # clamp, then shift by 2 * 2**nbit so that bin() never sees a negative number
+        values = np.minimum(upper, np.maximum(lower, values)) + 2 * prec
+
+        # keep only the last nbit binary digits of every shifted value
+        return [bin(item)[-nbit:] for item in values]
+
+    def hex2bin_str(self, shex):
+        r"""Convert a hex string to a binary string, four bits per hex character
+        """
+        nibbles = []
+        for digit in shex:
+            # adding 16 sets a fifth bit, so the last four characters keep their leading zeros
+            shifted = bin(int(digit, 16) + 16)
+            nibbles.append(shifted[-4:])
+        return ''.join(nibbles)
+
+    def hex2bin(self, data):
+        r"""Convert hex strings (flattened from any array shape) to a list of binary strings
+        """
+        flat = np.reshape(np.array(data), [-1])
+        return list(map(self.hex2bin_str, flat))
+
+    def bin2hex_str(self, sbin):
+        r"""Convert a binary string to a hex string
+
+        the input is zero-padded on the left to a multiple of four bits
+        """
+        ndigit = int(np.ceil(len(sbin) / 4))
+        padded = '0' * (ndigit * 4 - len(sbin)) + sbin
+        digits = [
+            hex(int(padded[4 * ii: 4 * (ii + 1)], 2) + 16)[-1] for ii in range(ndigit)
+        ]
+        return ''.join(digits)
+
+    def bin2hex(self, data):
+        r"""Convert binary strings (flattened from any array shape) to a list of hex strings
+        """
+        flat = np.reshape(np.array(data), [-1])
+        return list(map(self.bin2hex_str, flat))
diff --git a/deepmd/nvnmd/utils/fio.py b/deepmd/nvnmd/utils/fio.py
new file mode 100644
index 0000000000..5d1f43f6e9
--- /dev/null
+++ b/deepmd/nvnmd/utils/fio.py
@@ -0,0 +1,212 @@
+
+import os
+import numpy as np
+import json
+import struct
+
+import logging
+log = logging.getLogger(__name__)
+
+
+class Fio:
+    r"""Basic file-system helpers shared by the other Fio* classes
+    """
+    def __init__(self):
+        pass
+
+    def exits(self, file_name=''):
+        r"""Return True if `file_name` exists; an empty name also counts as existing"""
+        if file_name == '':
+            return True
+        return os.path.exists(file_name)
+
+    def mkdir(self, path_name=''):
+        r"""Create directory `path_name` (with its parents) unless it already exists"""
+        if not self.exits(path_name):
+            os.makedirs(path_name)
+
+    def create_file_path(self, file_name=''):
+        r"""Create the directory part of `file_name` (everything before the last '/')"""
+        pars = file_name.split('/')
+        path_name = '/'.join(pars[:-1])
+        self.mkdir(path_name)
+
+    def is_path(self, path):
+        r"""Return True if `path` is an existing directory"""
+        return self.exits(path) and os.path.isdir(path)
+
+    def is_file(self, file_name):
+        r"""Return True if `file_name` is an existing file"""
+        return self.exits(file_name) and os.path.isfile(file_name)
+
+    def get_file_list(self, path) -> list:
+        r"""Recursively list the files below `path`; [] unless `path` is an existing directory"""
+        if not self.is_path(path):
+            return []
+        file_lst = []
+        for name in os.listdir(path):
+            full_name = os.path.join(path, name)
+            # an entry that is neither a file nor a directory (e.g. a broken link) adds nothing
+            file_lst.extend([full_name] if self.is_file(full_name) else self.get_file_list(full_name))
+        return file_lst
+
+
+class FioDic:
+    r"""Input and output for dict class data
+    `.json` files are handled by FioJsonDic, every other name by FioNpyDic
+    """
+    def __init__(self) -> None:
+        pass
+
+    def load(self, file_name='', default_value={}):
+        r"""Load a dict from `file_name`; `default_value` is passed on to the backend
+        """
+        # only the `.json` suffix selects the JSON backend;
+        # `.npy` and any other suffix are read by FioNpyDic
+        backend = FioJsonDic() if file_name.endswith('.json') else FioNpyDic()
+        return backend.load(file_name, default_value)
+
+    def save(self, file_name='', dic={}):
+        r"""Save `dic` to `file_name` with the backend selected by the suffix
+        """
+        # only the `.json` suffix selects the JSON backend;
+        # `.npy` and any other suffix are written by FioNpyDic
+        backend = FioJsonDic() if file_name.endswith('.json') else FioNpyDic()
+        backend.save(file_name, dic)
+
+    def get(self, jdata, key, default_value):
+        r"""Return `jdata[key]` if `key` is among `jdata.keys()`, else `default_value`
+        """
+        found = key in jdata.keys()
+        return jdata[key] if found else default_value
+
+    def update(self, jdata, jdata_o):
+        r"""Overwrite, in place, the values of `jdata_o` whose keys also appear in `jdata`
+
+        Parameters
+        ----------
+        jdata
+            new values; keys that are missing from `jdata_o` are ignored
+        jdata_o
+            dict to update (nested dicts are merged recursively); it is also returned
+        """
+        for key in jdata.keys():
+            if key not in jdata_o.keys():
+                continue
+            nested = isinstance(jdata_o[key], dict)
+            # nested dicts are merged key by key; any other value is replaced
+            jdata_o[key] = self.update(jdata[key], jdata_o[key]) if nested else jdata[key]
+        return jdata_o
+
+
+class FioNpyDic:
+    r"""Input and output for .npy file that stores a dictionary as element 0 of an array
+    """
+    def __init__(self):
+        pass
+
+    def load(self, file_name='', default_value={}):
+        r"""Return the dict stored in `file_name`, or `default_value` if the file is missing"""
+        if not Fio().exits(file_name):
+            log.warning(f"can not find {file_name}")
+            return default_value
+        log.info(f"load {file_name}")
+        # NOTE: allow_pickle=True unpickles the file content, so only load trusted files
+        return np.load(file_name, allow_pickle=True)[0]
+
+    def save(self, file_name='', dic={}):
+        Fio().create_file_path(file_name)  # create the target directory first if needed
+        np.save(file_name, [dic])  # wrapped in a list: `load` reads element 0 back
+
+
+class FioJsonDic:
+    r"""Input and output for .json file that stores a dictionary
+    """
+    def __init__(self):
+        pass
+
+    def load(self, file_name='', default_value={}):
+        r"""Parse the .json file `file_name` and return the result
+
+        `default_value` is returned (after a warning) when the file is missing
+        """
+        if not Fio().exits(file_name):
+            log.warning(f"can not find {file_name}")
+            return default_value
+        log.info(f"load {file_name}")
+        with open(file_name, 'r') as fr:
+            text = fr.read()
+        return json.loads(text)
+
+    def save(self, file_name='', dic={}):
+        r"""Write `dic` to `file_name` as JSON indented by four spaces
+        """
+        log.info(f"write jdata to {file_name}")
+        Fio().create_file_path(file_name)
+        with open(file_name, 'w') as fw:
+            json.dump(dic, fw, indent=4)
+
+
+class FioBin():
+    r"""Input and output for binary file
+    """
+    def __init__(self):
+        pass
+
+    def load(self, file_name='', default_value=''):
+        r"""Read a binary file
+
+        returns the file content as bytes, or `default_value` (after a
+        warning) when the file is missing
+        """
+        if not Fio().exits(file_name):
+            log.warning(f"can not find {file_name}")
+            return default_value
+        log.info(f"load {file_name}")
+        with open(file_name, 'rb') as fr:
+            return fr.read()
+
+    def save(self, file_name: str = '', data: str = ''):
+        r"""Write hex strings to a binary file, two hex characters per byte
+        `data` is a list of hex strings; a single string is treated as a
+        one-element list and a trailing odd character of an element is ignored
+        """
+        log.info(f"write binary to {file_name}")
+        Fio().create_file_path(file_name)
+        if isinstance(data, str):
+            data = [data]
+        with open(file_name, 'wb') as fp:
+            for si in data:
+                fp.write(bytes(int(si[kk:kk + 2], 16) for kk in range(0, len(si) - 1, 2)))
+
+
+class FioTxt():
+    r"""Input and output for .txt file with string
+    """
+    def __init__(self):
+        pass
+
+    def load(self, file_name='', default_value=[]):
+        r"""Read a UTF-8 text file into a list of lines without newline characters
+
+        `default_value` is returned as is when the file is missing
+        """
+        if not Fio().exits(file_name):
+            log.info(f"can not find {file_name}")
+            return default_value
+        log.info(f"load {file_name}")
+        with open(file_name, 'r', encoding='utf-8') as fr:
+            lines = fr.readlines()
+        return [line.replace('\n', '') for line in lines]
+
+    def save(self, file_name: str = '', data: list = []):
+        r"""Write strings to a UTF-8 text file, one per line (a bare string is written as one line)
+        """
+        log.info(f"write string to txt file {file_name}")
+        Fio().create_file_path(file_name)
+        if isinstance(data, str):
+            data = [data]
+        lines = [d + '\n' for d in data]
+        # same encoding as `load`, so a saved file reads back identically on every platform
+        with open(file_name, 'w', encoding='utf-8') as fw:
+            fw.writelines(lines)
diff --git a/deepmd/nvnmd/utils/network.py b/deepmd/nvnmd/utils/network.py
new file mode 100644
index 0000000000..40ca58bbb0
--- /dev/null
+++ b/deepmd/nvnmd/utils/network.py
@@ -0,0 +1,196 @@
+
+import numpy as np
+
+from deepmd.env import tf
+from deepmd.env import GLOBAL_TF_FLOAT_PRECISION
+from deepmd.env import op_module
+
+from deepmd.nvnmd.utils.config import nvnmd_cfg
+from deepmd.nvnmd.utils.weight import get_constant_initializer
+from deepmd.utils.network import variable_summaries
+
+
+def get_sess():
+    init_op = tf.global_variables_initializer()
+    sess = tf.Session()
+    sess.run(init_op)
+    return sess
+
+
+def matmul2_qq(a, b, nbit):
+    r"""Quantized matmul operation for 2d tensor.
+    a and b is input tensor, nbit represent quantification precision
+    """
+    sh_a = a.get_shape().as_list()
+    sh_b = b.get_shape().as_list()
+    a = tf.reshape(a, [-1, 1, sh_a[1]])
+    b = tf.reshape(tf.transpose(b), [1, sh_b[1], sh_b[0]])
+    y = a * b
+    y = qf(y, nbit)
+    y = tf.reduce_sum(y, axis=2)
+    return y
+
+
+def matmul3_qq(a, b, nbit):
+    r"""Quantized matmul operation for 3d tensor.
+    a and b is input tensor, nbit represent quantification precision
+    """
+    sh_a = a.get_shape().as_list()
+    sh_b = b.get_shape().as_list()
+    a = tf.reshape(a, [-1, sh_a[1], 1, sh_a[2]])
+    b = tf.reshape(tf.transpose(b, [0, 2, 1]), [-1, 1, sh_b[2], sh_b[1]])
+    y = a * b
+    if nbit == -1:
+        y = y
+    else:
+        y = qf(y, nbit)
+    y = tf.reduce_sum(y, axis=3)
+    return y
+
+
+def qf(x, nbit):
+    r"""Quantize and floor tensor `x` with quantification precision `nbit`.
+    """
+    prec = 2**nbit
+
+    y = tf.floor(x * prec) / prec
+    y = x + tf.stop_gradient(y - x)
+    return y
+
+
+def qr(x, nbit):
+    r"""Quantize and round tensor `x` with quantification precision `nbit`.
+    """
+    prec = 2**nbit
+
+    y = tf.round(x * prec) / prec
+    y = x + tf.stop_gradient(y - x)
+    return y
+
+
+# fitting_net
+def tanh2(x, nbit=-1, nbit2=-1):
+    r"""User-defined activation function tanh2
+
+    Parameter
+    ---------
+    x
+        input tensor
+    nbit
+        quantification precision for forward calculation
+    nbit2
+        quantification precision for backward calculation
+    """
+    y = op_module.tanh2_nvnmd(x, 0, nbit, nbit2, -1)
+    return y
+
+
+def tanh4(x, nbit=-1, nbit2=-1):
+    r"""User-defined activation function tanh4
+
+    Parameter
+    ---------
+    x
+        input tensor
+    nbit
+        quantification precision for forward calculation
+    nbit2
+        quantification precision for backward calculation
+    """
+    y = op_module.tanh4_nvnmd(x, 0, nbit, nbit2, -1)
+    return y
+
+
+def one_layer_wb(
+    shape,
+    outputs_size,
+    bavg,
+    stddev,
+    precision,
+    trainable,
+    initial_variables,
+    seed,
+    uniform_seed,
+    name
+):
+    if nvnmd_cfg.restore_fitting_net:
+        # initializer
+        w_initializer = get_constant_initializer(nvnmd_cfg.weight, 'matrix')
+        b_initializer = get_constant_initializer(nvnmd_cfg.weight, 'bias')
+    else:
+        w_initializer = tf.random_normal_initializer(
+            stddev=stddev / np.sqrt(shape[1] + outputs_size),
+            seed=seed if (seed is None or uniform_seed) else seed + 0)
+        b_initializer = tf.random_normal_initializer(
+            stddev=stddev,
+            mean=bavg,
+            seed=seed if (seed is None or uniform_seed) else seed + 1)
+        if initial_variables is not None:
+            w_initializer = tf.constant_initializer(initial_variables[name + '/matrix'])
+            b_initializer = tf.constant_initializer(initial_variables[name + '/bias'])
+    # variable
+    w = tf.get_variable('matrix',
+                        [shape[1], outputs_size],
+                        precision,
+                        w_initializer,
+                        trainable=trainable)
+    variable_summaries(w, 'matrix')
+    b = tf.get_variable('bias',
+                        [outputs_size],
+                        precision,
+                        b_initializer,
+                        trainable=trainable)
+    variable_summaries(b, 'bias')
+
+    return w, b
+
+
+def one_layer(inputs,
+              outputs_size,
+              activation_fn=tf.nn.tanh,
+              precision=GLOBAL_TF_FLOAT_PRECISION,
+              stddev=1.0,
+              bavg=0.0,
+              name='linear',
+              reuse=None,
+              seed=None,
+              use_timestep=False,
+              trainable=True,
+              useBN=False,
+              uniform_seed=False,
+              initial_variables=None,
+              mixed_prec=None,
+              final_layer=False):
+    r"""Build one layer with continuous or quantized value; a non-None `activation_fn` is always replaced by `tanh4`.
+    Weight and bias come from `one_layer_wb`; `use_timestep`, `useBN`, `mixed_prec` and `final_layer` are not read here.
+    """
+    if activation_fn is not None:
+        activation_fn = tanh4  # NOTE(review): overrides whatever activation the caller passed - confirm intended
+    with tf.variable_scope(name, reuse=reuse):
+        shape = inputs.get_shape().as_list()
+        w, b = one_layer_wb(shape, outputs_size, bavg, stddev, precision, trainable, initial_variables, seed, uniform_seed, name)
+        if nvnmd_cfg.quantize_fitting_net:
+            NBIT_DATA_FL = nvnmd_cfg.nbit['NBIT_DATA_FL']
+            NBIT_WEIGHT_FL = nvnmd_cfg.nbit['NBIT_WEIGHT_FL']
+            # quantize the input (floor) and the weight (round) before the custom matmul op
+            inputs = qf(inputs, NBIT_DATA_FL)
+            w = qr(w, NBIT_WEIGHT_FL)
+            with tf.variable_scope('wx', reuse=reuse):
+                wx = op_module.matmul_nvnmd(inputs, w, 0, NBIT_DATA_FL, NBIT_DATA_FL, -1)
+            # quantize the bias (round) and add it to the product
+            b = qr(b, NBIT_DATA_FL)
+            with tf.variable_scope('wxb', reuse=reuse):
+                hidden = wx + b
+            # apply the activation with quantized forward/backward; skipped when activation_fn is None
+            with tf.variable_scope('actfun', reuse=reuse):
+                if activation_fn is not None:
+                    y = activation_fn(hidden, NBIT_DATA_FL, NBIT_DATA_FL)
+                else:
+                    y = hidden
+        else:
+            hidden = tf.matmul(inputs, w) + b
+            y = activation_fn(hidden, -1, -1) if (activation_fn is not None) else hidden
+    # 'reshape' is necessary
+    # the next layer needs shape of input tensor to build weight
+    y = tf.reshape(y, [-1, outputs_size])
+    return y
diff --git a/deepmd/nvnmd/utils/op.py b/deepmd/nvnmd/utils/op.py
new file mode 100644
index 0000000000..4aa33de72e
--- /dev/null
+++ b/deepmd/nvnmd/utils/op.py
@@ -0,0 +1,11 @@
+
+import numpy as np
+
+
+def map_nvnmd(x, map_y, map_dy, prec, nbit=None):
+    r"""Mapping function implemented by numpy
+    """
+    xk = int(np.floor(x / prec))
+    dx = x - xk * prec
+    y = map_y[xk] + map_dy[xk] * dx
+    return y
diff --git a/deepmd/nvnmd/utils/weight.py b/deepmd/nvnmd/utils/weight.py
new file mode 100644
index 0000000000..9b0fca00f6
--- /dev/null
+++ b/deepmd/nvnmd/utils/weight.py
@@ -0,0 +1,95 @@
+
+import numpy as np
+import logging
+
+from deepmd.env import tf
+
+log = logging.getLogger(__name__)
+
+
+def get_weight(weights, key):
+    r"""Get weight value according to key
+    """
+    if key in weights.keys():
+        return weights[key]
+    else:
+        log.warning(f"There is not {key} in weights.")
+        return None
+
+
+def get_normalize(weights: dict):
+    r"""Get normalize parameter (avg and std) of :math:`s_{ji}`
+    """
+    key = "descrpt_attr.t_avg"
+    avg = get_weight(weights, key)
+    key = "descrpt_attr.t_std"
+    std = get_weight(weights, key)
+    return avg, std
+
+
+def get_rng_s(weights: dict):
+    r"""Guess the range of :math:`s_{ji}`
+    """
+    avg, std = get_normalize(weights)
+    smin = np.min(-avg[:, 0] / std[:, 0])
+    smax = np.max(2.0 / std[:, 0])
+    return smin, smax
+
+
+def get_filter_weight(weights: dict, spe_i: int, spe_j: int, layer_l: int):
+    r"""Get weight and bias of embedding network
+
+    Parameters
+    ----------
+    spe_i(int)
+        special order of central atom i
+        0~ntype-1
+    spe_j(int)
+        special order of neighbor atom j
+        0~ntype-1
+    layer_l
+        layer order in embedding network
+        1~nlayer
+    """
+    # key = f"filter_type_{spe_i}.matrix_{layer_l}_{spe_j}" # type_one_side = false
+    key = f"filter_type_all.matrix_{layer_l}_{spe_j}"  # type_one_side = true
+    weight = get_weight(weights, key)
+    # key = f"filter_type_{spe_i}.bias_{layer_l}_{spe_j}" # type_one_side = false
+    key = f"filter_type_all.bias_{layer_l}_{spe_j}"  # type_one_side = true
+    bias = get_weight(weights, key)
+    return weight, bias
+
+
+def get_fitnet_weight(weights: dict, spe_i: int, layer_l: int, nlayer: int = 10):
+    r"""Get weight and bias of fitting network
+
+    Parameters
+    ----------
+    spe_i(int)
+        special order of central atom i
+        0~ntype-1
+    layer_l(int)
+        layer order in embedding network
+        0~nlayer-1
+    """
+    if layer_l == nlayer - 1:
+        key = f"final_layer_type_{spe_i}.matrix"
+        weight = get_weight(weights, key)
+        key = f"final_layer_type_{spe_i}.bias"
+        bias = get_weight(weights, key)
+    else:
+        key = f"layer_{layer_l}_type_{spe_i}.matrix"
+        weight = get_weight(weights, key)
+        key = f"layer_{layer_l}_type_{spe_i}.bias"
+        bias = get_weight(weights, key)
+
+    return weight, bias
+
+
+def get_constant_initializer(weights, name):
+    r"""Get initial value by name and create a initializer
+    """
+    scope = tf.get_variable_scope().name
+    name = scope + '.' + name
+    value = get_weight(weights, name)
+    return tf.constant_initializer(value)
diff --git a/deepmd/train/trainer.py b/deepmd/train/trainer.py
index 77d5028051..6f1476c82d 100644
--- a/deepmd/train/trainer.py
+++ b/deepmd/train/trainer.py
@@ -35,6 +35,8 @@
 
 log = logging.getLogger(__name__)
 
+# nvnmd
+from deepmd.nvnmd.utils.config import nvnmd_cfg
 
 def _is_subdir(path, directory):
     path = os.path.realpath(path)
@@ -63,6 +65,14 @@ def _init_param(self, jdata):
         self.model_param    = model_param
         self.descrpt_param  = descrpt_param
         
+        # nvnmd
+        self.nvnmd_param = jdata.get('nvnmd', {})
+        nvnmd_cfg.init_from_jdata(self.nvnmd_param)
+        nvnmd_cfg.init_from_deepmd_input(model_param)
+        if nvnmd_cfg.enable:
+            nvnmd_cfg.disp_message()
+            nvnmd_cfg.save()
+        
         # descriptor
         try:
             descrpt_type = descrpt_param['type']
diff --git a/deepmd/utils/argcheck.py b/deepmd/utils/argcheck.py
index ed1253d171..e7c7edb170 100644
--- a/deepmd/utils/argcheck.py
+++ b/deepmd/utils/argcheck.py
@@ -6,6 +6,7 @@
 from deepmd.utils.plugin import Plugin
 import json
 
+from deepmd.nvnmd.utils.argcheck import nvnmd_args
 
 def list_to_doc(xx):
     items = []
@@ -27,7 +28,7 @@ def type_embedding_args():
     doc_neuron = 'Number of neurons in each hidden layers of the embedding net. When two layers are of the same size or one layer is twice as large as the previous layer, a skip connection is built.'
     doc_resnet_dt = 'Whether to use a "Timestep" in the skip connection'
     doc_seed = 'Random seed for parameter initialization'
-    doc_activation_function = f'The activation function in the embedding net. Supported activation functions are {list_to_doc(ACTIVATION_FN_DICT.keys())}'
+    doc_activation_function = f'The activation function in the embedding net. Supported activation functions are {list_to_doc(ACTIVATION_FN_DICT.keys())}. Note that "gelu" denotes the custom operator version, and "gelu_tf" denotes the TF standard version.'
     doc_precision = f'The precision of the embedding net parameters, supported options are {list_to_doc(PRECISION_DICT.keys())} Default follows the interface precision.'
     doc_trainable = 'If the parameters in the embedding net are trainable'
     
@@ -127,7 +128,7 @@ def descrpt_se_a_args():
     doc_rcut_smth = 'Where to start smoothing. For example the 1/r term is smoothed from `rcut` to `rcut_smth`'
     doc_neuron = 'Number of neurons in each hidden layers of the embedding net. When two layers are of the same size or one layer is twice as large as the previous layer, a skip connection is built.'
     doc_axis_neuron = 'Size of the submatrix of G (embedding matrix).'
-    doc_activation_function = f'The activation function in the embedding net. Supported activation functions are {list_to_doc(ACTIVATION_FN_DICT.keys())}'
+    doc_activation_function = f'The activation function in the embedding net. Supported activation functions are {list_to_doc(ACTIVATION_FN_DICT.keys())}. Note that "gelu" denotes the custom operator version, and "gelu_tf" denotes the TF standard version.'
     doc_resnet_dt = 'Whether to use a "Timestep" in the skip connection'
     doc_type_one_side = 'Try to build N_types embedding nets. Otherwise, building N_types^2 embedding nets'
     doc_precision = f'The precision of the embedding net parameters, supported options are {list_to_doc(PRECISION_DICT.keys())} Default follows the interface precision.'
@@ -161,7 +162,7 @@ def descrpt_se_t_args():
     doc_rcut = 'The cut-off radius.'
     doc_rcut_smth = 'Where to start smoothing. For example the 1/r term is smoothed from `rcut` to `rcut_smth`'
     doc_neuron = 'Number of neurons in each hidden layers of the embedding net. When two layers are of the same size or one layer is twice as large as the previous layer, a skip connection is built.'
-    doc_activation_function = f'The activation function in the embedding net. Supported activation functions are {list_to_doc(ACTIVATION_FN_DICT.keys())}'
+    doc_activation_function = f'The activation function in the embedding net. Supported activation functions are {list_to_doc(ACTIVATION_FN_DICT.keys())}. Note that "gelu" denotes the custom operator version, and "gelu_tf" denotes the TF standard version.'
     doc_resnet_dt = 'Whether to use a "Timestep" in the skip connection'
     doc_precision = f'The precision of the embedding net parameters, supported options are {list_to_doc(PRECISION_DICT.keys())} Default follows the interface precision.'
     doc_trainable = 'If the parameters in the embedding net are trainable'
@@ -204,7 +205,7 @@ def descrpt_se_r_args():
     doc_rcut = 'The cut-off radius.'
     doc_rcut_smth = 'Where to start smoothing. For example the 1/r term is smoothed from `rcut` to `rcut_smth`'
     doc_neuron = 'Number of neurons in each hidden layers of the embedding net. When two layers are of the same size or one layer is twice as large as the previous layer, a skip connection is built.'
-    doc_activation_function = f'The activation function in the embedding net. Supported activation functions are {list_to_doc(ACTIVATION_FN_DICT.keys())}'
+    doc_activation_function = f'The activation function in the embedding net. Supported activation functions are {list_to_doc(ACTIVATION_FN_DICT.keys())}. Note that "gelu" denotes the custom operator version, and "gelu_tf" denotes the TF standard version.'
     doc_resnet_dt = 'Whether to use a "Timestep" in the skip connection'
     doc_type_one_side = 'Try to build N_types embedding nets. Otherwise, building N_types^2 embedding nets'
     doc_precision = f'The precision of the embedding net parameters, supported options are {list_to_doc(PRECISION_DICT.keys())} Default follows the interface precision.'
@@ -261,7 +262,7 @@ def fitting_ener():
     doc_numb_fparam = 'The dimension of the frame parameter. If set to >0, file `fparam.npy` should be included to provided the input fparams.'
     doc_numb_aparam = 'The dimension of the atomic parameter. If set to >0, file `aparam.npy` should be included to provided the input aparams.'
     doc_neuron = 'The number of neurons in each hidden layers of the fitting net. When two hidden layers are of the same size, a skip connection is built.'
-    doc_activation_function = f'The activation function in the fitting net. Supported activation functions are {list_to_doc(ACTIVATION_FN_DICT.keys())}'
+    doc_activation_function = f'The activation function in the fitting net. Supported activation functions are {list_to_doc(ACTIVATION_FN_DICT.keys())}. Note that "gelu" denotes the custom operator version, and "gelu_tf" denotes the TF standard version.'
     doc_precision = f'The precision of the fitting net parameters, supported options are {list_to_doc(PRECISION_DICT.keys())} Default follows the interface precision.'
     doc_resnet_dt = 'Whether to use a "Timestep" in the skip connection'
     doc_trainable = 'Whether the parameters in the fitting net are trainable. This option can be\n\n\
@@ -287,7 +288,7 @@ def fitting_ener():
 
 def fitting_polar():
     doc_neuron = 'The number of neurons in each hidden layers of the fitting net. When two hidden layers are of the same size, a skip connection is built.'
-    doc_activation_function = f'The activation function in the fitting net. Supported activation functions are {list_to_doc(ACTIVATION_FN_DICT.keys())}'
+    doc_activation_function = f'The activation function in the fitting net. Supported activation functions are {list_to_doc(ACTIVATION_FN_DICT.keys())}. Note that "gelu" denotes the custom operator version, and "gelu_tf" denotes the TF standard version.'
     doc_resnet_dt = 'Whether to use a "Timestep" in the skip connection'
     doc_precision = f'The precision of the fitting net parameters, supported options are {list_to_doc(PRECISION_DICT.keys())} Default follows the interface precision.'
     doc_scale = 'The output of the fitting net (polarizability matrix) will be scaled by ``scale``'
@@ -319,7 +320,7 @@ def fitting_polar():
 
 def fitting_dipole():
     doc_neuron = 'The number of neurons in each hidden layers of the fitting net. When two hidden layers are of the same size, a skip connection is built.'
-    doc_activation_function = f'The activation function in the fitting net. Supported activation functions are {list_to_doc(ACTIVATION_FN_DICT.keys())}'
+    doc_activation_function = f'The activation function in the fitting net. Supported activation functions are {list_to_doc(ACTIVATION_FN_DICT.keys())}. Note that "gelu" denotes the custom operator version, and "gelu_tf" denotes the TF standard version.'
     doc_resnet_dt = 'Whether to use a "Timestep" in the skip connection'
     doc_precision = f'The precision of the fitting net parameters, supported options are {list_to_doc(PRECISION_DICT.keys())} Default follows the interface precision.'
     doc_sel_type = 'The atom types for which the atomic dipole will be provided. If not set, all types will be selected.'
@@ -685,11 +686,13 @@ def gen_doc(*, make_anchor=True, make_link=True, **kwargs):
     lra = learning_rate_args()
     la = loss_args()
     ta = training_args()
+    nvnmda = nvnmd_args()
     ptr = []
     ptr.append(ma.gen_doc(make_anchor=make_anchor, make_link=make_link, **kwargs))
     ptr.append(la.gen_doc(make_anchor=make_anchor, make_link=make_link, **kwargs))
     ptr.append(lra.gen_doc(make_anchor=make_anchor, make_link=make_link, **kwargs))
     ptr.append(ta.gen_doc(make_anchor=make_anchor, make_link=make_link, **kwargs))
+    ptr.append(nvnmda.gen_doc(make_anchor=make_anchor, make_link=make_link, **kwargs))
 
     key_words = []
     for ii in "\n\n".join(ptr).split('\n'):
@@ -705,6 +708,7 @@ def gen_json(**kwargs):
         learning_rate_args(),
         loss_args(),
         training_args(),
+        nvnmd_args(),
     ), cls=ArgumentEncoder)
 
 def normalize_hybrid_list(hy_list):
@@ -726,8 +730,9 @@ def normalize(data):
     lra = learning_rate_args()
     la = loss_args()
     ta = training_args()
+    nvnmda = nvnmd_args()
 
-    base = Argument("base", dict, [ma, lra, la, ta])
+    base = Argument("base", dict, [ma, lra, la, ta, nvnmda])
     data = base.normalize_value(data, trim_pattern="_*")
     base.check_value(data, strict=True)
 
diff --git a/doc/index.rst b/doc/index.rst
index 5ba11a764a..c31911b1fb 100644
--- a/doc/index.rst
+++ b/doc/index.rst
@@ -35,6 +35,7 @@ DeePMD-kit is a package written in Python/C++, designed to minimize the effort r
    inference/index
    cli
    third-party/index
+   nvnmd/index
    troubleshooting/index
 
 .. _developer-guide:
diff --git a/doc/inference/cxx.md b/doc/inference/cxx.md
index 3871a3d92d..746164df23 100644
--- a/doc/inference/cxx.md
+++ b/doc/inference/cxx.md
@@ -17,7 +17,7 @@ where `e`, `f` and `v` are predicted energy, force and virial of the system, res
 
 You can compile `infer_water.cpp` using `gcc`:
 ```sh
-gcc infer_water.cpp -D HIGH_PREC -L $deepmd_root/lib -L $tensorflow_root/lib -I $deepmd_root/include -I $tensorflow_root/include -Wl,--no-as-needed -ldeepmd_cc -lstdc++ -ltensorflow_cc -Wl,-rpath=$deepmd_root/lib -Wl,-rpath=$tensorflow_root/lib -o infer_water
+gcc infer_water.cpp -D HIGH_PREC -L $deepmd_root/lib -L $tensorflow_root/lib -I $deepmd_root/include -Wl,--no-as-needed -ldeepmd_cc -lstdc++ -ltensorflow_cc -Wl,-rpath=$deepmd_root/lib -Wl,-rpath=$tensorflow_root/lib -o infer_water
 ```
 and then run the program:
 ```sh
diff --git a/doc/nvnmd/figure_1.png b/doc/nvnmd/figure_1.png
new file mode 100644
index 0000000000..eeef710a63
Binary files /dev/null and b/doc/nvnmd/figure_1.png differ
diff --git a/doc/nvnmd/figure_2.png b/doc/nvnmd/figure_2.png
new file mode 100644
index 0000000000..fdeec62e80
Binary files /dev/null and b/doc/nvnmd/figure_2.png differ
diff --git a/doc/nvnmd/figure_3.png b/doc/nvnmd/figure_3.png
new file mode 100644
index 0000000000..4cc8d9368d
Binary files /dev/null and b/doc/nvnmd/figure_3.png differ
diff --git a/doc/nvnmd/figure_4.png b/doc/nvnmd/figure_4.png
new file mode 100644
index 0000000000..be6ba8034b
Binary files /dev/null and b/doc/nvnmd/figure_4.png differ
diff --git a/doc/nvnmd/figure_5.png b/doc/nvnmd/figure_5.png
new file mode 100644
index 0000000000..f07d2ab233
Binary files /dev/null and b/doc/nvnmd/figure_5.png differ
diff --git a/doc/nvnmd/figure_6.png b/doc/nvnmd/figure_6.png
new file mode 100644
index 0000000000..7db3a69d49
Binary files /dev/null and b/doc/nvnmd/figure_6.png differ
diff --git a/doc/nvnmd/figure_7.png b/doc/nvnmd/figure_7.png
new file mode 100644
index 0000000000..c5fe54d1be
Binary files /dev/null and b/doc/nvnmd/figure_7.png differ
diff --git a/doc/nvnmd/index.md b/doc/nvnmd/index.md
new file mode 100644
index 0000000000..763f794c9e
--- /dev/null
+++ b/doc/nvnmd/index.md
@@ -0,0 +1,7 @@
+# Use NVNMD
+
+NVNMD stands for non-von Neumann molecular dynamics.
+
+In this section, we will introduce how to use it.
+
+- [Use NVNMD](nvnmd.md)
diff --git a/doc/nvnmd/index.rst b/doc/nvnmd/index.rst
new file mode 100644
index 0000000000..c4470ee3fd
--- /dev/null
+++ b/doc/nvnmd/index.rst
@@ -0,0 +1,7 @@
+Use NVNMD
+=========
+
+.. toctree::
+   :maxdepth: 1
+
+   nvnmd
\ No newline at end of file
diff --git a/doc/nvnmd/nvnmd.md b/doc/nvnmd/nvnmd.md
new file mode 100644
index 0000000000..3979020f9d
--- /dev/null
+++ b/doc/nvnmd/nvnmd.md
@@ -0,0 +1,266 @@
+# Introduction
+
+NVNMD stands for non-von Neumann molecular dynamics.
+
+This is the training code we used to generate the results in our paper entitled "Accurate and Efficient Molecular Dynamics based on Machine Learning and Non Von Neumann Architecture", which has been accepted by npj Computational Materials ([DOI: 10.1038/s41524-022-00773-z](https://www.nature.com/articles/s41524-022-00773-z)).
+
+Any user can follow two consecutive steps to run molecular dynamics (MD) on the proposed NVNMD computer, which has been released online: (i) to train a machine learning (ML) model that can decently reproduce the potential energy surface (PES); and (ii) to deploy the trained ML model on the proposed NVNMD computer, then run MD there to obtain the atomistic trajectories.
+
+# Training
+
+Our training procedure consists of not only the continuous neural network (CNN) training, but also the quantized neural network (QNN) training which uses the results of CNN as inputs. It is performed on CPU or GPU by using the training codes we open-sourced online.
+
+To train a ML model that can decently reproduce the PES, training and testing data set should be prepared first. This can be done by using either the state-of-the-art active learning tools, or the outdated (i.e., less efficient) brute-force density functional theory (DFT)-based ab-initio molecular dynamics (AIMD) sampling.
+
+If you just want to simply test the training function, you can use the example in the `$deepmd_source_dir/examples/nvnmd` directory. If you want to fully experience training and running MD functions, you can download the complete example from the [website](https://github.com/LiuGroupHNU/nvnmd-example).
+
+Then, copy the data set to working directory
+
+```bash
+mkdir -p $workspace
+cd $workspace
+mkdir -p data
+cp -r $dataset data
+```
+
+where `$dataset` is the path to the data set and `$workspace` is the path to working directory. 
+
+## Input script
+
+Create and go to the training directory.
+
+
+```bash
+mkdir train
+cd train 
+```
+
+Then copy the input script `train_cnn.json` and `train_qnn.json` to the directory `train`
+
+```bash
+cp -r $deepmd_source_dir/examples/nvnmd/train/train_cnn.json train_cnn.json
+cp -r $deepmd_source_dir/examples/nvnmd/train/train_qnn.json train_qnn.json
+```
+
+The structure of the input script is as follows
+
+```json
+{
+    "nvnmd" : {},
+    "learning_rate" : {},
+    "loss" : {},
+    "training": {}
+}
+```
+
+### nvnmd
+
+The "nvnmd" section is defined as 
+
+```json
+{
+    "net_size":128,
+    "sel":[60, 60],
+    "rcut":6.0,
+    "rcut_smth":0.5
+}
+```
+
+where items are defined as:
+
+| Item      | Mean                        | Optional Value                                |
+| --------- | --------------------------- | --------------------------------------------- |
+| net_size  | the size of neural network  | 128                                           |
+| sel       | the number of neighbors     | integer list of lengths 1 to 4 are acceptable |
+| rcut      | the cutoff radius           | (0, 8.0]                                      |
+| rcut_smth | the smooth cutoff parameter | (0, 8.0]                                      |
+
+### learning_rate
+
+The "learning_rate" section is defined as 
+
+```json
+{
+    "type":"exp",
+    "start_lr": 1e-3,
+    "stop_lr": 3e-8,
+    "decay_steps": 5000
+}
+```
+
+where items are defined as:
+
+| Item        | Mean                                                         | Optional Value         |
+| ----------- | ------------------------------------------------------------ | ---------------------- |
+| type        | learning rate variant type                                   | exp                    |
+| start_lr    | the learning rate at the beginning of the training           | a positive real number |
+| stop_lr     | the desired learning rate at the end of the training         | a positive real number |
+| decay_steps | the learning rate decays every {decay_steps} training steps  | a positive integer     |
+
+### loss
+
+The "loss" section is defined as 
+
+```json
+{
+    "start_pref_e": 0.02,
+    "limit_pref_e": 2,
+    "start_pref_f": 1000,
+    "limit_pref_f": 1,
+    "start_pref_v": 0,
+    "limit_pref_v": 0
+}
+```
+
+where items are defined as:
+
+| Item         | Mean                                                       | Optional Value               |
+| ------------ | ---------------------------------------------------------- | ---------------------------- |
+| start_pref_e | the loss factor of energy at the beginning of the training | zero or positive real number |
+| limit_pref_e | the loss factor of energy at the end of the training       | zero or positive real number |
+| start_pref_f | the loss factor of force at the beginning of the training  | zero or positive real number |
+| limit_pref_f | the loss factor of force at the end of the training        | zero or positive real number |
+| start_pref_v | the loss factor of virial at the beginning of the training | zero or positive real number |
+| limit_pref_v | the loss factor of virial at the end of the training       | zero or positive real number |
+
+### training
+
+The "training" section is defined as 
+
+```json
+{
+  "seed": 1,
+    "stop_batch": 1000000,
+    "numb_test": 1,
+    "disp_file": "lcurve.out",
+    "disp_freq": 1000,
+    "save_ckpt": "model.ckpt",
+    "save_freq": 10000,
+    "training_data":{
+      "systems":["system1_path", "system2_path", "..."],
+      "set_prefix": "set",
+      "batch_size": ["batch_size_of_system1", "batch_size_of_system2", "..."]
+    }
+}
+```
+
+where items are defined as:
+
+| Item       | Mean                                                | Optional Value     |
+| ---------- | --------------------------------------------------- | ------------------ |
+| seed       | the random seed                                     | an integer         |
+| stop_batch | the total training steps                            | a positive integer |
+| numb_test  | the accuracy is tested by using {numb_test} samples | a positive integer |
+| disp_file  | the log file where the training message display     | a string           |
+| disp_freq  | display frequency                                   | a positive integer |
+| save_ckpt  | check point file                                    | a string           |
+| save_freq  | save frequency                                      | a positive integer |
+| systems    | a list of data directory which contains the dataset | string list        |
+| set_prefix | the prefix of dataset                               | a string           |
+| batch_size | a list of batch size of corresponding dataset       | an integer list    |
+
+## Training
+
+Training can be invoked by
+
+```bash
+# step1: train CNN
+dp train-nvnmd train_cnn.json -s s1
+# step2: train QNN
+dp train-nvnmd train_qnn.json -s s2
+```
+
+After the training process, you will get two folders: `nvnmd_cnn` and `nvnmd_qnn`. The `nvnmd_cnn` folder contains the model after continuous neural network (CNN) training. The `nvnmd_qnn` folder contains the model after quantized neural network (QNN) training. The binary file `nvnmd_qnn/model.pb` is the model file used to perform NVNMD on the server (<http://nvnmd.picp.vip>).
+
+
+# Testing
+
+The frozen model can be used in many ways. The most straightforward testing can be invoked by
+
+```bash
+mkdir test
+dp test -m ./nvnmd_qnn/frozen_model.pb -s path/to/system -d ./test/detail -n 99999 -l test/output.log
+```
+
+where the frozen model file to import is given via the `-m` command line flag, the path to the testing data set is given via the `-s` command line flag, the file containing details of energy, force and virial accuracy is given via the `-d` command line flag, the amount of data for testing is given via the `-n` command line flag.
+
+# Running MD
+
+After CNN and QNN training, you can upload the ML model to our online NVNMD system and run MD there.
+
+## Account application
+
+The server website of NVNMD is available at http://nvnmd.picp.vip. You can visit the URL and enter the login interface (Figure.1).
+
+![ALT](./figure_1.png "The login interface")
+<center>Figure.1 The login interface</center>
+
+To obtain an account, please send your application to the email (jie_liu@hnu.edu.cn, liujie@uw.edu). The username and password will be sent to you by email.
+
+## Adding task
+
+After successfully obtaining the account, enter the username and password in the login interface, and click "Login" to enter the homepage (Figure.2).
+
+![ALT](./figure_2.png "The homepage")
+<center>Figure.2 The homepage</center>
+
+The homepage displays the remaining calculation time and all calculation records not deleted. Click `Add a new task` to enter the interface for adding a new task (Figure.3).
+
+![ALT](./figure_3.png "The interface for adding a new task")
+<center>Figure.3 The interface for adding a new task</center>
+
+- Task name: name of the task
+- Upload mode: two modes of uploading results to online data storage, including `Manual upload` and `Automatic upload`. Results need to be uploaded manually to online data storage with `Manual upload` mode, and will be uploaded automatically with `Automatic upload` mode.
+- Input script: input file of the MD simulation.
+
+In the input script, one needs to specify the pair style as follows
+
+```lammps
+pair_style nvnmd model.pb
+pair_coeff * *
+```
+
+- Model file: the ML model named `model.pb` obtained by QNN training.
+- Data files: data files containing information required for running an MD simulation (e.g., `coord.lmp` containing initial atom coordinates).
+
+Next, you can click `Submit` to submit the task and then automatically return to the homepage (Figure.4).
+
+![ALT](./figure_4.png "The homepage with a new record")
+<center>Figure.4 The homepage with a new record</center>
+
+Then, click `Refresh` to view the latest status of all calculation tasks.
+
+## Cancelling calculation
+
+For a task whose calculation status is `Pending` or `Running`, you can click the corresponding `Cancel` on the homepage to stop the calculation (Figure.5).
+
+![ALT](./figure_5.png "The homepage with a cancelled task")
+<center>Figure.5 The homepage with a cancelled task</center>
+
+## Downloading results
+
+For a task whose calculation status is `Completed`, `Failed` or `Cancelled`, you can click the corresponding `Package` or `Separate files` in the `Download results` bar on the homepage to download results.
+
+Click `Package` to download a zipped package of all files including input files and output results (Figure.6).
+
+![ALT](./figure_6.png "The interface for downloading a zipped package")
+<center>Figure.6 The interface for downloading a zipped package</center>
+
+Click `Separate files` to download the required separate files (Figure.7).
+
+![ALT](./figure_7.png "The interface for downloading separate files")
+<center>Figure.7 The interface for downloading separate files</center>
+
+If `Manual upload` mode is selected or the file has expired, click `Upload` on the download interface to upload manually.
+
+## Deleting record
+
+For the task no longer needed, you can click the corresponding `Delete` on the homepage to delete the record.
+
+Records cannot be retrieved after deletion.
+
+## Clearing records
+
+Click `Clear calculation records` on the homepage to clear all records.
+
+Records cannot be retrieved after clearing.
diff --git a/doc/train/train-input.rst b/doc/train/train-input.rst
index 44d511cc21..e2aa45a522 100644
--- a/doc/train/train-input.rst
+++ b/doc/train/train-input.rst
@@ -18,3 +18,7 @@ Training Parameters
 .. dargs::
    :module: deepmd.utils.argcheck
    :func: training_args
+
+.. dargs::
+   :module: deepmd.utils.argcheck
+   :func: nvnmd_args
diff --git a/examples/nvnmd/data/set.000/box.npy b/examples/nvnmd/data/set.000/box.npy
new file mode 100644
index 0000000000..8235fc7662
Binary files /dev/null and b/examples/nvnmd/data/set.000/box.npy differ
diff --git a/examples/nvnmd/data/set.000/coord.npy b/examples/nvnmd/data/set.000/coord.npy
new file mode 100644
index 0000000000..5b5d94ca99
Binary files /dev/null and b/examples/nvnmd/data/set.000/coord.npy differ
diff --git a/examples/nvnmd/data/set.000/energy.npy b/examples/nvnmd/data/set.000/energy.npy
new file mode 100644
index 0000000000..3afe841c04
Binary files /dev/null and b/examples/nvnmd/data/set.000/energy.npy differ
diff --git a/examples/nvnmd/data/set.000/force.npy b/examples/nvnmd/data/set.000/force.npy
new file mode 100644
index 0000000000..9e24465514
Binary files /dev/null and b/examples/nvnmd/data/set.000/force.npy differ
diff --git a/examples/nvnmd/data/type.raw b/examples/nvnmd/data/type.raw
new file mode 100644
index 0000000000..4cd0cf2db3
--- /dev/null
+++ b/examples/nvnmd/data/type.raw
@@ -0,0 +1,216 @@
+0
+0
+0
+0
+1
+1
+1
+1
+0
+0
+0
+0
+1
+1
+1
+1
+0
+0
+0
+0
+1
+1
+1
+1
+0
+0
+0
+0
+1
+1
+1
+1
+0
+0
+0
+0
+1
+1
+1
+1
+0
+0
+0
+0
+1
+1
+1
+1
+0
+0
+0
+0
+1
+1
+1
+1
+0
+0
+0
+0
+1
+1
+1
+1
+0
+0
+0
+0
+1
+1
+1
+1
+0
+0
+0
+0
+1
+1
+1
+1
+0
+0
+0
+0
+1
+1
+1
+1
+0
+0
+0
+0
+1
+1
+1
+1
+0
+0
+0
+0
+1
+1
+1
+1
+0
+0
+0
+0
+1
+1
+1
+1
+0
+0
+0
+0
+1
+1
+1
+1
+0
+0
+0
+0
+1
+1
+1
+1
+0
+0
+0
+0
+1
+1
+1
+1
+0
+0
+0
+0
+1
+1
+1
+1
+0
+0
+0
+0
+1
+1
+1
+1
+0
+0
+0
+0
+1
+1
+1
+1
+0
+0
+0
+0
+1
+1
+1
+1
+0
+0
+0
+0
+1
+1
+1
+1
+0
+0
+0
+0
+1
+1
+1
+1
+0
+0
+0
+0
+1
+1
+1
+1
+0
+0
+0
+0
+1
+1
+1
+1
+0
+0
+0
+0
+1
+1
+1
+1
+0
+0
+0
+0
+1
+1
+1
+1
diff --git a/examples/nvnmd/train/train_cnn.json b/examples/nvnmd/train/train_cnn.json
new file mode 100644
index 0000000000..a961856247
--- /dev/null
+++ b/examples/nvnmd/train/train_cnn.json
@@ -0,0 +1,40 @@
+{
+    "nvnmd":{
+        "net_size": 128,
+        "sel": [60, 60],
+        "rcut": 6.0,
+        "rcut_smth": 0.5
+    },
+    "learning_rate": {
+        "type": "exp",
+        "start_lr": 1e-3,
+        "stop_lr": 3e-8,
+        "decay_steps": 5000
+    },
+    "loss": {
+        "start_pref_e": 0.02,
+        "limit_pref_e": 1,
+        "start_pref_f": 1000,
+        "limit_pref_f": 1,
+        "start_pref_v": 0,
+        "limit_pref_v": 0
+    },
+    "training": {
+        "seed": 1,
+        "stop_batch": 1000000,
+        "numb_test": 1,
+        "disp_file": "lcurve.out",
+        "disp_freq": 1000,
+        "save_ckpt": "model.ckpt",
+        "save_freq": 10000,
+        "training_data": {
+            "systems": [
+                "../data"
+            ],
+            "set_prefix": "set",
+            "batch_size": [
+                1
+            ]
+        }
+    }
+}
diff --git a/examples/nvnmd/train/train_qnn.json b/examples/nvnmd/train/train_qnn.json
new file mode 100644
index 0000000000..1a63f5c2c6
--- /dev/null
+++ b/examples/nvnmd/train/train_qnn.json
@@ -0,0 +1,40 @@
+{
+    "nvnmd": {
+        "net_size": 128,
+        "sel": [60, 60],
+        "rcut": 6.0,
+        "rcut_smth": 0.5
+    },
+    "learning_rate": {
+        "type": "exp",
+        "start_lr": 1e-8,
+        "stop_lr": 1e-9,
+        "decay_steps": 5000
+    },
+    "loss": {
+        "start_pref_e": 1,
+        "limit_pref_e": 1,
+        "start_pref_f": 1,
+        "limit_pref_f": 1,
+        "start_pref_v": 0,
+        "limit_pref_v": 0
+    },
+    "training": {
+        "seed": 1,
+        "stop_batch": 100000,
+        "numb_test": 1,
+        "disp_file": "lcurve.out",
+        "disp_freq": 1000,
+        "save_ckpt": "model.ckpt",
+        "save_freq": 10000,
+        "training_data": {
+            "systems": [
+                "../data"
+            ],
+            "set_prefix": "set",
+            "batch_size": [
+                1
+            ]
+        }
+    }
+}
\ No newline at end of file
diff --git a/setup.py b/setup.py
index 9f8b4a69b9..01af95ee22 100644
--- a/setup.py
+++ b/setup.py
@@ -117,6 +117,12 @@
         "deepmd/op",
         "deepmd/model",
         "deepmd/train",
+        "deepmd/nvnmd",
+        "deepmd/nvnmd/data",
+        "deepmd/nvnmd/descriptor",
+        "deepmd/nvnmd/entrypoints",
+        "deepmd/nvnmd/fit",
+        "deepmd/nvnmd/utils",
     ],
     python_requires=">=3.6",
     classifiers=[
diff --git a/source/api_cc/CMakeLists.txt b/source/api_cc/CMakeLists.txt
index 830ecd6b78..e244c294ec 100644
--- a/source/api_cc/CMakeLists.txt
+++ b/source/api_cc/CMakeLists.txt
@@ -17,14 +17,18 @@ add_library(${libname} SHARED ${LIB_SRC})
 
 # link: libdeepmd libdeepmd_op libtensorflow_cc libtensorflow_framework
 target_link_libraries (${libname} PUBLIC ${LIB_DEEPMD}	${TensorFlow_LIBRARY} ${TensorFlowFramework_LIBRARY})
-target_include_directories(${libname} PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include ${CMAKE_CURRENT_BINARY_DIR} ${TensorFlow_INCLUDE_DIRS})
+target_include_directories(${libname} PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include ${CMAKE_CURRENT_BINARY_DIR})
+target_include_directories(${libname} PRIVATE ${TensorFlow_INCLUDE_DIRS})
 
 set_target_properties(
   ${libname} 
   PROPERTIES 
-  COMPILE_DEFINITIONS ${prec_def}
   INSTALL_RPATH "$ORIGIN;${TensorFlow_LIBRARY_PATH}"
 )
+target_compile_definitions(${libname}
+  PUBLIC ${prec_def}
+  PRIVATE TF_PRIVATE
+)
 
 install(TARGETS ${libname} DESTINATION lib/)
 
diff --git a/source/api_cc/include/DataModifier.h b/source/api_cc/include/DataModifier.h
index 114abfd889..98634281cd 100644
--- a/source/api_cc/include/DataModifier.h
+++ b/source/api_cc/include/DataModifier.h
@@ -32,7 +32,7 @@ class DipoleChargeModifier
   tensorflow::Session* session;
   std::string name_scope, name_prefix;
   int num_intra_nthreads, num_inter_nthreads;
-  tensorflow::GraphDef graph_def;
+  tensorflow::GraphDef* graph_def;
   bool inited;
   VALUETYPE rcut;
   VALUETYPE cell_size;
diff --git a/source/api_cc/include/DeepPot.h b/source/api_cc/include/DeepPot.h
index 2b76cc4e0b..48d2a3cd7f 100644
--- a/source/api_cc/include/DeepPot.h
+++ b/source/api_cc/include/DeepPot.h
@@ -179,7 +179,7 @@ class DeepPot
 private:
   tensorflow::Session* session;
   int num_intra_nthreads, num_inter_nthreads;
-  tensorflow::GraphDef graph_def;
+  tensorflow::GraphDef* graph_def;
   bool inited;
   template<class VT> VT get_scalar(const std::string & name) const;
   // VALUETYPE get_rcut () const;
@@ -401,7 +401,7 @@ class DeepPotModelDevi
   unsigned numb_models;
   std::vector<tensorflow::Session*> sessions;
   int num_intra_nthreads, num_inter_nthreads;
-  std::vector<tensorflow::GraphDef> graph_defs;
+  std::vector<tensorflow::GraphDef*> graph_defs;
   bool inited;
   template<class VT> VT get_scalar(const std::string name) const;
   // VALUETYPE get_rcut () const;
@@ -426,7 +426,6 @@ class DeepPotModelDevi
 
   // function used for nborlist copy
   std::vector<std::vector<int> > get_sel() const;
-  void cum_sum(const std::vector<std::vector<tensorflow::int32> > n_sel);
 };
 }
 
diff --git a/source/api_cc/include/DeepTensor.h b/source/api_cc/include/DeepTensor.h
index 2f696d5289..0bdec4ce56 100644
--- a/source/api_cc/include/DeepTensor.h
+++ b/source/api_cc/include/DeepTensor.h
@@ -160,7 +160,7 @@ class DeepTensor
   tensorflow::Session* session;
   std::string name_scope;
   int num_intra_nthreads, num_inter_nthreads;
-  tensorflow::GraphDef graph_def;
+  tensorflow::GraphDef* graph_def;
   bool inited;
   VALUETYPE rcut;
   VALUETYPE cell_size;
diff --git a/source/api_cc/include/common.h b/source/api_cc/include/common.h
index 9d8faa83c4..d82ce50e84 100644
--- a/source/api_cc/include/common.h
+++ b/source/api_cc/include/common.h
@@ -8,21 +8,15 @@
 #include "AtomMap.h"
 #include "errors.h"
 
-#include "tensorflow/core/platform/env.h"
-#include "tensorflow/core/public/session.h"
-#include "tensorflow/core/public/version.h"
-#include <tensorflow/core/graph/default_device.h>
-#include <tensorflow/core/graph/graph_def_builder.h>
+#ifdef TF_PRIVATE
+#include "tf_private.h"
+#else
+#include "tf_public.h"
+#endif
 
 
 namespace deepmd{
 
-#if TF_MAJOR_VERSION >= 2 && TF_MINOR_VERSION >= 2
-typedef tensorflow::tstring STRINGTYPE;
-#else
-typedef std::string STRINGTYPE;
-#endif
-
 #ifdef HIGH_PREC
 typedef double VALUETYPE;
 typedef double ENERGYTYPE;
@@ -188,5 +182,22 @@ session_input_tensors (std::vector<std::pair<std::string, tensorflow::Tensor>> &
 		       const int			nghost,
 		       const int			ago,
 		       const std::string		scope = "");
+
+/**
+* @brief Read the content of a model file into a string.
+* @param[in] model Path to the model.
+* @param[out] file_content Content of the model file.
+**/
+void
+read_file_to_string(std::string model, std::string & file_content);
+
+
+/**
+* @brief Convert a text-format graph file (pbtxt) to a binary one (pb).
+* @param[in] fn_pb_txt Filename of the pbtxt file to read.
+* @param[in] fn_pb Filename of the pb file to write.
+**/
+void
+convert_pbtxt_to_pb(std::string fn_pb_txt, std::string fn_pb);
 }
 
diff --git a/source/api_cc/include/tf_private.h b/source/api_cc/include/tf_private.h
new file mode 100644
index 0000000000..12b7077139
--- /dev/null
+++ b/source/api_cc/include/tf_private.h
@@ -0,0 +1,19 @@
+/**
+ * @file tf_private.h
+ * @brief This file includes the full TensorFlow headers; common.h selects it when TF_PRIVATE is defined.
+ * 
+ */
+
+#include "tensorflow/core/platform/env.h"
+#include "tensorflow/core/public/session.h"
+#include "tensorflow/core/public/version.h"
+#include <tensorflow/core/graph/default_device.h>
+#include <tensorflow/core/graph/graph_def_builder.h>
+
+namespace deepmd {
+#if TF_MAJOR_VERSION >= 2 && TF_MINOR_VERSION >= 2
+typedef tensorflow::tstring STRINGTYPE;
+#else
+typedef std::string STRINGTYPE;
+#endif
+} // namespace deepmd
diff --git a/source/api_cc/include/tf_public.h b/source/api_cc/include/tf_public.h
new file mode 100644
index 0000000000..e766168c98
--- /dev/null
+++ b/source/api_cc/include/tf_public.h
@@ -0,0 +1,15 @@
+/**
+ * @file tf_public.h
+ * @brief This file forward-declares the TensorFlow classes (as incomplete types) used in public headers.
+ * 
+ */
+
+// skip the forward declarations if the TensorFlow headers have already been included
+#ifndef TF_MAJOR_VERSION
+namespace tensorflow{
+    class Session;
+    class Tensor;
+    class GraphDef;
+    class Status;
+}
+#endif
diff --git a/source/api_cc/src/DataModifier.cc b/source/api_cc/src/DataModifier.cc
index 2fbd58584b..fef74596b5 100644
--- a/source/api_cc/src/DataModifier.cc
+++ b/source/api_cc/src/DataModifier.cc
@@ -5,7 +5,8 @@ using namespace tensorflow;
 
 DipoleChargeModifier::
 DipoleChargeModifier()
-    : inited (false)
+    : inited (false),
+      graph_def(new GraphDef())
 {
 }
 
@@ -13,7 +14,8 @@ DipoleChargeModifier::
 DipoleChargeModifier(const std::string & model, 
 	     const int & gpu_rank, 
 	     const std::string &name_scope_)
-    : inited (false), name_scope(name_scope_)
+    : inited (false), name_scope(name_scope_),
+      graph_def(new GraphDef())
 {
   init(model, gpu_rank);  
 }
@@ -35,8 +37,8 @@ init (const std::string & model,
   options.config.set_intra_op_parallelism_threads(num_intra_nthreads);
   deepmd::load_op_library();
   deepmd::check_status(NewSession(options, &session));
-  deepmd::check_status(ReadBinaryProto(Env::Default(), model, &graph_def));
-  deepmd::check_status(session->Create(graph_def));  
+  deepmd::check_status(ReadBinaryProto(Env::Default(), model, graph_def));
+  deepmd::check_status(session->Create(*graph_def));  
   // int nnodes = graph_def.node_size();
   // for (int ii = 0; ii < nnodes; ++ii){
   //   cout << ii << " \t " << graph_def.node(ii).name() << endl;
diff --git a/source/api_cc/src/DeepPot.cc b/source/api_cc/src/DeepPot.cc
index 1b99e78920..f6b16f1064 100644
--- a/source/api_cc/src/DeepPot.cc
+++ b/source/api_cc/src/DeepPot.cc
@@ -162,13 +162,15 @@ static void run_model (ENERGYTYPE   &		dener,
 
 DeepPot::
 DeepPot ()
-    : inited (false), init_nbor (false)
+    : inited (false), init_nbor (false),
+      graph_def(new GraphDef())
 {
 }
 
 DeepPot::
 DeepPot (const std::string & model, const int & gpu_rank, const std::string & file_content)
-    : inited (false), init_nbor (false)
+    : inited (false), init_nbor (false),
+      graph_def(new GraphDef())
 {
   init(model, gpu_rank, file_content);  
 }
@@ -190,9 +192,9 @@ init (const std::string & model, const int & gpu_rank, const std::string & file_
   deepmd::load_op_library();
 
   if(file_content.size() == 0)
-    check_status (ReadBinaryProto(Env::Default(), model, &graph_def));
+    check_status (ReadBinaryProto(Env::Default(), model, graph_def));
   else
-    graph_def.ParseFromString(file_content);
+    (*graph_def).ParseFromString(file_content);
   int gpu_num = -1;
   #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
   DPGetDeviceCount(gpu_num); // check current device environment
@@ -203,11 +205,11 @@ init (const std::string & model, const int & gpu_rank, const std::string & file_
     DPErrcheck(DPSetDevice(gpu_rank % gpu_num));
     std::string str = "/gpu:";
     str += std::to_string(gpu_rank % gpu_num);
-    graph::SetDefaultDevice(str, &graph_def);
+    graph::SetDefaultDevice(str, graph_def);
   }
   #endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
   check_status (NewSession(options, &session));
-  check_status (session->Create(graph_def));
+  check_status (session->Create(*graph_def));
   rcut = get_scalar<VALUETYPE>("descrpt_attr/rcut");
   cell_size = rcut;
   ntypes = get_scalar<int>("descrpt_attr/ntypes");
@@ -282,7 +284,7 @@ std::string graph_info(const GraphDef & graph_def) {
 // init the tmp array data
 std::vector<int> DeepPot::get_sel_a () const {
     std::vector<int> sel_a;
-    std::istringstream is(graph_info(graph_def));
+    std::istringstream is(graph_info(*graph_def));
     std::string line = "";
     while(is >> line) {
         if (line.find("sel_a") != line.npos) {
@@ -548,10 +550,11 @@ init (const std::vector<std::string> & models, const int & gpu_rank, const std::
   options.config.set_inter_op_parallelism_threads(num_inter_nthreads);
   options.config.set_intra_op_parallelism_threads(num_intra_nthreads);
   for (unsigned ii = 0; ii < numb_models; ++ii){
+    graph_defs[ii] = new GraphDef();
     if (file_contents.size() == 0)
-      check_status (ReadBinaryProto(Env::Default(), models[ii], &graph_defs[ii]));
+      check_status (ReadBinaryProto(Env::Default(), models[ii], graph_defs[ii]));
     else
-      graph_defs[ii].ParseFromString(file_contents[ii]);
+      (*graph_defs[ii]).ParseFromString(file_contents[ii]);
   }
   #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
   if (gpu_num > 0) {
@@ -566,10 +569,10 @@ init (const std::vector<std::string> & models, const int & gpu_rank, const std::
     if (gpu_num > 0) {
       std::string str = "/gpu:";
       str += std::to_string(gpu_rank % gpu_num);
-      graph::SetDefaultDevice(str, &graph_defs[ii]);
+      graph::SetDefaultDevice(str, &(*graph_defs[ii]));
     }
     check_status (NewSession(options, &(sessions[ii])));
-    check_status (sessions[ii]->Create(graph_defs[ii]));
+    check_status (sessions[ii]->Create(*graph_defs[ii]));
   }
   rcut = get_scalar<VALUETYPE>("descrpt_attr/rcut");
   cell_size = rcut;
@@ -620,7 +623,7 @@ get_sel () const
     std::vector<std::vector<int> > sec;
     for (int ii = 0; ii < numb_models; ii++) {
         std::vector<int> sel;
-        std::istringstream is(graph_info(graph_defs[ii]));
+        std::istringstream is(graph_info(*graph_defs[ii]));
         std::string line = "";
         while(is >> line) {
             if (line.find("sel") != line.npos) {
@@ -643,20 +646,6 @@ get_sel () const
     return sec;
 }
 
-void  
-DeepPotModelDevi::
-cum_sum (const std::vector<std::vector<int32> > n_sel) 
-{
-    for (int ii = 0; ii < numb_models; ++ii) {
-        std::vector<int> _sec;
-        _sec.resize (n_sel[ii].size() + 1);
-        _sec[0] = 0;
-        for (int jj = 1; jj < _sec.size(); ++jj) {
-            _sec[jj] = _sec[jj-1] + n_sel[ii][jj-1];
-        }
-        sec.push_back(_sec);
-    }
-}
 
 void
 DeepPotModelDevi::
diff --git a/source/api_cc/src/DeepTensor.cc b/source/api_cc/src/DeepTensor.cc
index 316b6ec3a9..6a67b70ea0 100644
--- a/source/api_cc/src/DeepTensor.cc
+++ b/source/api_cc/src/DeepTensor.cc
@@ -5,7 +5,8 @@ using namespace tensorflow;
 
 DeepTensor::
 DeepTensor()
-    : inited (false)
+    : inited (false),
+      graph_def(new GraphDef())
 {
 }
 
@@ -13,7 +14,8 @@ DeepTensor::
 DeepTensor(const std::string & model, 
 	   const int & gpu_rank, 
 	   const std::string &name_scope_)
-    : inited (false), name_scope(name_scope_)
+    : inited (false), name_scope(name_scope_),
+      graph_def(new GraphDef())
 {
   init(model, gpu_rank);  
 }
@@ -35,8 +37,8 @@ init (const std::string & model,
   options.config.set_intra_op_parallelism_threads(num_intra_nthreads);
   deepmd::load_op_library();
   deepmd::check_status (NewSession(options, &session));
-  deepmd::check_status (ReadBinaryProto(Env::Default(), model, &graph_def));
-  deepmd::check_status (session->Create(graph_def));  
+  deepmd::check_status (ReadBinaryProto(Env::Default(), model, graph_def));
+  deepmd::check_status (session->Create(*graph_def));  
   rcut = get_scalar<VALUETYPE>("descrpt_attr/rcut");
   cell_size = rcut;
   ntypes = get_scalar<int>("descrpt_attr/ntypes");
diff --git a/source/api_cc/src/common.cc b/source/api_cc/src/common.cc
index a83f364e11..01c2dd5f8d 100644
--- a/source/api_cc/src/common.cc
+++ b/source/api_cc/src/common.cc
@@ -2,6 +2,9 @@
 #include "AtomMap.h"
 #include "device.h"
 #include <dlfcn.h>
+#include <fcntl.h>
+#include "google/protobuf/text_format.h"
+#include "google/protobuf/io/zero_copy_stream_impl.h"
 
 using namespace tensorflow;
 
@@ -855,3 +858,44 @@ select_map_inv<deepmd::STRINGTYPE >(
     const typename std::vector<deepmd::STRINGTYPE >::const_iterator in, 
     const std::vector<int > & idx_map, 
     const int & stride);
+
+
+// Read the whole file `model` into `file_content`; the read status is passed to check_status.
+void
+deepmd::
+read_file_to_string(std::string model, std::string & file_content)
+{
+  deepmd::check_status(tensorflow::ReadFileToString(tensorflow::Env::Default(), model, &file_content));
+}
+
+
+// Convert the text-format GraphDef stored in `fn_pb_txt` into a binary
+// GraphDef written to `fn_pb`.
+// Throws deepmd::deepmd_exception if the input cannot be opened or parsed,
+// or if the output cannot be written.
+void
+deepmd::
+convert_pbtxt_to_pb(std::string fn_pb_txt, std::string fn_pb)
+{
+    const int fd = open(fn_pb_txt.c_str(), O_RDONLY);
+    if (fd < 0) {
+        // fail early instead of handing an invalid descriptor to the parser
+        throw deepmd::deepmd_exception("cannot open file " + fn_pb_txt);
+    }
+    tensorflow::GraphDef graph_def;
+    bool parsed = false;
+    {
+        tensorflow::protobuf::io::FileInputStream input(fd);
+        // FileInputStream does not close the descriptor unless asked to;
+        // let it close fd when it goes out of scope so the descriptor is not leaked.
+        input.SetCloseOnDelete(true);
+        parsed = tensorflow::protobuf::TextFormat::Parse(&input, &graph_def);
+    }
+    if (!parsed) {
+        throw deepmd::deepmd_exception("cannot parse " + fn_pb_txt + " as a text-format GraphDef");
+    }
+    std::fstream output(fn_pb, std::ios::out | std::ios::trunc | std::ios::binary);
+    if (!graph_def.SerializeToOstream(&output)) {
+        throw deepmd::deepmd_exception("cannot write file " + fn_pb);
+    }
+}
diff --git a/source/api_cc/tests/CMakeLists.txt b/source/api_cc/tests/CMakeLists.txt
index 5609f0a8f2..b4ac448f9c 100644
--- a/source/api_cc/tests/CMakeLists.txt
+++ b/source/api_cc/tests/CMakeLists.txt
@@ -26,6 +26,10 @@ include_directories(${API_BASE_DIR}/include)
 include_directories(${CMAKE_SOURCE_DIR})
 file(GLOB API_SRC ${API_BASE_DIR}/src/*.cc ${API_BASE_DIR}/src/*.cpp)
 add_library(${apiname} SHARED ${API_SRC})
+target_compile_definitions(${apiname}
+  PRIVATE TF_PRIVATE
+)
+target_include_directories(${apiname} PRIVATE ${TensorFlow_INCLUDE_DIRS})
 configure_file(
   ${API_BASE_DIR}/include/version.h.in
   ${CMAKE_SOURCE_DIR}/version.h
@@ -44,8 +48,8 @@ if (TENSORFLOW_VERSION GREATER_EQUAL 2.7)
 else()
   set (CMAKE_CXX_STANDARD 11)
 endif()
-include_directories(${TensorFlow_INCLUDE_DIRS})
 add_library(${opname} SHARED ${OP_SRC})
+target_include_directories(${opname} PRIVATE ${TensorFlow_INCLUDE_DIRS})
 
 find_package(Threads)
 # find openmp
@@ -105,12 +109,6 @@ else()
   target_link_libraries(runUnitTests gtest gtest_main ${libname} ${apiname} pthread ${TensorFlow_LIBRARY} rt coverage_config)
 endif()
 
-find_package(Protobuf)
-if(Protobuf_FOUND)
-  include_directories(${Protobuf_INCLUDE_DIRS})
-  target_link_libraries(runUnitTests ${Protobuf_LIBRARIES})
-endif()
-
 add_test( runUnitTests runUnitTests )
 
 find_package(GTest)
diff --git a/source/api_cc/tests/test_deepdipole.cc b/source/api_cc/tests/test_deepdipole.cc
index 94f21ec675..23382f49df 100644
--- a/source/api_cc/tests/test_deepdipole.cc
+++ b/source/api_cc/tests/test_deepdipole.cc
@@ -7,8 +7,6 @@
 #include "neighbor_list.h"
 #include "test_utils.h"
 
-#include "google/protobuf/text_format.h"
-#include "google/protobuf/io/zero_copy_stream_impl.h"
 #include <sys/types.h>
 #include <sys/stat.h>
 #include <fcntl.h>  
@@ -38,17 +36,7 @@ class TestInferDeepDipole : public ::testing::Test
   deepmd::DeepTensor dp;
 
   void SetUp() override {
-    std::string file_name = "../../tests/infer/deepdipole.pbtxt";
-    int fd = open(file_name.c_str(), O_RDONLY);
-    tensorflow::protobuf::io::ZeroCopyInputStream* input = new tensorflow::protobuf::io::FileInputStream(fd);
-    tensorflow::GraphDef graph_def;
-    tensorflow::protobuf::TextFormat::Parse(input, &graph_def);
-    delete input;
-    std::fstream output("deepdipole.pb", std::ios::out | std::ios::trunc | std::ios::binary);
-    graph_def.SerializeToOstream(&output);
-    // check the string by the following commands
-    // string txt;
-    // tensorflow::protobuf::TextFormat::PrintToString(graph_def, &txt);
+    deepmd::convert_pbtxt_to_pb("../../tests/infer/deepdipole.pbtxt", "deepdipole.pb");
 
     dp.init("deepdipole.pb");
   };
@@ -139,16 +127,7 @@ class TestInferDeepDipoleNew : public ::testing::Test
 
   void SetUp() override {
     std::string file_name = "../../tests/infer/deepdipole_new.pbtxt";
-    int fd = open(file_name.c_str(), O_RDONLY);
-    tensorflow::protobuf::io::ZeroCopyInputStream* input = new tensorflow::protobuf::io::FileInputStream(fd);
-    tensorflow::GraphDef graph_def;
-    tensorflow::protobuf::TextFormat::Parse(input, &graph_def);
-    delete input;
-    std::fstream output("deepdipole_new.pb", std::ios::out | std::ios::trunc | std::ios::binary);
-    graph_def.SerializeToOstream(&output);
-    // check the string by the following commands
-    // string txt;
-    // tensorflow::protobuf::TextFormat::PrintToString(graph_def, &txt);
+    deepmd::convert_pbtxt_to_pb("../../tests/infer/deepdipole_new.pbtxt", "deepdipole_new.pb");
     dp.init("deepdipole_new.pb");
     odim = dp.output_dim ();
 
@@ -339,17 +318,7 @@ class TestInferDeepDipoleFake : public ::testing::Test
   deepmd::DeepTensor dp;
 
   void SetUp() override {
-    std::string file_name = "../../tests/infer/deepdipole_fake.pbtxt";
-    int fd = open(file_name.c_str(), O_RDONLY);
-    tensorflow::protobuf::io::ZeroCopyInputStream* input = new tensorflow::protobuf::io::FileInputStream(fd);
-    tensorflow::GraphDef graph_def;
-    tensorflow::protobuf::TextFormat::Parse(input, &graph_def);
-    delete input;
-    std::fstream output("deepdipole_fake.pb", std::ios::out | std::ios::trunc | std::ios::binary);
-    graph_def.SerializeToOstream(&output);
-    // check the string by the following commands
-    // string txt;
-    // tensorflow::protobuf::TextFormat::PrintToString(graph_def, &txt);
+    deepmd::convert_pbtxt_to_pb("../../tests/infer/deepdipole_fake.pbtxt", "deepdipole_fake.pb");
 
     dp.init("deepdipole_fake.pb");
   };
diff --git a/source/api_cc/tests/test_deeppolar.cc b/source/api_cc/tests/test_deeppolar.cc
index 3dfc34c1d0..ad4ccdca5b 100644
--- a/source/api_cc/tests/test_deeppolar.cc
+++ b/source/api_cc/tests/test_deeppolar.cc
@@ -7,8 +7,6 @@
 #include "neighbor_list.h"
 #include "test_utils.h"
 
-#include "google/protobuf/text_format.h"
-#include "google/protobuf/io/zero_copy_stream_impl.h"
 #include <sys/types.h>
 #include <sys/stat.h>
 #include <fcntl.h>  
@@ -39,16 +37,7 @@ class TestInferDeepPolar : public ::testing::Test
 
   void SetUp() override {
     std::string file_name = "../../tests/infer/deeppolar.pbtxt";
-    int fd = open(file_name.c_str(), O_RDONLY);
-    tensorflow::protobuf::io::ZeroCopyInputStream* input = new tensorflow::protobuf::io::FileInputStream(fd);
-    tensorflow::GraphDef graph_def;
-    tensorflow::protobuf::TextFormat::Parse(input, &graph_def);
-    delete input;
-    std::fstream output("deeppolar.pb", std::ios::out | std::ios::trunc | std::ios::binary);
-    graph_def.SerializeToOstream(&output);
-    // check the string by the following commands
-    // string txt;
-    // tensorflow::protobuf::TextFormat::PrintToString(graph_def, &txt);
+    deepmd::convert_pbtxt_to_pb("../../tests/infer/deeppolar.pbtxt", "deeppolar.pb");
 
     dp.init("deeppolar.pb");
 
@@ -141,16 +130,7 @@ class TestInferDeepPolarNew : public ::testing::Test
 
   void SetUp() override {
     std::string file_name = "../../tests/infer/deeppolar_new.pbtxt";
-    int fd = open(file_name.c_str(), O_RDONLY);
-    tensorflow::protobuf::io::ZeroCopyInputStream* input = new tensorflow::protobuf::io::FileInputStream(fd);
-    tensorflow::GraphDef graph_def;
-    tensorflow::protobuf::TextFormat::Parse(input, &graph_def);
-    delete input;
-    std::fstream output("deeppolar_new.pb", std::ios::out | std::ios::trunc | std::ios::binary);
-    graph_def.SerializeToOstream(&output);
-    // check the string by the following commands
-    // string txt;
-    // tensorflow::protobuf::TextFormat::PrintToString(graph_def, &txt);
+    deepmd::convert_pbtxt_to_pb("../../tests/infer/deeppolar_new.pbtxt", "deeppolar_new.pb");
     dp.init("deeppolar_new.pb");
     odim = dp.output_dim ();
 
diff --git a/source/api_cc/tests/test_deeppot_a.cc b/source/api_cc/tests/test_deeppot_a.cc
index 61336c4e33..d8082972f8 100644
--- a/source/api_cc/tests/test_deeppot_a.cc
+++ b/source/api_cc/tests/test_deeppot_a.cc
@@ -7,8 +7,6 @@
 #include "neighbor_list.h"
 #include "test_utils.h"
 
-#include "google/protobuf/text_format.h"
-#include "google/protobuf/io/zero_copy_stream_impl.h"
 #include <sys/types.h>
 #include <sys/stat.h>
 #include <fcntl.h>  
@@ -47,16 +45,7 @@ class TestInferDeepPotA : public ::testing::Test
 
   void SetUp() override {
     std::string file_name = "../../tests/infer/deeppot.pbtxt";
-    int fd = open(file_name.c_str(), O_RDONLY);
-    tensorflow::protobuf::io::ZeroCopyInputStream* input = new tensorflow::protobuf::io::FileInputStream(fd);
-    tensorflow::GraphDef graph_def;
-    tensorflow::protobuf::TextFormat::Parse(input, &graph_def);
-    delete input;
-    std::fstream output("deeppot.pb", std::ios::out | std::ios::trunc | std::ios::binary);
-    graph_def.SerializeToOstream(&output);
-    // check the string by the following commands
-    // string txt;
-    // tensorflow::protobuf::TextFormat::PrintToString(graph_def, &txt);
+    deepmd::convert_pbtxt_to_pb("../../tests/infer/deeppot.pbtxt", "deeppot.pb");
 
     dp.init("deeppot.pb");
 
@@ -430,13 +419,7 @@ class TestInferDeepPotANoPbc : public ::testing::Test
 
   void SetUp() override {
     std::string file_name = "../../tests/infer/deeppot.pbtxt";
-    int fd = open(file_name.c_str(), O_RDONLY);
-    tensorflow::protobuf::io::ZeroCopyInputStream* input = new tensorflow::protobuf::io::FileInputStream(fd);
-    tensorflow::GraphDef graph_def;
-    tensorflow::protobuf::TextFormat::Parse(input, &graph_def);
-    delete input;
-    std::fstream output("deeppot.pb", std::ios::out | std::ios::trunc | std::ios::binary);
-    graph_def.SerializeToOstream(&output);
+    deepmd::convert_pbtxt_to_pb(file_name, "deeppot.pb");
 
     dp.init("deeppot.pb");
 
diff --git a/source/api_cc/tests/test_deeppot_model_devi.cc b/source/api_cc/tests/test_deeppot_model_devi.cc
index 3b10fbf5c2..2b532f098e 100644
--- a/source/api_cc/tests/test_deeppot_model_devi.cc
+++ b/source/api_cc/tests/test_deeppot_model_devi.cc
@@ -7,8 +7,6 @@
 #include "neighbor_list.h"
 #include "test_utils.h"
 
-#include "google/protobuf/text_format.h"
-#include "google/protobuf/io/zero_copy_stream_impl.h"
 #include <sys/types.h>
 #include <sys/stat.h>
 #include <fcntl.h>  
@@ -39,24 +37,12 @@ class TestInferDeepPotModeDevi : public ::testing::Test
   void SetUp() override {
     {
       std::string file_name = "../../tests/infer/deeppot.pbtxt";
-      int fd = open(file_name.c_str(), O_RDONLY);
-      tensorflow::protobuf::io::ZeroCopyInputStream* input = new tensorflow::protobuf::io::FileInputStream(fd);
-      tensorflow::GraphDef graph_def;
-      tensorflow::protobuf::TextFormat::Parse(input, &graph_def);
-      delete input;
-      std::fstream output("deeppot.pb", std::ios::out | std::ios::trunc | std::ios::binary);
-      graph_def.SerializeToOstream(&output);
+      deepmd::convert_pbtxt_to_pb("../../tests/infer/deeppot.pbtxt", "deeppot.pb");
       dp0.init("deeppot.pb");
     }
     {
       std::string file_name = "../../tests/infer/deeppot-1.pbtxt";
-      int fd = open(file_name.c_str(), O_RDONLY);
-      tensorflow::protobuf::io::ZeroCopyInputStream* input = new tensorflow::protobuf::io::FileInputStream(fd);
-      tensorflow::GraphDef graph_def;
-      tensorflow::protobuf::TextFormat::Parse(input, &graph_def);
-      delete input;
-      std::fstream output("deeppot-1.pb", std::ios::out | std::ios::trunc | std::ios::binary);
-      graph_def.SerializeToOstream(&output);
+      deepmd::convert_pbtxt_to_pb("../../tests/infer/deeppot-1.pbtxt", "deeppot-1.pb");
       dp1.init("deeppot-1.pb");
     }
     dp_md.init(std::vector<std::string>({"deeppot.pb", "deeppot-1.pb"}));
@@ -101,24 +87,12 @@ class TestInferDeepPotModeDeviPython : public ::testing::Test
   void SetUp() override {
     {
       std::string file_name = "../../tests/infer/deeppot.pbtxt";
-      int fd = open(file_name.c_str(), O_RDONLY);
-      tensorflow::protobuf::io::ZeroCopyInputStream* input = new tensorflow::protobuf::io::FileInputStream(fd);
-      tensorflow::GraphDef graph_def;
-      tensorflow::protobuf::TextFormat::Parse(input, &graph_def);
-      delete input;
-      std::fstream output("deeppot.pb", std::ios::out | std::ios::trunc | std::ios::binary);
-      graph_def.SerializeToOstream(&output);
+      deepmd::convert_pbtxt_to_pb("../../tests/infer/deeppot.pbtxt", "deeppot.pb");
       dp0.init("deeppot.pb");
     }
     {
       std::string file_name = "../../tests/infer/deeppot-1.pbtxt";
-      int fd = open(file_name.c_str(), O_RDONLY);
-      tensorflow::protobuf::io::ZeroCopyInputStream* input = new tensorflow::protobuf::io::FileInputStream(fd);
-      tensorflow::GraphDef graph_def;
-      tensorflow::protobuf::TextFormat::Parse(input, &graph_def);
-      delete input;
-      std::fstream output("deeppot-1.pb", std::ios::out | std::ios::trunc | std::ios::binary);
-      graph_def.SerializeToOstream(&output);
+      deepmd::convert_pbtxt_to_pb("../../tests/infer/deeppot-1.pbtxt", "deeppot-1.pb");
       dp1.init("deeppot-1.pb");
     }
     dp_md.init(std::vector<std::string>({"deeppot.pb", "deeppot-1.pb"}));
diff --git a/source/api_cc/tests/test_deeppot_r.cc b/source/api_cc/tests/test_deeppot_r.cc
index 2d6af9f6ae..3bd34f398c 100644
--- a/source/api_cc/tests/test_deeppot_r.cc
+++ b/source/api_cc/tests/test_deeppot_r.cc
@@ -7,8 +7,6 @@
 #include "neighbor_list.h"
 #include "test_utils.h"
 
-#include "google/protobuf/text_format.h"
-#include "google/protobuf/io/zero_copy_stream_impl.h"
 #include <sys/types.h>
 #include <sys/stat.h>
 #include <fcntl.h>  
@@ -47,16 +45,7 @@ class TestInferDeepPotR : public ::testing::Test
 
   void SetUp() override {
     std::string file_name = "../../tests/infer/deeppot-r.pbtxt";
-    int fd = open(file_name.c_str(), O_RDONLY);
-    tensorflow::protobuf::io::ZeroCopyInputStream* input = new tensorflow::protobuf::io::FileInputStream(fd);
-    tensorflow::GraphDef graph_def;
-    tensorflow::protobuf::TextFormat::Parse(input, &graph_def);
-    delete input;
-    std::fstream output("deeppot.pb", std::ios::out | std::ios::trunc | std::ios::binary);
-    graph_def.SerializeToOstream(&output);
-    // check the string by the following commands
-    // string txt;
-    // tensorflow::protobuf::TextFormat::PrintToString(graph_def, &txt);
+    deepmd::convert_pbtxt_to_pb("../../tests/infer/deeppot-r.pbtxt", "deeppot.pb");
 
     dp.init("deeppot.pb");
 
@@ -430,13 +419,7 @@ class TestInferDeepPotRNoPbc : public ::testing::Test
 
   void SetUp() override {
     std::string file_name = "../../tests/infer/deeppot-r.pbtxt";
-    int fd = open(file_name.c_str(), O_RDONLY);
-    tensorflow::protobuf::io::ZeroCopyInputStream* input = new tensorflow::protobuf::io::FileInputStream(fd);
-    tensorflow::GraphDef graph_def;
-    tensorflow::protobuf::TextFormat::Parse(input, &graph_def);
-    delete input;
-    std::fstream output("deeppot.pb", std::ios::out | std::ios::trunc | std::ios::binary);
-    graph_def.SerializeToOstream(&output);
+    deepmd::convert_pbtxt_to_pb("../../tests/infer/deeppot-r.pbtxt", "deeppot.pb");
 
     dp.init("deeppot.pb");
 
diff --git a/source/api_cc/tests/test_dipolecharge.cc b/source/api_cc/tests/test_dipolecharge.cc
index 6db7b8af0e..b75677f8f6 100644
--- a/source/api_cc/tests/test_dipolecharge.cc
+++ b/source/api_cc/tests/test_dipolecharge.cc
@@ -10,8 +10,6 @@
 #include "neighbor_list.h"
 #include "test_utils.h"
 
-#include "google/protobuf/text_format.h"
-#include "google/protobuf/io/zero_copy_stream_impl.h"
 #include <sys/types.h>
 #include <sys/stat.h>
 #include <fcntl.h>  
@@ -58,20 +56,8 @@ class TestDipoleCharge : public ::testing::Test
 
   void SetUp() override {
     std::string file_name = "../../tests/infer/dipolecharge_e.pbtxt";
-    int fd = open(file_name.c_str(), O_RDONLY);
-    tensorflow::protobuf::io::ZeroCopyInputStream* input = new tensorflow::protobuf::io::FileInputStream(fd);
-    tensorflow::GraphDef graph_def;
-    tensorflow::protobuf::TextFormat::Parse(input, &graph_def);
-    delete input;
     std::string model = "dipolecharge_e.pb";
-    std::fstream output(model.c_str(), std::ios::out | std::ios::trunc | std::ios::binary);
-    graph_def.SerializeToOstream(&output);
-    // check the string by the following commands
-    // string txt;
-    // tensorflow::protobuf::TextFormat::PrintToString(graph_def, &txt);
-
-    // dp.init("dipolecharge_d.pb");
-    // dm.init("dipolecharge_d.pb");
+    deepmd::convert_pbtxt_to_pb(file_name, model);
     dp.init(model, 0, "dipole_charge");
     dm.init(model, 0, "dipole_charge");
 
diff --git a/source/cmake/googletest.cmake.in b/source/cmake/googletest.cmake.in
index 37b622bf30..577bd17dc1 100644
--- a/source/cmake/googletest.cmake.in
+++ b/source/cmake/googletest.cmake.in
@@ -2,9 +2,15 @@ cmake_minimum_required(VERSION 2.8.2)
 
 project(googletest-download NONE)
 
+if (USE_GITEE_GTEST)
+  set(GTEST_REPO_ADDRESS "https://gitee.com/mirrors/googletest.git")
+else ()
+  set(GTEST_REPO_ADDRESS "https://github.com/google/googletest.git")
+endif()
+
 include(ExternalProject)
 ExternalProject_Add(googletest
-  GIT_REPOSITORY    https://github.com/google/googletest.git
+  GIT_REPOSITORY    ${GTEST_REPO_ADDRESS}
   GIT_TAG           main
   SOURCE_DIR        "${CMAKE_CURRENT_BINARY_DIR}/googletest-src"
   BINARY_DIR        "${CMAKE_CURRENT_BINARY_DIR}/googletest-build"
diff --git a/source/gmx/CMakeLists.txt b/source/gmx/CMakeLists.txt
index c0683b01ba..baa2e13140 100644
--- a/source/gmx/CMakeLists.txt
+++ b/source/gmx/CMakeLists.txt
@@ -15,7 +15,6 @@ file(GLOB INC_SRC include/*.h)
 
 add_library(${libgmxname} SHARED ${LIB_SRC})
 target_link_libraries(${libgmxname} ${LIB_DEEPMD_CC} ${LIB_DEEPMD} ${TensorFlow_LIBRARY} ${TensorFlowFramework_LIBRARY})
-target_include_directories(${libgmxname} PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/../api_cc/include)
 target_include_directories(${libgmxname} PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include)
 target_include_directories(${libgmxname} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../3rdparty/)
 
diff --git a/source/gmx/src/gmx_plugin.cpp b/source/gmx/src/gmx_plugin.cpp
index db68d6e55a..65ddc14f02 100644
--- a/source/gmx/src/gmx_plugin.cpp
+++ b/source/gmx/src/gmx_plugin.cpp
@@ -2,6 +2,7 @@
 #include "json.hpp"
 #include <iostream>
 #include <fstream>
+#include <sstream>
 
 using namespace deepmd;
 
diff --git a/source/install/test_cc.sh b/source/install/test_cc.sh
index 58fa86eb10..65582e2986 100755
--- a/source/install/test_cc.sh
+++ b/source/install/test_cc.sh
@@ -26,7 +26,7 @@ INSTALL_PREFIX=${SCRIPT_PATH}/../../dp_test_cc
 mkdir -p ${BUILD_TMP_DIR}
 mkdir -p ${INSTALL_PREFIX}
 cd ${BUILD_TMP_DIR}
-cmake -DINSTALL_TENSORFLOW=TRUE -DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX} ../api_cc/tests
+cmake -DINSTALL_TENSORFLOW=TRUE -DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX} -DTENSORFLOW_ROOT=${INSTALL_PREFIX} ../api_cc/tests
 make -j${NPROC}
 make install
 
diff --git a/source/lib/include/env_mat_nvnmd.h b/source/lib/include/env_mat_nvnmd.h
new file mode 100644
index 0000000000..640ab2e998
--- /dev/null
+++ b/source/lib/include/env_mat_nvnmd.h
@@ -0,0 +1,41 @@
+
+/*
+//==================================================
+ _   _  __     __  _   _   __  __   ____  
+| \ | | \ \   / / | \ | | |  \/  | |  _ \ 
+|  \| |  \ \ / /  |  \| | | |\/| | | | | |
+| |\  |   \ V /   | |\  | | |  | | | |_| |
+|_| \_|    \_/    |_| \_| |_|  |_| |____/ 
+
+//==================================================
+
+code: nvnmd
+reference: deepmd
+author: mph (pinghui_mo@outlook.com)
+date: 2021-12-6
+
+*/
+
+#pragma once
+
+#include <cmath>
+#include <vector>
+#include "utilities.h"
+
+namespace deepmd{
+
+template<typename FPTYPE> 
+void env_mat_a_nvnmd_quantize_cpu (
+    std::vector<FPTYPE > &	        descrpt_a,
+    std::vector<FPTYPE > &	        descrpt_a_deriv,
+    std::vector<FPTYPE > &	        rij_a,
+    const std::vector<FPTYPE > &	posi,
+    const std::vector<int > &		type,
+    const int &				i_idx,
+    const std::vector<int > &		fmt_nlist,
+    const std::vector<int > &		sec, 
+    const float &			rmin,
+    const float &			rmax,
+    const FPTYPE            precs[3]);
+
+}
diff --git a/source/lib/include/prod_env_mat_nvnmd.h b/source/lib/include/prod_env_mat_nvnmd.h
new file mode 100644
index 0000000000..0dbe1ff247
--- /dev/null
+++ b/source/lib/include/prod_env_mat_nvnmd.h
@@ -0,0 +1,56 @@
+/*
+//==================================================
+ _   _  __     __  _   _   __  __   ____  
+| \ | | \ \   / / | \ | | |  \/  | |  _ \ 
+|  \| |  \ \ / /  |  \| | | |\/| | | | | |
+| |\  |   \ V /   | |\  | | |  | | | |_| |
+|_| \_|    \_/    |_| \_| |_|  |_| |____/ 
+
+//==================================================
+
+code: nvnmd
+reference: deepmd
+author: mph (pinghui_mo@outlook.com)
+date: 2021-12-6
+
+*/
+
+#pragma once
+#include <vector>
+#include "device.h"
+#include "neighbor_list.h"
+
+namespace deepmd{
+
+// prod_env_mat_a_nvnmd_cpu
+// has been removed because it was identical to an existing function
+
+template<typename FPTYPE>
+void prod_env_mat_a_nvnmd_quantize_cpu(
+    FPTYPE * em, 
+    FPTYPE * em_deriv, 
+    FPTYPE * rij, 
+    int * nlist, 
+    const FPTYPE * coord, 
+    const int * type, 
+    const InputNlist & inlist,
+    const int max_nbor_size,
+    const FPTYPE * avg, 
+    const FPTYPE * std, 
+    const int nloc, 
+    const int nall, 
+    const float rcut, 
+    const float rcut_smth, 
+    const std::vector<int> sec,
+    const FPTYPE precs[3]);
+
+#if GOOGLE_CUDA
+// UNDEFINED: no CUDA declarations are provided
+#endif // GOOGLE_CUDA
+
+#if TENSORFLOW_USE_ROCM
+// UNDEFINED: no ROCm declarations are provided
+#endif // TENSORFLOW_USE_ROCM
+
+}
+
diff --git a/source/lib/src/env_mat_nvnmd.cc b/source/lib/src/env_mat_nvnmd.cc
new file mode 100644
index 0000000000..1455784465
--- /dev/null
+++ b/source/lib/src/env_mat_nvnmd.cc
@@ -0,0 +1,160 @@
+/*
+//==================================================
+ _   _  __     __  _   _   __  __   ____  
+| \ | | \ \   / / | \ | | |  \/  | |  _ \ 
+|  \| |  \ \ / /  |  \| | | |\/| | | | | |
+| |\  |   \ V /   | |\  | | |  | | | |_| |
+|_| \_|    \_/    |_| \_| |_|  |_| |____/ 
+
+//==================================================
+
+code: nvnmd
+reference: deepmd
+author: mph (pinghui_mo@outlook.com)
+date: 2021-12-6
+
+*/
+
+
+#include "env_mat_nvnmd.h"
+#include "switcher.h"
+
+// env_mat_a_nvnmd_cpu
+// has been removed because it was identical to an existing function
+
+/*
+//==================================================
+  env_mat_a_nvnmd_quantize_cpu
+//==================================================
+*/
+// For center atom i_idx: stores rij (neighbor minus center) and, per neighbor slot,
+// the quantized (r^2, x, y, z) plus their derivatives wrt the center position.
+template<typename FPTYPE> 
+void 
+deepmd::
+env_mat_a_nvnmd_quantize_cpu (
+    std::vector<FPTYPE > &	        descrpt_a,
+    std::vector<FPTYPE > &	        descrpt_a_deriv,
+    std::vector<FPTYPE > &	        rij_a,
+    const std::vector<FPTYPE > &	posi,
+    const std::vector<int > &		type,
+    const int &				i_idx,
+    const std::vector<int > &		fmt_nlist_a,
+    const std::vector<int > &		sec_a, 
+    const float &			rmin,
+    const float &			rmax,
+    const FPTYPE precs[3])
+{  
+    // compute rij = posi[neighbor] - posi[center]; slots after the first negative entry of a section stay 0
+    rij_a.resize (sec_a.back() * 3);
+    fill (rij_a.begin(), rij_a.end(), 0.0);
+    for (int ii = 0; ii < int(sec_a.size()) - 1; ++ii) {
+        for (int jj = sec_a[ii]; jj < sec_a[ii + 1]; ++jj) {
+            if (fmt_nlist_a[jj] < 0) break;
+            const int & j_idx = fmt_nlist_a[jj];
+            for (int dd = 0; dd < 3; ++dd) {
+                rij_a[jj * 3 + dd] = posi[j_idx * 3 + dd] - posi[i_idx * 3 + dd];
+            }
+        }
+    }
+    // 4 components per neighbor slot: r^2, x, y, z (quantized)
+    descrpt_a.resize (sec_a.back() * 4);
+    fill (descrpt_a.begin(), descrpt_a.end(), 0.0);
+    // deriv wrt center: 3
+    descrpt_a_deriv.resize (sec_a.back() * 4 * 3);
+    fill (descrpt_a_deriv.begin(), descrpt_a_deriv.end(), 0.0);
+
+    /*
+    precs: NBIT_DATA_FL, NBIT_FEA_X, NBIT_FEA_FL (only precs[0] is used in this function)
+    */
+   const double rc2 = rmax * rmax;
+    // NOTE(review): rc2 and the type/rmin arguments are not used in this function.
+
+    for (int sec_iter = 0; sec_iter < int(sec_a.size()) - 1; ++sec_iter) {
+        for (int nei_iter = sec_a[sec_iter]; nei_iter < sec_a[sec_iter+1]; ++nei_iter) {      
+            if (fmt_nlist_a[nei_iter] < 0) break;
+            const FPTYPE * rr = &rij_a[nei_iter * 3];
+
+            // NVNMD: round rij to multiples of 1/precs[0]; r^2 is floored to the same step
+            FPTYPE rij[3];
+            rij[0] = round(rr[0] * precs[0]) / precs[0];
+            rij[1] = round(rr[1] * precs[0]) / precs[0];
+            rij[2] = round(rr[2] * precs[0]) / precs[0];
+            FPTYPE nr2 = deepmd::dot3(rij, rij);
+            nr2 = floor(nr2 * precs[0]) / precs[0];
+
+            // FPTYPE nr2 = deepmd::dot3(rr, rr);
+            // FPTYPE inr = 1./sqrt(nr2);
+            // FPTYPE nr = nr2 * inr;
+            // FPTYPE inr2 = inr * inr;
+            // FPTYPE inr4 = inr2 * inr2;
+            // FPTYPE inr3 = inr4 * nr;
+            // FPTYPE sw, dsw;
+            // deepmd::spline5_switch(sw, dsw, nr, rmin, rmax);
+            int idx_deriv = nei_iter * 4 * 3;	// 4 components times 3 directions
+            int idx_value = nei_iter * 4;	// 4 components
+            // 4 value components
+            descrpt_a[idx_value + 0] = nr2;
+            descrpt_a[idx_value + 1] = rij[0];
+            descrpt_a[idx_value + 2] = rij[1];
+            descrpt_a[idx_value + 3] = rij[2];
+            // deriv of component r^2 wrt the center position: -2 * rij
+            descrpt_a_deriv[idx_deriv + 0] = -2 * rij[0];
+            descrpt_a_deriv[idx_deriv + 1] = -2 * rij[1];
+            descrpt_a_deriv[idx_deriv + 2] = -2 * rij[2];
+            /*
+            x, y, z are (neighbor - center), so wrt the center position:
+            d(x)_d(x) = -1, d(x)_d(y) = d(x)_d(z) = 0 (same pattern for y and z)
+            */
+            // deriv of component x
+            descrpt_a_deriv[idx_deriv + 3] = -1;
+            descrpt_a_deriv[idx_deriv + 4] =  0;
+            descrpt_a_deriv[idx_deriv + 5] =  0;
+            // deriv of component y
+            descrpt_a_deriv[idx_deriv + 6] =  0;
+            descrpt_a_deriv[idx_deriv + 7] = -1;
+            descrpt_a_deriv[idx_deriv + 8] =  0;
+            // deriv of component z
+            descrpt_a_deriv[idx_deriv + 9] =  0;
+            descrpt_a_deriv[idx_deriv +10] =  0;
+            descrpt_a_deriv[idx_deriv +11] = -1;
+        }
+    }
+}
+
+
+
+template
+void 
+deepmd::
+env_mat_a_nvnmd_quantize_cpu<double> (
+    std::vector<double > &	        descrpt_a,
+    std::vector<double > &	        descrpt_a_deriv,
+    std::vector<double > &	        rij_a,
+    const std::vector<double > &	posi,
+    const std::vector<int > &		type,
+    const int &				i_idx,
+    const std::vector<int > &		fmt_nlist,
+    const std::vector<int > &		sec, 
+    const float &			rmin,
+    const float &			rmax,
+    const double      precs[3]);
+
+
+template
+void 
+deepmd::
+env_mat_a_nvnmd_quantize_cpu<float> (
+    std::vector<float > &	        descrpt_a,
+    std::vector<float > &	        descrpt_a_deriv,
+    std::vector<float > &	        rij_a,
+    const std::vector<float > &		posi,
+    const std::vector<int > &		type,
+    const int &				i_idx,
+    const std::vector<int > &		fmt_nlist,
+    const std::vector<int > &		sec, 
+    const float &			rmin,
+    const float &			rmax,
+    const float       precs[3]);
+
+
diff --git a/source/lib/src/prod_env_mat_nvnmd.cc b/source/lib/src/prod_env_mat_nvnmd.cc
new file mode 100644
index 0000000000..ae0b737bae
--- /dev/null
+++ b/source/lib/src/prod_env_mat_nvnmd.cc
@@ -0,0 +1,176 @@
+/*
+//==================================================
+ _   _  __     __  _   _   __  __   ____  
+| \ | | \ \   / / | \ | | |  \/  | |  _ \ 
+|  \| |  \ \ / /  |  \| | | |\/| | | | | |
+| |\  |   \ V /   | |\  | | |  | | | |_| |
+|_| \_|    \_/    |_| \_| |_|  |_| |____/ 
+
+//==================================================
+
+code: nvnmd
+reference: deepmd
+author: mph (pinghui_mo@outlook.com)
+date: 2021-12-6
+
+*/
+
+
+#include <cassert>
+#include <iostream>
+#include <string.h>
+#include "prod_env_mat_nvnmd.h"
+#include "fmt_nlist.h"
+#include "env_mat_nvnmd.h"
+
+using namespace deepmd;
+
+/*
+//==================================================
+  prod_env_mat_a_nvnmd_cpu
+//==================================================
+*/
+
+// prod_env_mat_a_nvnmd_cpu has been removed because it was identical to an existing function
+
+/*
+//==================================================
+  prod_env_mat_a_nvnmd_quantize_cpu
+//==================================================
+*/
+// Per local atom: format the neighbor list, compute the quantized env matrix,
+// and copy em/em_deriv/rij/nlist out without applying avg/std normalization.
+template<typename FPTYPE>
+void
+deepmd::
+prod_env_mat_a_nvnmd_quantize_cpu(
+    FPTYPE * em, 
+    FPTYPE * em_deriv, 
+    FPTYPE * rij, 
+    int * nlist, 
+    const FPTYPE * coord, 
+    const int * type, 
+    const InputNlist & inlist,
+    const int max_nbor_size,
+    const FPTYPE * avg, 
+    const FPTYPE * std, 
+    const int nloc, 
+    const int nall, 
+    const float rcut, 
+    const float rcut_smth, 
+    const std::vector<int> sec,
+    const FPTYPE precs[3]) 
+{
+  const int nnei = sec.back();
+  const int nem = nnei * 4;
+
+  // copy coord into a std::vector (no normalization is applied)
+  std::vector<FPTYPE> d_coord3(nall * 3);
+  for (int ii = 0; ii < nall; ++ii) {
+    for (int dd = 0; dd < 3; ++dd) {
+      d_coord3[ii * 3 + dd] = coord[ii * 3 + dd];
+    }
+  }
+
+  // set type
+  std::vector<int> d_type (nall);
+  for (int ii = 0; ii < nall; ++ii) {
+    d_type[ii] = type[ii];
+  }
+    
+  // build nlist: d_nlist_a[i] holds the neighbors of atom i = inlist.ilist[ii]
+  std::vector<std::vector<int > > d_nlist_a(nloc);
+  // NOTE(review): d_nlist_a has nloc entries, so ilist values are assumed to be < nloc; confirm
+  assert(nloc == inlist.inum);
+  for (unsigned ii = 0; ii < nloc; ++ii) {
+    d_nlist_a[ii].reserve(max_nbor_size);
+  }
+  for (unsigned ii = 0; ii < nloc; ++ii) {
+    int i_idx = inlist.ilist[ii];
+    for(unsigned jj = 0; jj < inlist.numneigh[ii]; ++jj){
+      int j_idx = inlist.firstneigh[ii][jj];
+      d_nlist_a[i_idx].push_back (j_idx);
+    }
+  }
+    
+#pragma omp parallel for 
+  for (int ii = 0; ii < nloc; ++ii) {
+    std::vector<int> fmt_nlist_a;
+    int ret = format_nlist_i_cpu(fmt_nlist_a, d_coord3, d_type, ii, d_nlist_a[ii], rcut, sec);
+    std::vector<FPTYPE> d_em_a;
+    std::vector<FPTYPE> d_em_a_deriv;
+    std::vector<FPTYPE> d_em_r;
+    std::vector<FPTYPE> d_em_r_deriv;
+    std::vector<FPTYPE> d_rij_a;
+    env_mat_a_nvnmd_quantize_cpu (d_em_a, d_em_a_deriv, d_rij_a, d_coord3, d_type, ii, fmt_nlist_a, sec, rcut_smth, rcut, precs);
+
+    // check sizes
+    assert (d_em_a.size() == nem);
+    assert (d_em_a_deriv.size() == nem * 3);
+    assert (d_rij_a.size() == nnei * 3);
+    assert (fmt_nlist_a.size() == nnei);
+    // record outputs (raw values are stored; avg and std are not used)
+    for (int jj = 0; jj < nem; ++jj) {
+      // em[ii * nem + jj] = (d_em_a[jj] - avg[d_type[ii] * nem + jj]) / std[d_type[ii] * nem + jj];
+      em[ii * nem + jj] = d_em_a[jj];
+    }
+    for (int jj = 0; jj < nem * 3; ++jj) {
+      // em_deriv[ii * nem * 3 + jj] = d_em_a_deriv[jj] / std[d_type[ii] * nem + jj / 3];
+      em_deriv[ii * nem * 3 + jj] = d_em_a_deriv[jj];
+    }
+    for (int jj = 0; jj < nnei * 3; ++jj) {
+      rij[ii * nnei * 3 + jj] = d_rij_a[jj];
+    }
+    for (int jj = 0; jj < nnei; ++jj) {
+      nlist[ii * nnei + jj] = fmt_nlist_a[jj];
+    }
+  }
+}
+
+
+
+template
+void 
+deepmd::
+prod_env_mat_a_nvnmd_quantize_cpu<double>(
+    double * em, 
+    double * em_deriv, 
+    double * rij, 
+    int * nlist, 
+    const double * coord, 
+    const int * type, 
+    const InputNlist & inlist,
+    const int max_nbor_size,
+    const double * avg, 
+    const double * std, 
+    const int nloc, 
+    const int nall, 
+    const float rcut, 
+    const float rcut_smth, 
+    const std::vector<int> sec,
+    const double precs[3]);
+
+template
+void
+deepmd::
+prod_env_mat_a_nvnmd_quantize_cpu<float>(
+    float * em, 
+    float * em_deriv, 
+    float * rij, 
+    int * nlist, 
+    const float * coord, 
+    const int * type, 
+    const InputNlist & inlist,
+    const int max_nbor_size,
+    const float * avg, 
+    const float * std, 
+    const int nloc, 
+    const int nall, 
+    const float rcut, 
+    const float rcut_smth, 
+    const std::vector<int> sec,
+    const float precs[3]);
+
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
+// UNDEFINED: no GPU implementation is provided
+#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
\ No newline at end of file
diff --git a/source/lib/tests/test_env_mat_a_nvnmd.cc b/source/lib/tests/test_env_mat_a_nvnmd.cc
new file mode 100644
index 0000000000..c12a628e70
--- /dev/null
+++ b/source/lib/tests/test_env_mat_a_nvnmd.cc
@@ -0,0 +1,310 @@
+#include <iostream>
+#include <gtest/gtest.h>
+#include "fmt_nlist.h"
+#include "env_mat_nvnmd.h"
+#include "prod_env_mat_nvnmd.h"
+#include "neighbor_list.h"
+#include "device.h"
+
+
+class TestEnvMatANvnmd : public ::testing::Test
+{
+protected:
+  std::vector<double > posi = {12.83, 2.56, 2.18, 
+			       12.09, 2.87, 2.74,
+			       00.25, 3.32, 1.68,
+			       3.36, 3.00, 1.81,
+			       3.51, 2.51, 2.60,
+			       4.27, 3.22, 1.56
+  };
+  std::vector<int > atype = {0, 1, 1, 0, 1, 1};
+  std::vector<double > posi_cpy;
+  std::vector<int > atype_cpy;
+  int nloc, nall;
+  double rc = 6;
+  double rc_smth = 0.8;
+  SimulationRegion<double > region;
+  std::vector<int> mapping, ncell, ngcell;
+  std::vector<int> sec_a = {0, 10, 20};
+  std::vector<int> sec_r = {0, 0, 0};
+  std::vector<int> nat_stt, ext_stt, ext_end;
+  std::vector<std::vector<int>> nlist_a, nlist_r;
+  std::vector<std::vector<int>> nlist_a_cpy, nlist_r_cpy;
+  int ntypes = sec_a.size()-1;
+  int nnei = sec_a.back();
+  int ndescrpt = nnei * 4;
+  /* r_ij^2, x_ij, y_ij, z_ij */
+  std::vector<double > expected_env = {
+    12.79150,  3.53003,  0.43994, -0.37000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.95728, -0.73999,  0.31006,  0.56006,  1.00403,  0.42004,  0.76001, -0.50000, 13.72168,  3.68005, -0.05005,  0.42004, 20.53308,  4.43994,  0.66003, -0.62000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000, 
+    0.95728,  0.73999, -0.31006, -0.56006, 19.11487,  4.27002,  0.13000, -0.93005,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  2.67175,  1.16003,  0.44995, -1.06006, 19.68591,  4.42004, -0.35999, -0.14001, 28.34790,  5.18005,  0.34998, -1.18005,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000, 
+    1.00403, -0.42004, -0.76001,  0.50000,  9.79126,  3.10999, -0.31995,  0.13000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  2.67175, -1.16003, -0.44995,  1.06006, 12.13025,  3.26001, -0.81006,  0.92004, 16.18494,  4.02002, -0.09998, -0.12000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000, 
+    12.79150, -3.53003, -0.43994,  0.37000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.88672,  0.15002, -0.48999,  0.79004,  0.93896,  0.91003,  0.21997, -0.25000,  9.79126, -3.10999,  0.31995, -0.13000, 19.11487, -4.27002, -0.13000,  0.93005,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000, 
+    0.88672, -0.15002,  0.48999, -0.79004, 13.72168, -3.68005,  0.05005, -0.42004,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  2.16333,  0.76001,  0.70996, -1.04004, 12.13025, -3.26001,  0.81006, -0.92004, 19.68591, -4.42004,  0.35999,  0.14001,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000, 
+    0.93896, -0.91003, -0.21997,  0.25000, 20.53308, -4.43994, -0.66003,  0.62000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  2.16333, -0.76001, -0.70996,  1.04004, 16.18494, -4.02002,  0.09998,  0.12000, 28.34790, -5.18005, -0.34998,  1.18005,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000
+  };
+  
+  void SetUp() override {
+    double box[] = {13., 0., 0., 0., 13., 0., 0., 0., 13.};
+    region.reinitBox(box);
+    copy_coord(posi_cpy, atype_cpy, mapping, ncell, ngcell, posi, atype, rc, region);
+    nloc = posi.size() / 3;
+    nall = posi_cpy.size() / 3;
+    nat_stt.resize(3);
+    ext_stt.resize(3);
+    ext_end.resize(3);
+    for (int dd = 0; dd < 3; ++dd){
+      ext_stt[dd] = -ngcell[dd];
+      ext_end[dd] = ncell[dd] + ngcell[dd];
+    }
+    build_nlist(nlist_a, nlist_r, posi, rc, rc, ncell, region);
+    build_nlist(nlist_a_cpy, nlist_r_cpy, posi_cpy, nloc, rc, rc, nat_stt, ncell, ext_stt, ext_end, region, ncell);
+  }
+  void TearDown() override {
+  }
+};
+
+
+class TestEnvMatANvnmdShortSel : public ::testing::Test
+{
+protected:
+  std::vector<double > posi = {12.83, 2.56, 2.18, 
+			       12.09, 2.87, 2.74,
+			       00.25, 3.32, 1.68,
+			       3.36, 3.00, 1.81,
+			       3.51, 2.51, 2.60,
+			       4.27, 3.22, 1.56
+  };
+  std::vector<int > atype = {0, 1, 1, 0, 1, 1};
+  std::vector<double > posi_cpy;
+  std::vector<int > atype_cpy;
+  int nloc, nall;
+  double rc = 6;
+  double rc_smth = 0.8;
+  SimulationRegion<double > region;
+  std::vector<int> mapping, ncell, ngcell;
+  std::vector<int> sec_a = {0, 2, 4};
+  std::vector<int> sec_r = {0, 0, 0};
+  std::vector<int> nat_stt, ext_stt, ext_end;
+  std::vector<std::vector<int>> nlist_a, nlist_r;
+  std::vector<std::vector<int>> nlist_a_cpy, nlist_r_cpy;
+  int ntypes = sec_a.size()-1;
+  int nnei = sec_a.back();
+  int ndescrpt = nnei * 4;
+  std::vector<double > expected_env = {
+    12.79150,  3.53003,  0.43994, -0.37000,  0.00000,  0.00000,  0.00000,  0.00000,  0.95728, -0.73999,  0.31006,  0.56006,  1.00403,  0.42004,  0.76001, -0.50000, 
+    0.95728,  0.73999, -0.31006, -0.56006, 19.11487,  4.27002,  0.13000, -0.93005,  2.67175,  1.16003,  0.44995, -1.06006, 19.68591,  4.42004, -0.35999, -0.14001, 
+    1.00403, -0.42004, -0.76001,  0.50000,  9.79126,  3.10999, -0.31995,  0.13000,  2.67175, -1.16003, -0.44995,  1.06006, 12.13025,  3.26001, -0.81006,  0.92004, 
+    12.79150, -3.53003, -0.43994,  0.37000,  0.00000,  0.00000,  0.00000,  0.00000,  0.88672,  0.15002, -0.48999,  0.79004,  0.93896,  0.91003,  0.21997, -0.25000, 
+    0.88672, -0.15002,  0.48999, -0.79004, 13.72168, -3.68005,  0.05005, -0.42004,  2.16333,  0.76001,  0.70996, -1.04004, 12.13025, -3.26001,  0.81006, -0.92004, 
+    0.93896, -0.91003, -0.21997,  0.25000, 20.53308, -4.43994, -0.66003,  0.62000,  2.16333, -0.76001, -0.70996,  1.04004, 16.18494, -4.02002,  0.09998,  0.12000
+  };  
+  
+  void SetUp() override {
+    double box[] = {13., 0., 0., 0., 13., 0., 0., 0., 13.};
+    region.reinitBox(box);
+    copy_coord(posi_cpy, atype_cpy, mapping, ncell, ngcell, posi, atype, rc, region);
+    nloc = posi.size() / 3;
+    nall = posi_cpy.size() / 3;
+    nat_stt.resize(3);
+    ext_stt.resize(3);
+    ext_end.resize(3);
+    for (int dd = 0; dd < 3; ++dd){
+      ext_stt[dd] = -ngcell[dd];
+      ext_end[dd] = ncell[dd] + ngcell[dd];
+    }
+    build_nlist(nlist_a, nlist_r, posi, rc, rc, ncell, region);
+    build_nlist(nlist_a_cpy, nlist_r_cpy, posi_cpy, nloc, rc, rc, nat_stt, ncell, ext_stt, ext_end, region, ncell);
+  }
+  void TearDown() override {
+  }
+};
+
+
+/*  env_mat_a_nvnmd_quantize_cpu is not the same as env_mat_a,
+so the following tests have been removed:
+TEST_F(TestEnvMatANvnmd, orig_cpy)
+TEST_F(TestEnvMatANvnmd, orig_pbc)
+TEST_F(TestEnvMatANvnmd, orig_cpy_equal_pbc)
+TEST_F(TestEnvMatANvnmd, orig_cpy_num_deriv)
+*/
+
+TEST_F(TestEnvMatANvnmd, cpu)
+{
+  std::vector<int> fmt_nlist_a, fmt_nlist_r;
+  std::vector<double> env, env_deriv, rij_a;
+  bool pbc = false;
+  double precs[3] = {8192, 1024, 16}; // NBIT_DATA_FL, NBIT_FEA_X, NBIT_FEA_X_FL
+  for(int ii = 0; ii < nloc; ++ii){
+    int ret = format_nlist_i_cpu<double>(fmt_nlist_a, posi_cpy, atype_cpy, ii, nlist_a_cpy[ii], rc, sec_a);    
+    EXPECT_EQ(ret, -1);
+    deepmd::env_mat_a_nvnmd_quantize_cpu<double>(env, env_deriv, rij_a, posi_cpy, atype_cpy, ii, fmt_nlist_a, sec_a, rc_smth, rc, precs);    
+    EXPECT_EQ(env.size(), sec_a[2]*4);
+    EXPECT_EQ(env.size(), env_deriv.size()/3);
+    EXPECT_EQ(rij_a.size(), sec_a[2]*3);
+    for (int jj = 0; jj < sec_a[2]; ++jj){
+      for (int dd = 0; dd < 4; ++dd){
+    	  EXPECT_LT(fabs(env[jj*4+dd] - expected_env[ii*sec_a[2]*4 + jj*4 + dd]) , 1e-5);
+      }
+    }    
+  }
+}
+
+/*  env_mat_a_nvnmd_quantize_cpu is not the same as env_mat_a,
+so the following tests have been removed:
+TEST_F(TestEnvMatANvnmd, cpu_equal_orig_cpy)
+TEST_F(TestEnvMatANvnmd, cpu_num_deriv)
+TEST_F(TestEnvMatANvnmdShortSel, orig_cpy)
+TEST_F(TestEnvMatANvnmdShortSel, orig_pbc)
+*/
+
+
+TEST_F(TestEnvMatANvnmdShortSel, cpu)
+{
+  std::vector<int> fmt_nlist_a, fmt_nlist_r;
+  std::vector<double> env, env_deriv, rij_a;
+  bool pbc = false;
+  double precs[3] = {8192, 1024, 16}; // NBIT_DATA_FL, NBIT_FEA_X, NBIT_FEA_X_FL
+  for(int ii = 0; ii < nloc; ++ii){
+    int ret = format_nlist_i_cpu<double>(fmt_nlist_a, posi_cpy, atype_cpy, ii, nlist_a_cpy[ii], rc, sec_a);    
+    EXPECT_EQ(ret, 1);
+    deepmd::env_mat_a_nvnmd_quantize_cpu<double>(env, env_deriv, rij_a, posi_cpy, atype_cpy, ii, fmt_nlist_a, sec_a, rc_smth, rc, precs);    
+    EXPECT_EQ(env.size(), sec_a[2]*4);
+    EXPECT_EQ(env.size(), env_deriv.size()/3);
+    EXPECT_EQ(rij_a.size(), sec_a[2]*3);
+    for (int jj = 0; jj < sec_a[2]; ++jj){
+      for (int dd = 0; dd < 4; ++dd){
+    	EXPECT_LT(fabs(env[jj*4+dd] - expected_env[ii*sec_a[2]*4 + jj*4 + dd]) , 1e-5);
+      }
+    }
+  }
+}
+
+
+TEST_F(TestEnvMatANvnmd, prod_cpu)
+{
+  EXPECT_EQ(nlist_r_cpy.size(), nloc);
+  int tot_nnei = 0;
+  int max_nbor_size = 0;
+  double precs[3] = {8192, 1024, 16}; // NBIT_DATA_FL, NBIT_FEA_X, NBIT_FEA_X_FL
+  for(int ii = 0; ii < nlist_a_cpy.size(); ++ii){
+    tot_nnei += nlist_a_cpy[ii].size();
+    if (nlist_a_cpy[ii].size() > max_nbor_size){
+      max_nbor_size = nlist_a_cpy[ii].size();
+    }
+  }
+  std::vector<int> ilist(nloc), numneigh(nloc);
+  std::vector<int*> firstneigh(nloc);
+  deepmd::InputNlist inlist(nloc, &ilist[0], &numneigh[0], &firstneigh[0]);
+  deepmd::convert_nlist(inlist, nlist_a_cpy);
+  
+  std::vector<double > em(nloc * ndescrpt), em_deriv(nloc * ndescrpt * 3), rij(nloc * nnei * 3);
+  std::vector<int> nlist(nloc * nnei);
+  std::vector<double > avg(ntypes * ndescrpt, 0);
+  std::vector<double > std(ntypes * ndescrpt, 1);
+  deepmd::prod_env_mat_a_nvnmd_quantize_cpu(
+      &em[0],
+      &em_deriv[0],
+      &rij[0],
+      &nlist[0],
+      &posi_cpy[0],
+      &atype_cpy[0],
+      inlist,
+      max_nbor_size,
+      &avg[0],
+      &std[0],
+      nloc,
+      nall,
+      rc, 
+      rc_smth,
+      sec_a,
+      precs);
+
+  for(int ii = 0; ii < nloc; ++ii){
+    for (int jj = 0; jj < nnei; ++jj){
+      for (int dd = 0; dd < 4; ++dd){
+    	EXPECT_LT(fabs(em[ii*nnei*4 + jj*4 + dd] - 
+		       expected_env[ii*nnei*4 + jj*4 + dd]) , 
+		  1e-5);
+      }
+    }    
+  }
+}
+
+
+TEST_F(TestEnvMatANvnmd, prod_cpu_equal_cpu)
+{
+  EXPECT_EQ(nlist_r_cpy.size(), nloc);
+  int tot_nnei = 0;
+  int max_nbor_size = 0;
+  double precs[3] = {8192, 1024, 16}; // NBIT_DATA_FL, NBIT_FEA_X, NBIT_FEA_X_FL
+  for(int ii = 0; ii < nlist_a_cpy.size(); ++ii){
+    tot_nnei += nlist_a_cpy[ii].size();
+    if (nlist_a_cpy[ii].size() > max_nbor_size){
+      max_nbor_size = nlist_a_cpy[ii].size();
+    }
+  }
+  std::vector<int> ilist(nloc), numneigh(nloc);
+  std::vector<int*> firstneigh(nloc);
+  deepmd::InputNlist inlist(nloc, &ilist[0], &numneigh[0], &firstneigh[0]);
+  convert_nlist(inlist, nlist_a_cpy);
+  std::vector<double > em(nloc * ndescrpt), em_deriv(nloc * ndescrpt * 3), rij(nloc * nnei * 3);
+  std::vector<int> nlist(nloc * nnei);
+  std::vector<double > avg(ntypes * ndescrpt, 0);
+  std::vector<double > std(ntypes * ndescrpt, 1);
+  deepmd::prod_env_mat_a_nvnmd_quantize_cpu(
+      &em[0],
+      &em_deriv[0],
+      &rij[0],
+      &nlist[0],
+      &posi_cpy[0],
+      &atype_cpy[0],
+      inlist,
+      max_nbor_size,
+      &avg[0],
+      &std[0],
+      nloc,
+      nall,
+      rc, 
+      rc_smth,
+      sec_a,
+      precs);
+
+  std::vector<int> fmt_nlist_a_1, fmt_nlist_r_1;
+  std::vector<double> env_1, env_deriv_1, rij_a_1;
+  for(int ii = 0; ii < nloc; ++ii){
+    int ret_1 = format_nlist_i_cpu<double>(fmt_nlist_a_1, posi_cpy, atype_cpy, ii, nlist_a_cpy[ii], rc, sec_a);  
+    EXPECT_EQ(ret_1, -1);
+    deepmd::env_mat_a_nvnmd_quantize_cpu<double>(env_1, env_deriv_1, rij_a_1, posi_cpy, atype_cpy, ii, fmt_nlist_a_1, sec_a, rc_smth, rc, precs);
+    EXPECT_EQ(env_1.size(), nnei * 4);
+    EXPECT_EQ(env_deriv_1.size(), nnei * 4 * 3);
+    EXPECT_EQ(rij_a_1.size(), nnei * 3);
+    EXPECT_EQ(fmt_nlist_a_1.size(), nnei);
+    EXPECT_EQ(env_1.size() * nloc, em.size());
+    EXPECT_EQ(env_deriv_1.size() * nloc, em_deriv.size());
+    EXPECT_EQ(rij_a_1.size() * nloc, rij.size());
+    EXPECT_EQ(fmt_nlist_a_1.size() * nloc, nlist.size());
+    for (unsigned jj = 0; jj < env_1.size(); ++jj){
+      EXPECT_LT(fabs(em[ii*nnei*4+jj] - env_1[jj]), 1e-10);
+    }
+    for (unsigned jj = 0; jj < env_deriv_1.size(); ++jj){
+      EXPECT_LT(fabs(em_deriv[ii*nnei*4*3+jj] - env_deriv_1[jj]), 1e-10);      
+    }    
+    for (unsigned jj = 0; jj < rij_a_1.size(); ++jj){
+      EXPECT_LT(fabs(rij[ii*nnei*3+jj] - rij_a_1[jj]), 1e-10);
+    }
+    for (unsigned jj = 0; jj < fmt_nlist_a_1.size(); ++jj){
+      EXPECT_EQ(nlist[ii*nnei+jj], fmt_nlist_a_1[jj]);
+    }
+  }
+
+  for(int ii = 0; ii < nloc; ++ii){
+    for (int jj = 0; jj < nnei; ++jj){
+      for (int dd = 0; dd < 4; ++dd){
+    	EXPECT_LT(fabs(em[ii*nnei*4 + jj*4 + dd] - 
+  		       expected_env[ii*nnei*4 + jj*4 + dd]) , 
+  		  1e-5);
+      }
+    }
+  }
+}
+
diff --git a/source/lmp/env.sh.in b/source/lmp/env.sh.in
index b419154a32..b0b2f4d7bf 100644
--- a/source/lmp/env.sh.in
+++ b/source/lmp/env.sh.in
@@ -6,6 +6,6 @@ TF_INCLUDE_DIRS=`echo $TENSORFLOW_INCLUDE_DIRS | sed "s/;/ -I/g"`
 TF_LIBRARY_PATH=`echo $TENSORFLOW_LIBRARY_PATH | sed "s/;/ -L/g"`
 TF_RPATH=`echo $TENSORFLOW_LIBRARY_PATH | sed "s/;/ -Wl,-rpath=/g"`
 
-NNP_INC=" -std=c++@CMAKE_CXX_STANDARD@ -D@prec_def@ @TTM_DEF@ -DLAMMPS_VERSION_NUMBER=@LAMMPS_VERSION_NUMBER@ -I$TF_INCLUDE_DIRS -I$DEEPMD_ROOT/include/ "
+NNP_INC=" -std=c++@CMAKE_CXX_STANDARD@ -D@prec_def@ @TTM_DEF@ -DLAMMPS_VERSION_NUMBER=@LAMMPS_VERSION_NUMBER@ -I$DEEPMD_ROOT/include/ "
 NNP_PATH=" -L$TF_LIBRARY_PATH -L$DEEPMD_ROOT/lib"
 NNP_LIB=" -Wl,--no-as-needed -l@LIB_DEEPMD_CC@@variant_name@ -ltensorflow_cc -ltensorflow_framework -Wl,-rpath=$TF_RPATH -Wl,-rpath=$DEEPMD_ROOT/lib"
diff --git a/source/lmp/fix_dplr.h b/source/lmp/fix_dplr.h
index cdb5b36b1f..c5cf5d22da 100644
--- a/source/lmp/fix_dplr.h
+++ b/source/lmp/fix_dplr.h
@@ -8,6 +8,7 @@ FixStyle(dplr,FixDPLR)
 #define LMP_FIX_DPLR_H
 
 #include <stdio.h>
+#include <map>
 #include "fix.h"
 #include "pair_deepmd.h"
 #ifdef LMPPLUGIN
diff --git a/source/lmp/pair_deepmd.cpp b/source/lmp/pair_deepmd.cpp
index f457d2a183..fbc3a696ce 100644
--- a/source/lmp/pair_deepmd.cpp
+++ b/source/lmp/pair_deepmd.cpp
@@ -111,7 +111,7 @@ std::string PairDeepMD::get_file_content(const std::string & model) {
   int nchar = 0;
   std::string file_content;
   if (myrank == root) {
-    deepmd::check_status(tensorflow::ReadFileToString(tensorflow::Env::Default(), model, &file_content));
+    deepmd::read_file_to_string(model, file_content);
     nchar = file_content.size();
   }
   MPI_Bcast(&nchar, 1, MPI_INT, root, MPI_COMM_WORLD);  
diff --git a/source/op/CMakeLists.txt b/source/op/CMakeLists.txt
index e43c66da13..ffae97a506 100644
--- a/source/op/CMakeLists.txt
+++ b/source/op/CMakeLists.txt
@@ -3,7 +3,7 @@
 set(OP_LIB ${PROJECT_SOURCE_DIR}/lib/src/SimulationRegion.cpp ${PROJECT_SOURCE_DIR}/lib/src/neighbor_list.cc)
 
 set (OP_CXX_FLAG -D_GLIBCXX_USE_CXX11_ABI=${OP_CXX_ABI} )
-file(GLOB OP_SRC custom_op.cc prod_force.cc prod_virial.cc descrpt.cc descrpt_se_a_ef.cc descrpt_se_a_ef.cc descrpt_se_a_ef_para.cc descrpt_se_a_ef_vert.cc pair_tab.cc prod_force_multi_device.cc prod_virial_multi_device.cc soft_min.cc soft_min_force.cc soft_min_virial.cc ewald_recp.cc gelu_multi_device.cc map_aparam.cc neighbor_stat.cc unaggregated_grad.cc tabulate_multi_device.cc prod_env_mat_multi_device.cc)
+file(GLOB OP_SRC prod_env_mat_multi_device_nvnmd.cc map_nvnmd.cc matmul_nvnmd.cc quantize_nvnmd.cc tanh2_nvnmd.cc tanh4_nvnmd.cc custom_op.cc prod_force.cc prod_virial.cc descrpt.cc descrpt_se_a_ef.cc descrpt_se_a_ef_para.cc descrpt_se_a_ef_vert.cc pair_tab.cc prod_force_multi_device.cc prod_virial_multi_device.cc soft_min.cc soft_min_force.cc soft_min_virial.cc ewald_recp.cc gelu_multi_device.cc map_aparam.cc neighbor_stat.cc unaggregated_grad.cc tabulate_multi_device.cc prod_env_mat_multi_device.cc)
 file(GLOB OP_GRADS_SRC custom_op.cc prod_force_grad.cc prod_force_grad_multi_device.cc prod_virial_grad.cc prod_virial_grad_multi_device.cc soft_min_force_grad.cc soft_min_virial_grad.cc )
 file(GLOB OP_PY *.py)
 file(GLOB OP_REMAPPER_SRC optimizer/parallel.cc)
diff --git a/source/op/_map_nvnmd_grad.py b/source/op/_map_nvnmd_grad.py
new file mode 100644
index 0000000000..590cf1e0d0
--- /dev/null
+++ b/source/op/_map_nvnmd_grad.py
@@ -0,0 +1,26 @@
+#!/usr/bin/env python3
+
+from tensorflow.python.framework import ops
+from deepmd.env import op_module
+from deepmd.env import tf 
+
+@ops.RegisterGradient("MapNvnmd")
+def _MapNvnmdGrad(op, grad):
+    x = op.inputs[0]
+    v = op.inputs[1]
+    dv = op.inputs[2]
+    grad_v = op.inputs[3]
+    grad_dv = op.inputs[4]
+    prec = op.get_attr("prec")
+    nbit = op.get_attr("nbit")
+    y = op.outputs[0]
+    dydx = op_module.map_nvnmd(x, grad_v, grad_dv, tf.zeros_like(v), tf.zeros_like(dv), prec, nbit)
+    dydx = op_module.quantize_nvnmd(dydx, 0, nbit, -1, -1)
+    dx = tf.reshape(tf.reduce_sum(dydx * grad, axis=1), [-1, 1])
+
+    d_v = None
+    d_dv = None
+    d_grad_v = None
+    d_grad_dv = None
+    return [dx, d_v, d_dv, d_grad_v, d_grad_dv]
+
diff --git a/source/op/_matmul_nvnmd_grad.py b/source/op/_matmul_nvnmd_grad.py
new file mode 100644
index 0000000000..9d8d70b332
--- /dev/null
+++ b/source/op/_matmul_nvnmd_grad.py
@@ -0,0 +1,17 @@
+#!/usr/bin/env python3
+
+from tensorflow.python.framework import ops
+from deepmd.env import op_module
+from deepmd.env import tf 
+
+@ops.RegisterGradient("MatmulNvnmd")
+def _MatmulNvnmdGrad(op, grad):
+    x = op.inputs[0]
+    w = op.inputs[1]
+    isround = op.get_attr("isround")
+    nbit1 = op.get_attr("nbit1")
+    nbit2 = op.get_attr("nbit2")
+    nbit3 = op.get_attr("nbit3")
+    dx = op_module.matmul_nvnmd(grad, tf.transpose(w), isround, nbit2, nbit3, nbit1)
+    dw = op_module.matmul_nvnmd(tf.transpose(x), grad, isround, nbit2, nbit3, nbit1)
+    return [dx, dw]
diff --git a/source/op/_quantize_nvnmd_grad.py b/source/op/_quantize_nvnmd_grad.py
new file mode 100644
index 0000000000..9356d6f1cf
--- /dev/null
+++ b/source/op/_quantize_nvnmd_grad.py
@@ -0,0 +1,14 @@
+#!/usr/bin/env python3
+
+from tensorflow.python.framework import ops
+from deepmd.env import op_module
+from deepmd.env import tf 
+
+@ops.RegisterGradient("QuantizeNvnmd")
+def _QuantizeNvnmdGrad(op, grad):
+    isround = op.get_attr("isround")
+    nbit1 = op.get_attr("nbit1")
+    nbit2 = op.get_attr("nbit2")
+    nbit3 = op.get_attr("nbit3")
+    dx = op_module.quantize_nvnmd(grad, isround, nbit2, nbit3, nbit1)
+    return dx
diff --git a/source/op/_tanh2_nvnmd_grad.py b/source/op/_tanh2_nvnmd_grad.py
new file mode 100644
index 0000000000..fb0c81f25b
--- /dev/null
+++ b/source/op/_tanh2_nvnmd_grad.py
@@ -0,0 +1,24 @@
+#!/usr/bin/env python3
+
+from tensorflow.python.framework import ops
+from deepmd.env import op_module
+from deepmd.env import tf 
+
+@ops.RegisterGradient("Tanh2Nvnmd")
+def _Tanh2NvnmdGrad(op, grad):
+    isround = op.get_attr("isround")
+    nbit1 = op.get_attr("nbit1")
+    nbit2 = op.get_attr("nbit2")
+    nbit3 = op.get_attr("nbit3")
+    prec = 2 ** nbit2
+    x = op.inputs[0]
+    x_abs = tf.abs(x)
+    x1 = tf.clip_by_value(x_abs, 0, 2)
+    x2 = tf.clip_by_value(x_abs, 0, 4)
+    dydx = (132-64*x1-x2) * 0.0078125
+    if (nbit2 > -1):
+        dydx = dydx + tf.stop_gradient( tf.floor(dydx * prec) / prec - dydx )
+    dx = dydx * grad
+    if (nbit2 > -1):
+        dx = dx + tf.stop_gradient( tf.floor(dx * prec) / prec - dx )
+    return dx
diff --git a/source/op/_tanh4_nvnmd_grad.py b/source/op/_tanh4_nvnmd_grad.py
new file mode 100644
index 0000000000..392afadde2
--- /dev/null
+++ b/source/op/_tanh4_nvnmd_grad.py
@@ -0,0 +1,28 @@
+#!/usr/bin/env python3
+
+from tensorflow.python.framework import ops
+from deepmd.env import op_module
+from deepmd.env import tf 
+
+@ops.RegisterGradient("Tanh4Nvnmd")
+def _Tanh4NvnmdGrad(op, grad):
+    isround = op.get_attr("isround")
+    nbit1 = op.get_attr("nbit1")
+    nbit2 = op.get_attr("nbit2")
+    nbit3 = op.get_attr("nbit3")
+    prec = 2 ** nbit2
+    x = op.inputs[0]
+    xc = tf.clip_by_value(x, -2, 2)
+    xa = tf.abs(xc)
+    xx = xa * xa 
+    if (nbit2 > -1):
+        xx = xx + tf.stop_gradient(tf.floor(xx * prec) / prec - xx)
+    #
+    dydx = xx * (xa/4 - 3/4) + 1
+    if (nbit2 > -1):
+        dydx = dydx + tf.stop_gradient( tf.floor(dydx * prec) / prec - dydx)
+    #
+    dx = dydx * grad
+    if (nbit2 > -1):
+        dx = dx + tf.stop_gradient( tf.floor(dx * prec) / prec - dx )
+    return dx
diff --git a/source/op/map_nvnmd.cc b/source/op/map_nvnmd.cc
new file mode 100644
index 0000000000..c2975ce916
--- /dev/null
+++ b/source/op/map_nvnmd.cc
@@ -0,0 +1,144 @@
+
+//
+// --------------------------------------------------------------------
+/*
+
+# Function
+x = xk+dx
+y = vk+dvk*dx
+
+build a mapping table V, use the X as index to select value Y
+
+# Parameters
+x index
+v mapping table
+dv mapping table of slope
+grad_v mapping table of 1st order derivative
+grad_dv  mapping table of slope of 1st order derivative
+prec precision
+nbit number of bits
+y output
+*/
+// --------------------------------------------------------------------
+//
+
+//- import the library of tensorflow
+#include "custom_op.h"
+
+using namespace tensorflow;
+
+//- register the operator
+// prec = 2^n, so it doesn't need to match `T`
+REGISTER_OP("MapNvnmd")
+  .Attr("T: {float, double} = DT_DOUBLE")
+  .Input("x: T")
+  .Input("v: T")
+  .Input("dv: T")
+  .Input("grad_v: T")
+  .Input("grad_dv: T")
+  .Attr("prec: float")
+  .Attr("nbit: int")
+  .Output("y: T");
+
+//- create the operator class
+//* the class must inherit the OpKernel Class
+template <typename Device, typename FPTYPE>
+class MapNvnmdOp : public OpKernel {
+public:
+  /// Constructor: read the table step `prec` and cache its reciprocal.
+  explicit MapNvnmdOp(OpKernelConstruction* context) : OpKernel(context) {
+    OP_REQUIRES_OK(context, context->GetAttr("prec", &prec));
+    div_prec = 1.0 / prec;
+  }
+
+  /// Table lookup with linear interpolation.
+  ///
+  /// For every row ii of x:
+  ///   n  = floor(x(ii,0) / prec)
+  ///   dx = x(ii,0) - n * prec
+  ///   y(ii,kk) = v(n,kk) + dv(n,kk) * dx   for every table column kk
+  /// Only column 0 of x is used as the index. The inputs grad_v and grad_dv
+  /// are not read by the forward pass.
+  /// param: context
+  void Compute(OpKernelContext* context) override {
+    //- 1. get the input tensors
+    const Tensor& X = context->input(0);
+    const Tensor& V = context->input(1);
+    const Tensor& DV = context->input(2);
+
+    //- 2. check the shapes before any dimension is read, so that a bad
+    //     input gives an InvalidArgument status in release builds too
+    //     (DCHECK is only active in debug builds)
+    const TensorShape& shX = X.shape();
+    const TensorShape& shV = V.shape();
+    const TensorShape& shDV = DV.shape();
+    OP_REQUIRES(context, shX.dims() == 2,
+                errors::InvalidArgument("Dim of x should be 2"));
+    OP_REQUIRES(context, shV.dims() == 2,
+                errors::InvalidArgument("Dim of v should be 2"));
+    OP_REQUIRES(context, shDV.dims() == 2,
+                errors::InvalidArgument("Dim of dv should be 2"));
+
+    const int D1 = shX.dim_size(0);  // number of lookups
+    const int D2 = shX.dim_size(1);  // columns of x
+    const int D3 = shV.dim_size(0);  // entries in the table
+    const int D4 = shV.dim_size(1);  // values per table entry
+    // x(ii,0) and v(0,kk) are always read when there is at least one row
+    OP_REQUIRES(context, D1 == 0 || (D2 > 0 && D3 > 0),
+                errors::InvalidArgument("x needs a column and v needs a row"));
+    // dv is indexed exactly like v
+    OP_REQUIRES(context, shDV.dim_size(0) >= D3 && shDV.dim_size(1) >= D4,
+                errors::InvalidArgument("dv should be at least as large as v"));
+
+    //- 3. allocate output 0, the shape stays [D1, D2*D4]
+    TensorShape shY;
+    shY.AddDim(D1);
+    shY.AddDim(D2*D4);
+    Tensor* Y = NULL;
+    OP_REQUIRES_OK(context, context->allocate_output(0, shY, &Y));
+    auto x = X.matrix<FPTYPE>();
+    auto v = V.matrix<FPTYPE>();
+    auto dv = DV.matrix<FPTYPE>();
+    auto y = Y->matrix<FPTYPE>();
+
+    //- 4. calculate
+    const int ncol = D2*D4;
+    for (int ii = 0; ii < D1; ii++) {
+      int n = floor(x(ii, 0) * div_prec);
+      const FPTYPE dx = x(ii, 0) - n * prec;
+      // an out-of-range index is reported and replaced by entry 0
+      if (n < 0) {
+        std::cerr<<"ERROR: index is smaller than 0 \n";
+        n = 0;
+      }
+      if (n > D3) {
+        std::cerr<<"ERROR: index is bigger  than range \n";
+        n = 0;
+      }
+      // n == D3 (one past the last entry) maps to entry 0 without a message
+      n = (n == D3) ? 0 : n;
+      for (int kk = 0; kk < D4; kk++) {
+        y(ii, kk) = v(n, kk) + dv(n, kk) * dx;
+      }
+      // columns from D4 on exist only when x has several columns; zero
+      // them instead of returning uninitialized memory
+      for (int kk = D4; kk < ncol; kk++) {
+        y(ii, kk) = (FPTYPE)0.0;
+      }
+    }
+  }
+
+//- define the private variable for calculation
+private:
+float prec, div_prec;  // table step and its reciprocal
+};
+
+#define REGISTER_CPU(T) \
+REGISTER_KERNEL_BUILDER( \
+    Name("MapNvnmd").Device(DEVICE_CPU).TypeConstraint<T>("T"), \
+    MapNvnmdOp<CPUDevice, T>);
+REGISTER_CPU(float);                  
+REGISTER_CPU(double);
+
+
+
diff --git a/source/op/matmul_nvnmd.cc b/source/op/matmul_nvnmd.cc
new file mode 100644
index 0000000000..6cecfde0a6
--- /dev/null
+++ b/source/op/matmul_nvnmd.cc
@@ -0,0 +1,168 @@
+
+
+//
+// --------------------------------------------------------------------
+/*
+
+# Function
+y = q(matmul(x, w), nbit)
+q is a quantization function
+
+# Parameter
+nbit nbit for x
+nbit2 nbit for dy_dx
+nbit3 nbit for dy2_dx2
+
+# Note
+1. if nbit < 0, y = matmul(x, w)
+2. the input and output dimensions must be 2
+
+*/
+// --------------------------------------------------------------------
+//
+
+//- import the library of tensorflow
+#include "custom_op.h"
+
+using namespace tensorflow;
+
+
+//- register the operator
+REGISTER_OP("MatmulNvnmd")
+  .Attr("T: {float, double} = DT_DOUBLE")
+  .Input("x: T")
+  .Input("w: T")
+  .Attr("isround: int")
+  .Attr("nbit1: int")
+  .Attr("nbit2: int")
+  .Attr("nbit3: int")
+  .Output("y: T");
+
+
+
+//- create the operator class
+//* the class must inherit the OpKernel Class
+template <typename Device, typename FPTYPE>
+class MatmulNvnmdOp : public OpKernel {
+public:
+
+  /// Constructor: read the quantization attributes.
+  explicit MatmulNvnmdOp(OpKernelConstruction* context) : OpKernel(context) {
+    //- define the attribute of context
+    //* the context is the input from your tensorflow code
+    OP_REQUIRES_OK(context, context->GetAttr("nbit1", &nbit1));
+    OP_REQUIRES_OK(context, context->GetAttr("nbit2", &nbit2));
+    OP_REQUIRES_OK(context, context->GetAttr("nbit3", &nbit3));
+    OP_REQUIRES_OK(context, context->GetAttr("isround", &isround));
+  }
+
+
+  /// Compute y = q(matmul(x, w)).
+  ///
+  /// x is [D1, D2], w is [D2, D4] and y is [D1, D4]. Every element is
+  /// accumulated in FPTYPE and then quantized:
+  ///   nbit1 <  0              : no quantization
+  ///   nbit1 >= 0, isround != 0: round(y * 2^nbit1) / 2^nbit1
+  ///   nbit1 >= 0, isround == 0: floor(y * 2^nbit1) / 2^nbit1
+  /// nbit2 and nbit3 are not used by the forward pass.
+  ///
+  /// An InvalidArgument status is set when x or w is not 2-dimensional,
+  /// when the inner dimensions differ or when nbit1 is larger than 30.
+  /// param: context
+  void Compute(OpKernelContext* context) override {
+
+    /*
+     * Get input
+     * 1.get tensor
+     * 2.get shape and check
+     */
+
+    //- 1.get tensor
+    const Tensor& X = context->input(0);
+    const Tensor& W = context->input(1);
+
+    //- 2. get shape and check
+    const TensorShape& shX = X.shape();
+    const TensorShape& shW = W.shape();
+    OP_REQUIRES(context, shX.dims() == 2,
+                errors::InvalidArgument("Dim of x should be 2"));
+    OP_REQUIRES(context, shW.dims() == 2,
+                errors::InvalidArgument("Dim of w should be 2"));
+
+    const int D1 = shX.dim_size(0);
+    const int D2 = shX.dim_size(1);
+    const int D3 = shW.dim_size(0);
+    const int D4 = shW.dim_size(1);
+    // the product below reads w(jj, kk) for every jj < D2, so a mismatch
+    // could read outside of w
+    OP_REQUIRES(context, D2 == D3,
+                errors::InvalidArgument("x columns should match w rows"));
+    // 1 << nbit1 is evaluated as an int and must not overflow
+    OP_REQUIRES(context, nbit1 < 31,
+                errors::InvalidArgument("nbit1 should be smaller than 31"));
+
+    /*
+     * Calculate the output
+     * 1.create tensor
+     * 2.allocate the memory
+     * 3.calculate
+     */
+
+    //- 1.create tensor
+    TensorShape shY;
+    shY.AddDim(D1);
+    shY.AddDim(D4);
+
+    Tensor* Y = NULL;
+
+    //- 2.allocate the memory
+    //* allocate memory for the Y tensor which is called output 0
+    OP_REQUIRES_OK(context, context->allocate_output(0, shY, &Y));
+    auto x = X.matrix<FPTYPE>();
+    auto w = W.matrix<FPTYPE>();
+    auto y = Y->matrix<FPTYPE>();
+
+    //- 3.calculate
+    const bool quantize = (nbit1 >= 0);
+    const FPTYPE prec = quantize ? (FPTYPE)(1 << nbit1) : (FPTYPE)1;
+
+    // every element is accumulated in a local and written once, so no
+    // separate zero-initialization pass over y is needed
+    for (int ii = 0; ii < D1; ii++) {
+      for (int kk = 0; kk < D4; kk++) {
+        // dot product of row ii of x with column kk of w
+        FPTYPE acc = (FPTYPE)0.0;
+        for (int jj = 0; jj < D2; jj++) {
+          acc += x(ii, jj) * w(jj, kk);
+        }
+        if (quantize) {
+          if (isround) {
+            acc = round(acc * prec) / prec;
+          }
+          else {
+            acc = floor(acc * prec) / prec;
+          }
+        }
+        y(ii, kk) = acc;
+      }
+    }
+  }
+
+//- define the private variable for calculation
+private:
+// nbit1: fractional bits kept in y, a negative value disables quantization
+// nbit2, nbit3: stored only, Compute does not use them
+int nbit1, nbit2, nbit3;
+// isround: nonzero selects round(), zero selects floor()
+int isround;
+};
+
+#define REGISTER_CPU(T) \
+REGISTER_KERNEL_BUILDER( \
+    Name("MatmulNvnmd").Device(DEVICE_CPU).TypeConstraint<T>("T"), \
+    MatmulNvnmdOp<CPUDevice, T>);
+REGISTER_CPU(float);                  
+REGISTER_CPU(double);
+
+
+
diff --git a/source/op/prod_env_mat_multi_device_nvnmd.cc b/source/op/prod_env_mat_multi_device_nvnmd.cc
new file mode 100644
index 0000000000..b4d64dee98
--- /dev/null
+++ b/source/op/prod_env_mat_multi_device_nvnmd.cc
@@ -0,0 +1,516 @@
+/*
+//==================================================
+ _   _  __     __  _   _   __  __   ____  
+| \ | | \ \   / / | \ | | |  \/  | |  _ \ 
+|  \| |  \ \ / /  |  \| | | |\/| | | | | |
+| |\  |   \ V /   | |\  | | |  | | | |_| |
+|_| \_|    \_/    |_| \_| |_|  |_| |____/ 
+
+//==================================================
+
+code: nvnmd
+reference: deepmd
+author: mph (pinghui_mo@outlook.com)
+date: 2021-12-6
+
+*/
+
+
+#include "custom_op.h"
+#include "utilities.h"
+#include "coord.h"
+#include "region.h"
+#include "neighbor_list.h"
+#include "prod_env_mat_nvnmd.h"
+#include "errors.h"
+
+// ProdEnvMatANvnmd
+// has been removed because it duplicated existing functionality
+
+REGISTER_OP("ProdEnvMatANvnmdQuantize")
+    .Attr("T: {float, double} = DT_DOUBLE")
+    .Input("coord: T")          //atomic coordinates
+    .Input("type: int32")       //atomic type
+    .Input("natoms: int32")     //[nloc, nall, one entry per atom type]
+    .Input("box : T")           //9 values per frame
+    .Input("mesh : int32")      //length 16, 6 or 0; selects the neighbor-list mode
+    .Input("davg: T")           //average value of data
+    .Input("dstd: T")           //standard deviation
+    .Attr("rcut_a: float")      //read by the kernel but not used
+    .Attr("rcut_r: float")
+    .Attr("rcut_r_smth: float")
+    .Attr("sel_a: list(int)")
+    .Attr("sel_r: list(int)")   //all zero
+    .Output("descrpt: T")
+    .Output("descrpt_deriv: T")
+    .Output("rij: T")
+    .Output("nlist: int32");
+    // only sel_a, rcut_r and rcut_r_smth are used.
+
+template<typename FPTYPE>
+static int
+_norm_copy_coord_cpu(
+    std::vector<FPTYPE> & coord_cpy,
+    std::vector<int> & type_cpy,
+    std::vector<int> & mapping,
+    int & nall,
+    int & mem_cpy,
+    const FPTYPE * coord,
+    const FPTYPE * box,
+    const int * type,
+    const int &nloc, 
+    const int &max_cpy_trial, 
+    const float & rcut_r);
+
+template<typename FPTYPE>
+static int
+_build_nlist_cpu(
+    std::vector<int> &ilist, 
+    std::vector<int> &numneigh,
+    std::vector<int*> &firstneigh,
+    std::vector<std::vector<int>> &jlist,
+    int & max_nnei,
+    int & mem_nnei,
+    const FPTYPE *coord,
+    const int & nloc,
+    const int & new_nall,
+    const int & max_nnei_trial,
+    const float & rcut_r);
+
+static void
+_map_nlist_cpu(
+    int * nlist,
+    const int * idx_mapping,
+    const int & nloc,
+    const int & nnei);
+
+template <typename FPTYPE>
+static void
+_prepare_coord_nlist_cpu(
+    OpKernelContext* context,
+    FPTYPE const ** coord,
+    std::vector<FPTYPE> & coord_cpy,
+    int const** type,
+    std::vector<int> & type_cpy,
+    std::vector<int> & idx_mapping,
+    deepmd::InputNlist & inlist,
+    std::vector<int> & ilist,
+    std::vector<int> & numneigh,
+    std::vector<int*> & firstneigh,
+    std::vector<std::vector<int>> & jlist,
+    int & new_nall,
+    int & mem_cpy,
+    int & mem_nnei,
+    int & max_nbor_size,
+    const FPTYPE * box,
+    const int * mesh_tensor_data,
+    const int & nloc,
+    const int & nei_mode,
+    const float & rcut_r,
+    const int & max_cpy_trial,
+    const int & max_nnei_trial);
+
+// instance of function
+
+template<typename FPTYPE>
+static int
+_norm_copy_coord_cpu(
+    std::vector<FPTYPE> & coord_cpy,
+    std::vector<int> & type_cpy,
+    std::vector<int> & idx_mapping,
+    int & nall,
+    int & mem_cpy,
+    const FPTYPE * coord,
+    const FPTYPE * box,
+    const int * type,
+    const int &nloc,
+    const int &max_cpy_trial,
+    const float & rcut_r)
+{
+  // work on a private copy of the coordinates; the caller's array is only read
+  std::vector<FPTYPE> normalized(nall*3);
+  std::copy(coord, coord+nall*3, normalized.begin());
+  deepmd::Region<FPTYPE> region;
+  init_region_cpu(region, box);
+  normalize_coord_cpu(&normalized[0], nall, region);
+  // retry with a doubled mem_cpy after each failure, max_cpy_trial times at most
+  int trial = 0;
+  while (trial < max_cpy_trial) {
+    coord_cpy.resize(mem_cpy*3);
+    type_cpy.resize(mem_cpy);
+    idx_mapping.resize(mem_cpy);
+    const int ret = copy_coord_cpu(
+        &coord_cpy[0], &type_cpy[0], &idx_mapping[0], &nall,
+        &normalized[0], type, nloc, mem_cpy, rcut_r, region);
+    if (ret == 0) break;
+    mem_cpy *= 2;
+    ++trial;
+  }
+  // nonzero when the loop was left before the trials ran out
+  return (trial != max_cpy_trial);
+}
+
+template<typename FPTYPE>
+static int
+_build_nlist_cpu(
+    std::vector<int> &ilist,
+    std::vector<int> &numneigh,
+    std::vector<int*> &firstneigh,
+    std::vector<std::vector<int>> &jlist,
+    int & max_nnei,
+    int & mem_nnei,
+    const FPTYPE *coord,
+    const int & nloc,
+    const int & new_nall,
+    const int & max_nnei_trial,
+    const float & rcut_r)
+{
+  // retry with a doubled mem_nnei after each failure, max_nnei_trial times at most
+  int trial = 0;
+  while (trial < max_nnei_trial) {
+    // give every local atom mem_nnei slots and record where its row starts
+    for (int ii = 0; ii < nloc; ++ii) {
+      jlist[ii].resize(mem_nnei);
+      firstneigh[ii] = &jlist[ii][0];
+    }
+    deepmd::InputNlist inlist(nloc, &ilist[0], &numneigh[0], &firstneigh[0]);
+    const int ret = build_nlist_cpu(
+        inlist, &max_nnei,
+        coord, nloc, new_nall, mem_nnei, rcut_r);
+    if (ret == 0) break;
+    mem_nnei *= 2;
+    ++trial;
+  }
+  // nonzero when the loop was left before the trials ran out
+  return (trial != max_nnei_trial);
+}
+    
+static void
+_map_nlist_cpu(
+    int * nlist,
+    const int * idx_mapping,
+    const int & nloc,
+    const int & nnei)
+{
+  // replace every non-negative entry by idx_mapping[entry]; negatives stay
+  for (int ii = 0; ii < nloc; ++ii){
+    int * row = nlist + ii*nnei;
+    for (int jj = 0; jj < nnei; ++jj){
+      if (row[jj] < 0) continue;
+      row[jj] = idx_mapping[row[jj]];
+    }
+  }
+}
+
+template <typename FPTYPE>
+static void
+_prepare_coord_nlist_cpu(
+    OpKernelContext* context,
+    FPTYPE const ** coord,
+    std::vector<FPTYPE> & coord_cpy,
+    int const** type,
+    std::vector<int> & type_cpy,
+    std::vector<int> & idx_mapping,
+    deepmd::InputNlist & inlist,
+    std::vector<int> & ilist,
+    std::vector<int> & numneigh,
+    std::vector<int*> & firstneigh,
+    std::vector<std::vector<int>> & jlist,
+    int & new_nall,
+    int & mem_cpy,
+    int & mem_nnei,
+    int & max_nbor_size,
+    const FPTYPE * box,
+    const int * mesh_tensor_data,
+    const int & nloc,
+    const int & nei_mode,
+    const float & rcut_r,
+    const int & max_cpy_trial,
+    const int & max_nnei_trial)
+{
+  // fill `inlist` from the mesh tensor (nei_mode == 3) or by building a list here
+  // NOTE(review): OP_REQUIRES returns from this helper only; presumably the
+  // caller has to check context->status() afterwards -- TODO confirm
+  inlist.inum = nloc;
+  if(nei_mode == 3){
+    // copy pointers to nlist data, stored at int offsets 4, 8 and 12
+    memcpy(&inlist.ilist, 4 + mesh_tensor_data, sizeof(int *));
+    memcpy(&inlist.numneigh, 8 + mesh_tensor_data, sizeof(int *));
+    memcpy(&inlist.firstneigh, 12 + mesh_tensor_data, sizeof(int **));
+    max_nbor_size = max_numneigh(inlist);
+    return;
+  }
+  if(nei_mode == 1){
+    // normalize and copy coord, then point the caller at the copies
+    const int copy_ok = _norm_copy_coord_cpu(
+        coord_cpy, type_cpy, idx_mapping, new_nall, mem_cpy,
+        *coord, box, *type, nloc, max_cpy_trial, rcut_r);
+    OP_REQUIRES (context, copy_ok, errors::Aborted("cannot allocate mem for copied coords"));
+    *coord = &coord_cpy[0];
+    *type = &type_cpy[0];
+  }
+  const int build_ok = _build_nlist_cpu(
+      ilist, numneigh, firstneigh, jlist, max_nbor_size, mem_nnei,
+      *coord, nloc, new_nall, max_nnei_trial, rcut_r);
+  OP_REQUIRES (context, build_ok, errors::Aborted("cannot allocate mem for nlist"));
+  inlist.ilist = &ilist[0];
+  inlist.numneigh = &numneigh[0];
+  inlist.firstneigh = &firstneigh[0];
+}
+
+/*
+//==================================================
+  PARAM
+//==================================================
+*/
+
+template <typename FPTYPE>
+void get_precs(FPTYPE precs[3]){
+  // in order: NBIT_DATA_FL, NBIT_FEA_X, NBIT_FEA_X_FL
+  const FPTYPE values[3] = {8192, 1024, 16};
+  for (int ii = 0; ii < 3; ++ii) precs[ii] = values[ii];
+}
+
+
+/*
+//==================================================
+  ProdEnvMatANvnmdOp
+//==================================================
+*/
+
+// has been removed because it duplicated existing functionality
+
+/*
+//==================================================
+  ProdEnvMatANvnmdQuantizeOp
+//==================================================
+*/
+
+
+
+template <typename Device, typename FPTYPE>
+class ProdEnvMatANvnmdQuantizeOp : public OpKernel {
+public:
+  /// Read the cutoff and selection attributes and derive the descriptor sizes.
+  explicit ProdEnvMatANvnmdQuantizeOp(OpKernelConstruction* context) : OpKernel(context) {
+    OP_REQUIRES_OK(context, context->GetAttr("rcut_a", &rcut_a));
+    OP_REQUIRES_OK(context, context->GetAttr("rcut_r", &rcut_r));
+    OP_REQUIRES_OK(context, context->GetAttr("rcut_r_smth", &rcut_r_smth));
+    OP_REQUIRES_OK(context, context->GetAttr("sel_a", &sel_a));
+    OP_REQUIRES_OK(context, context->GetAttr("sel_r", &sel_r));
+    deepmd::cum_sum (sec_a, sel_a);
+    deepmd::cum_sum (sec_r, sel_r);
+    // 4 values per sel_a neighbor, 1 value per sel_r neighbor
+    ndescrpt_a = sec_a.back() * 4;
+    ndescrpt_r = sec_r.back() * 1;
+    ndescrpt = ndescrpt_a + ndescrpt_r;
+    nnei_a = sec_a.back();
+    nnei_r = sec_r.back();
+    nnei = nnei_a + nnei_r;
+    // starting sizes and retry limits used when coordinates are copied and
+    // when the neighbor list is built
+    max_nbor_size = 1024;
+    max_cpy_trial = 100;
+    mem_cpy = 256;
+    max_nnei_trial = 100;
+    mem_nnei = 256;
+
+    get_precs(precs);
+  }
+
+  /// Entry point: runs _Compute through deepmd::safe_compute.
+  void Compute(OpKernelContext* context) override {
+    deepmd::safe_compute(context, [this](OpKernelContext* context) {this->_Compute(context);});
+  }
+
+  /// Validate the inputs, allocate the four outputs and evaluate each frame.
+  void _Compute(OpKernelContext* context) {
+    // Grab the input tensor
+    int context_input_index = 0;
+    const Tensor& coord_tensor  = context->input(context_input_index++);
+    const Tensor& type_tensor   = context->input(context_input_index++);
+    const Tensor& natoms_tensor = context->input(context_input_index++);
+    const Tensor& box_tensor    = context->input(context_input_index++);
+    const Tensor& mesh_tensor   = context->input(context_input_index++);
+    const Tensor& avg_tensor    = context->input(context_input_index++);
+    const Tensor& std_tensor    = context->input(context_input_index++);
+    // check the rank of every input
+    OP_REQUIRES (context, (coord_tensor.shape().dims() == 2),       errors::InvalidArgument ("Dim of coord should be 2"));
+    OP_REQUIRES (context, (type_tensor.shape().dims() == 2),        errors::InvalidArgument ("Dim of type should be 2"));
+    OP_REQUIRES (context, (natoms_tensor.shape().dims() == 1),      errors::InvalidArgument ("Dim of natoms should be 1"));
+    OP_REQUIRES (context, (box_tensor.shape().dims() == 2),         errors::InvalidArgument ("Dim of box should be 2"));
+    OP_REQUIRES (context, (mesh_tensor.shape().dims() == 1),        errors::InvalidArgument ("Dim of mesh should be 1"));
+    OP_REQUIRES (context, (avg_tensor.shape().dims() == 2),         errors::InvalidArgument ("Dim of avg should be 2"));
+    OP_REQUIRES (context, (std_tensor.shape().dims() == 2),         errors::InvalidArgument ("Dim of std should be 2"));
+    OP_REQUIRES (context, (sec_r.back() == 0),                      errors::InvalidArgument ("Rotational free descriptor only support all-angular information: sel_r should be all zero."));
+    OP_REQUIRES (context, (natoms_tensor.shape().dim_size(0) >= 3), errors::InvalidArgument ("number of atoms should be larger than (or equal to) 3"));
+    DeviceFunctor() (
+        device,
+        context->eigen_device<Device>()
+    );
+    const int * natoms = natoms_tensor.flat<int>().data();
+    int nloc = natoms[0];
+    int nall = natoms[1];
+    int ntypes = natoms_tensor.shape().dim_size(0) - 2; // natoms[0] and natoms[1] hold nloc and nall
+    int nsamples = coord_tensor.shape().dim_size(0);
+    //// check the sizes
+    OP_REQUIRES (context, (nsamples == type_tensor.shape().dim_size(0)),  errors::InvalidArgument ("number of samples should match"));
+    OP_REQUIRES (context, (nsamples == box_tensor.shape().dim_size(0)),   errors::InvalidArgument ("number of samples should match"));
+    OP_REQUIRES (context, (ntypes == avg_tensor.shape().dim_size(0)),     errors::InvalidArgument ("number of avg should be ntype"));
+    OP_REQUIRES (context, (ntypes == std_tensor.shape().dim_size(0)),     errors::InvalidArgument ("number of std should be ntype"));
+
+    OP_REQUIRES (context, (nall * 3 == coord_tensor.shape().dim_size(1)), errors::InvalidArgument ("number of atoms should match"));
+    OP_REQUIRES (context, (nall == type_tensor.shape().dim_size(1)),      errors::InvalidArgument ("number of atoms should match"));
+    OP_REQUIRES (context, (9 == box_tensor.shape().dim_size(1)),          errors::InvalidArgument ("number of box should be 9"));
+    OP_REQUIRES (context, (ndescrpt == avg_tensor.shape().dim_size(1)),   errors::InvalidArgument ("number of avg should be ndescrpt"));
+    OP_REQUIRES (context, (ndescrpt == std_tensor.shape().dim_size(1)),   errors::InvalidArgument ("number of std should be ndescrpt"));
+
+    OP_REQUIRES (context, (ntypes == int(sel_a.size())),  errors::InvalidArgument ("number of types should match the length of sel array"));
+    OP_REQUIRES (context, (ntypes == int(sel_r.size())),  errors::InvalidArgument ("number of types should match the length of sel array"));
+
+    // the length of the mesh tensor selects how the neighbor list is obtained
+    int nei_mode = 0;
+    bool b_nlist_map = false;
+    if (mesh_tensor.shape().dim_size(0) == 16) {
+      // lammps neighbor list
+      nei_mode = 3;
+    }
+    else if (mesh_tensor.shape().dim_size(0) == 6) {
+      // manual copied pbc
+      assert (nloc == nall);
+      nei_mode = 1;
+      b_nlist_map = true;
+    }
+    else if (mesh_tensor.shape().dim_size(0) == 0) {
+      // no pbc
+      assert (nloc == nall);
+      nei_mode = -1;
+    }
+    else {
+      throw deepmd::deepmd_exception("invalid mesh tensor");
+    }
+
+    // Create output tensors
+    TensorShape descrpt_shape ;
+    descrpt_shape.AddDim (nsamples);
+    descrpt_shape.AddDim (nloc * ndescrpt);
+    TensorShape descrpt_deriv_shape ;
+    descrpt_deriv_shape.AddDim (nsamples);
+    descrpt_deriv_shape.AddDim (nloc * ndescrpt * 3);
+    TensorShape rij_shape ;
+    rij_shape.AddDim (nsamples);
+    rij_shape.AddDim (nloc * nnei * 3);
+    TensorShape nlist_shape ;
+    nlist_shape.AddDim (nsamples);
+    nlist_shape.AddDim (nloc * nnei);
+    // define output tensor
+    int context_output_index = 0;
+    Tensor* descrpt_tensor = NULL;
+    Tensor* descrpt_deriv_tensor = NULL;
+    Tensor* rij_tensor = NULL;
+    Tensor* nlist_tensor = NULL;
+    OP_REQUIRES_OK(context, context->allocate_output(
+        context_output_index++,
+        descrpt_shape,
+        &descrpt_tensor));
+    OP_REQUIRES_OK(context, context->allocate_output(
+        context_output_index++,
+        descrpt_deriv_shape,
+        &descrpt_deriv_tensor));
+    OP_REQUIRES_OK(context, context->allocate_output(
+        context_output_index++,
+        rij_shape,
+        &rij_tensor));
+    OP_REQUIRES_OK(context, context->allocate_output(
+        context_output_index++,
+        nlist_shape,
+        &nlist_tensor));
+
+    FPTYPE * p_em = descrpt_tensor->flat<FPTYPE>().data();
+    FPTYPE * p_em_deriv = descrpt_deriv_tensor->flat<FPTYPE>().data();
+    FPTYPE * p_rij = rij_tensor->flat<FPTYPE>().data();
+    int * p_nlist = nlist_tensor->flat<int>().data();
+    const FPTYPE * p_coord = coord_tensor.flat<FPTYPE>().data();
+    const FPTYPE * p_box = box_tensor.flat<FPTYPE>().data();
+    const FPTYPE * avg = avg_tensor.flat<FPTYPE>().data();
+    const FPTYPE * std = std_tensor.flat<FPTYPE>().data();
+    const int * p_type = type_tensor.flat<int>().data();
+
+    // loop over samples
+    for(int ff = 0; ff < nsamples; ++ff){
+      FPTYPE * em = p_em + ff*nloc*ndescrpt;
+      FPTYPE * em_deriv = p_em_deriv + ff*nloc*ndescrpt*3;
+      FPTYPE * rij = p_rij + ff*nloc*nnei*3;
+      int * nlist = p_nlist + ff*nloc*nnei;
+      const FPTYPE * coord = p_coord + ff*nall*3;
+      const FPTYPE * box = p_box + ff*9;
+      const int * type = p_type + ff*nall;
+
+      // only a CPU implementation exists for this op; for any other device
+      // string nothing is computed
+      if (device == "CPU") {
+        deepmd::InputNlist inlist;
+        // some buffers, be freed after the evaluation of this frame
+        std::vector<int> idx_mapping;
+        std::vector<int> ilist(nloc), numneigh(nloc);
+        std::vector<int*> firstneigh(nloc);
+        std::vector<std::vector<int>> jlist(nloc);
+        std::vector<FPTYPE> coord_cpy;
+        std::vector<int> type_cpy;
+        int frame_nall = nall;
+        // prepare coord and nlist
+        _prepare_coord_nlist_cpu<FPTYPE>(
+            context, &coord, coord_cpy, &type, type_cpy, idx_mapping,
+            inlist, ilist, numneigh, firstneigh, jlist,
+            frame_nall, mem_cpy, mem_nnei, max_nbor_size,
+            box, mesh_tensor.flat<int>().data(), nloc, nei_mode, rcut_r, max_cpy_trial, max_nnei_trial);
+        // the helper reports a failure only through the context status; stop
+        // here instead of evaluating with an unusable neighbor list
+        if (!context->status().ok()) return;
+        // launch the cpu compute function
+        deepmd::prod_env_mat_a_nvnmd_quantize_cpu(
+            em, em_deriv, rij, nlist,
+            coord, type, inlist, max_nbor_size, avg, std, nloc, frame_nall, rcut_r, rcut_r_smth, sec_a, precs);
+        // do nlist mapping if coords were copied
+        if(b_nlist_map) _map_nlist_cpu(nlist, &idx_mapping[0], nloc, nnei);
+      }
+    }
+  }
+
+/////////////////////////////////////////////////////////////////////////////////////////////
+private:
+  float rcut_a;
+  float rcut_r;
+  float rcut_r_smth;
+  std::vector<int32> sel_r;
+  std::vector<int32> sel_a;
+  std::vector<int> sec_a;
+  std::vector<int> sec_r;
+  int ndescrpt, ndescrpt_a, ndescrpt_r;
+  int nnei, nnei_a, nnei_r, nloc, nall, max_nbor_size;
+  int mem_cpy, max_cpy_trial;
+  int mem_nnei, max_nnei_trial;
+  std::string device;
+  int * array_int = NULL;
+  unsigned long long * array_longlong = NULL;
+  deepmd::InputNlist gpu_inlist;
+  int * nbor_list_dev = NULL;
+  FPTYPE precs[3];
+};
+
+
+// Register the CPU kernels.
+// Compatible with v1.3
+#define REGISTER_CPU(T) \
+REGISTER_KERNEL_BUILDER( \
+    Name("ProdEnvMatANvnmdQuantize").Device(DEVICE_CPU).TypeConstraint<T>("T"),  \
+    ProdEnvMatANvnmdQuantizeOp<CPUDevice, T>); 
+
+REGISTER_CPU(float);                  
+REGISTER_CPU(double);              
+            
+// Register the GPU kernels.                  
+// Compatible with v1.3
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM            
+// UNDEFINE
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
diff --git a/source/op/quantize_nvnmd.cc b/source/op/quantize_nvnmd.cc
new file mode 100644
index 0000000000..3e159cd29a
--- /dev/null
+++ b/source/op/quantize_nvnmd.cc
@@ -0,0 +1,148 @@
+
+
+// Quantization Operator of NVNMD
+// --------------------------------------------------------------------
+/*
+
+# Function
+prec = 2**nbit1
+y = quantize(x * prec) / prec
+quantize is round when isround is nonzero, floor otherwise
+
+# Parameter
+@nbit1 nbit for x
+@nbit2 nbit for dy_dx
+@nbit3 nbit for dy2_dx2
+
+# Note
+1. if nbit1 < 0, y = x
+2. The operator is only used for 2D tensors.
+
+*/
+// --------------------------------------------------------------------
+//
+
+//- import the library of tensorflow
+#include "custom_op.h"
+
+using namespace tensorflow;
+
+
+//- register the operator (one input x, one output y, four int attributes)
+REGISTER_OP("QuantizeNvnmd")
+  .Attr("T: {float, double} = DT_DOUBLE")  // element type of x and y, double by default
+  .Input("x: T")                           // tensor to quantize
+  .Attr("isround: int")                    // nonzero: quantize with round(); zero: floor()
+  .Attr("nbit1: int")                      // y is a multiple of 2^-nbit1; < 0: y = x
+  .Attr("nbit2: int")                      // not used by the CPU kernel's Compute in this file
+  .Attr("nbit3: int")                      // not used by the CPU kernel's Compute in this file
+  .Output("y: T");                         // quantized tensor
+
+
+
+//- create the operator class
+//* the class must inherit the OpKernel Class
+// Element-wise fixed-point quantization kernel for a 2D tensor:
+//   y = quantize(x * 2^nbit1) / 2^nbit1,
+// where quantize is round() when `isround` is nonzero and floor() otherwise.
+// When nbit1 < 0 the input is copied unchanged.
+template <typename Device, typename FPTYPE>
+class QuantizeNvnmdOp : public OpKernel {
+public:
+
+  /// Constructor: read the op attributes.
+  explicit QuantizeNvnmdOp(OpKernelConstruction* context) : OpKernel(context) {
+    OP_REQUIRES_OK(context, context->GetAttr("nbit1", &nbit1));
+    OP_REQUIRES_OK(context, context->GetAttr("nbit2", &nbit2));
+    OP_REQUIRES_OK(context, context->GetAttr("nbit3", &nbit3));
+    OP_REQUIRES_OK(context, context->GetAttr("isround", &isround));
+    // Compute() forms the scale as `1 << nbit1` in int arithmetic; a shift
+    // count above 30 overflows a 32-bit int, so reject it here.
+    OP_REQUIRES(context, nbit1 <= 30,
+                errors::InvalidArgument("nbit1 should not be larger than 30"));
+  }
+
+
+  /// Quantize input 0 into output 0, which has the same shape.
+  /// param: context
+  void Compute(OpKernelContext* context) override {
+
+    /*
+     * Get input
+     * 1.check
+     * 2.get tensor
+     * 3.get shape and check
+     */
+
+    //- 1.check
+    DCHECK_EQ(1, context->num_inputs());
+
+    //- 2.get tensor
+    const Tensor& X = context->input(0);
+
+    //- 3. get shape and check
+    //* matrix<FPTYPE>() and dim_size(1) below are only valid for a rank-2
+    //* tensor, so report any other rank as an InvalidArgument status
+    const TensorShape& shX = X.shape();
+    OP_REQUIRES(context, shX.dims() == 2,
+                errors::InvalidArgument("Dim of x should be 2"));
+    const int D1 = shX.dim_size(0);
+    const int D2 = shX.dim_size(1);
+
+    /*
+     * Calculate the output
+     * 1.allocate the memory
+     * 2.calculate
+     */
+
+    //- 1.allocate the memory
+    //* output 0 (Y) has the same shape as X
+    Tensor* Y = NULL;
+    OP_REQUIRES_OK(context, context->allocate_output(0, shX, &Y));
+    auto x = X.matrix<FPTYPE>();
+    auto y = Y->matrix<FPTYPE>();
+
+    //- 2.calculate
+    if (this->nbit1 < 0) {
+      //* quantization is switched off: copy the input
+      for (int ii = 0; ii < D1; ii++) {
+        for (int jj = 0; jj < D2; jj++) {
+          y(ii, jj) = x(ii, jj);
+        }
+      }
+      return;
+    }
+
+    const FPTYPE prec = 1 << this->nbit1;
+    if (this->isround) {
+      for (int ii = 0; ii < D1; ii++) {
+        for (int jj = 0; jj < D2; jj++) {
+          y(ii, jj) = round(x(ii, jj) * prec) / prec;
+        }
+      }
+    } else {
+      for (int ii = 0; ii < D1; ii++) {
+        for (int jj = 0; jj < D2; jj++) {
+          y(ii, jj) = floor(x(ii, jj) * prec) / prec;
+        }
+      }
+    }
+  }
+
+//- define the private variable for calculation
+private:
+  int nbit1;    // y is a multiple of 2^-nbit1; < 0 disables quantization
+  int nbit2;    // read from the attributes, unused by this kernel
+  int nbit3;    // read from the attributes, unused by this kernel
+  int isround;  // nonzero: round(); zero: floor()
+};
+
+#define REGISTER_CPU(FP) \
+  REGISTER_KERNEL_BUILDER( \
+      Name("QuantizeNvnmd").Device(DEVICE_CPU).TypeConstraint<FP>("T"), \
+      QuantizeNvnmdOp<CPUDevice, FP>);
+REGISTER_CPU(float);   // kernel for T = float
+REGISTER_CPU(double);  // kernel for T = double
+
+
+
diff --git a/source/op/tanh2_nvnmd.cc b/source/op/tanh2_nvnmd.cc
new file mode 100644
index 0000000000..cb638e2f8b
--- /dev/null
+++ b/source/op/tanh2_nvnmd.cc
@@ -0,0 +1,167 @@
+
+
+// Activation Function of NVNMD
+// --------------------------------------------------------------------
+/*
+
+# Function
+y = tanh2(x)
+
+x1 = clip(x, -2, 2)
+x2 = clip(x, -4, 4)
+y1 = x1    - x1*|x1|/4
+y2 = x2/32 - x2*|x2|/256 
+y = y1 + y2
+
+*/
+// --------------------------------------------------------------------
+//
+
+//- import the library of tensorflow
+#include "custom_op.h"
+
+using namespace tensorflow;
+
+
+//- register the operator (one input x, one output y, four int attributes)
+REGISTER_OP("Tanh2Nvnmd")
+  .Attr("T: {float, double} = DT_DOUBLE")  // element type of x and y, double by default
+  .Input("x: T")                           // input tensor
+  .Attr("isround: int")                    // nonzero: quantize with round(); zero: floor()
+  .Attr("nbit1: int")                      // quantization step is 2^-nbit1; < 0: no quantization
+  .Attr("nbit2: int")                      // not used by the CPU kernel's Compute in this file
+  .Attr("nbit3: int")                      // not used by the CPU kernel's Compute in this file
+  .Output("y: T");                         // tanh2(x)
+
+
+
+//- create the operator class
+//* the class must inherit the OpKernel Class
+// NVNMD activation function tanh2 (piecewise quadratic), applied element-wise
+// to a 2D tensor. With a = |x|, x1 = min(a, 2) and x2 = min(a, 4):
+//   y1 = x1 - x1*x1/4
+//   y2 = x2/32 - x2*x2/256
+//   y  = sign(x) * (y1 + y2)
+// When nbit1 >= 0, a, y1 and y2 are each quantized to a multiple of
+// 2^-nbit1, with round() if `isround` is nonzero and floor() otherwise.
+template <typename Device, typename FPTYPE>
+class Tanh2NvnmdOp : public OpKernel {
+public:
+
+  /// Constructor: read the op attributes.
+  explicit Tanh2NvnmdOp(OpKernelConstruction* context) : OpKernel(context) {
+    OP_REQUIRES_OK(context, context->GetAttr("nbit1", &nbit1));
+    OP_REQUIRES_OK(context, context->GetAttr("nbit2", &nbit2));
+    OP_REQUIRES_OK(context, context->GetAttr("nbit3", &nbit3));
+    OP_REQUIRES_OK(context, context->GetAttr("isround", &isround));
+    // Compute() forms the scale as `1 << nbit1` in int arithmetic; a shift
+    // count above 30 overflows a 32-bit int, so reject it here.
+    OP_REQUIRES(context, nbit1 <= 30,
+                errors::InvalidArgument("nbit1 should not be larger than 30"));
+  }
+
+
+  /// Apply tanh2 to input 0 and write the result to output 0 (same shape).
+  /// param: context
+  void Compute(OpKernelContext* context) override {
+
+    /*
+     * Get input
+     * 1.check
+     * 2.get tensor
+     * 3.get shape and check
+     */
+
+    //- 1.check
+    DCHECK_EQ(1, context->num_inputs());
+
+    //- 2.get tensor
+    const Tensor& X = context->input(0);
+
+    //- 3. get shape and check
+    //* matrix<FPTYPE>() and dim_size(1) below are only valid for a rank-2
+    //* tensor, so report any other rank as an InvalidArgument status
+    const TensorShape& shX = X.shape();
+    OP_REQUIRES(context, shX.dims() == 2,
+                errors::InvalidArgument("Dim of x should be 2"));
+    const int D1 = shX.dim_size(0);
+    const int D2 = shX.dim_size(1);
+
+    //- 4.allocate the memory
+    //* output 0 (Y) has the same shape as X
+    Tensor* Y = NULL;
+    OP_REQUIRES_OK(context, context->allocate_output(0, shX, &Y));
+    auto xs = X.matrix<FPTYPE>();
+    auto ys = Y->matrix<FPTYPE>();
+
+    //- 5.calculate
+    //* every branch works on |x| and restores the sign when storing y
+    FPTYPE x, x1, x2;
+    FPTYPE y1, y2;
+    bool sign;
+
+    if (this->nbit1 < 0) {
+      //* no quantization
+      for (int ii = 0; ii < D1; ii++) {
+        for (int jj = 0; jj < D2; jj++) {
+          sign = xs(ii, jj) < 0;
+          x = (sign) ? -xs(ii, jj) : xs(ii, jj);
+          x1 = (x >  2) ?  2 : x;
+          x2 = (x >  4) ?  4 : x;
+          y1 = x1 - x1 * x1 * (FPTYPE)0.25;
+          y2 = x2 * (FPTYPE)0.03125 - x2 * x2 * (FPTYPE)0.00390625;
+          ys(ii, jj) = (sign) ? -(y1 + y2) : (y1 + y2);
+        }
+      }
+      return;
+    }
+
+    const FPTYPE prec = 1 << this->nbit1;
+    if (this->isround) {
+      //* quantize with round()
+      for (int ii = 0; ii < D1; ii++) {
+        for (int jj = 0; jj < D2; jj++) {
+          sign = xs(ii, jj) < 0;
+          x = (sign) ? -xs(ii, jj) : xs(ii, jj);
+          x =  round(x * prec) / prec;
+          x1 = (x >  2) ?  2 : x;
+          x2 = (x >  4) ?  4 : x;
+          y1 = round((x1 - x1 * x1 * (FPTYPE)0.25) * prec) / prec;
+          y2 = round((x2 * (FPTYPE)0.03125 - x2 * x2 * (FPTYPE)0.00390625) * prec) / prec;
+          ys(ii, jj) = (sign) ? -(y1 + y2) : (y1 + y2);
+        }
+      }
+    } else {
+      //* quantize with floor()
+      //* floor acts on |x|, so a negative input is truncated toward zero
+      for (int ii = 0; ii < D1; ii++) {
+        for (int jj = 0; jj < D2; jj++) {
+          sign = xs(ii, jj) < 0;
+          x = (sign) ? -xs(ii, jj) : xs(ii, jj);
+          x =  floor(x * prec) / prec;
+          x1 = (x >  2) ?  2 : x;
+          x2 = (x >  4) ?  4 : x;
+          y1 = floor((x1 - x1 * x1 * (FPTYPE)0.25) * prec) / prec;
+          y2 = floor((x2 * (FPTYPE)0.03125 - x2 * x2 * (FPTYPE)0.00390625) * prec) / prec;
+          ys(ii, jj) = (sign) ? -(y1 + y2) : (y1 + y2);
+        }
+      }
+    }
+  }
+
+//- define the private variable for calculation
+private:
+  int nbit1;    // quantization step is 2^-nbit1; < 0 disables quantization
+  int nbit2;    // read from the attributes, unused by this kernel
+  int nbit3;    // read from the attributes, unused by this kernel
+  int isround;  // nonzero: round(); zero: floor()
+};
+
+
+#define REGISTER_CPU(FP) \
+  REGISTER_KERNEL_BUILDER( \
+      Name("Tanh2Nvnmd").Device(DEVICE_CPU).TypeConstraint<FP>("T"), \
+      Tanh2NvnmdOp<CPUDevice, FP>);
+REGISTER_CPU(float);   // kernel for T = float
+REGISTER_CPU(double);  // kernel for T = double
+
diff --git a/source/op/tanh4_nvnmd.cc b/source/op/tanh4_nvnmd.cc
new file mode 100644
index 0000000000..6df4a7c32b
--- /dev/null
+++ b/source/op/tanh4_nvnmd.cc
@@ -0,0 +1,183 @@
+
+
+// New Activation Function of NVNMD
+// --------------------------------------------------------------------
+/*
+
+# Function
+y = tanh4(x)
+y = f(x) = a*x^3*|x| + b*x^3 + d*x for |x| < 2, and y = sign(x) otherwise
+a = 1/16
+b = -1/4
+d = 1
+
+*/
+// --------------------------------------------------------------------
+//
+
+//- import the library of tensorflow
+#include "custom_op.h"
+
+using namespace tensorflow;
+
+
+//- register the operator (one input x, one output y, four int attributes)
+REGISTER_OP("Tanh4Nvnmd")
+  .Attr("T: {float, double} = DT_DOUBLE")  // element type of x and y, double by default
+  .Input("x: T")                           // input tensor
+  .Attr("isround: int")                    // nonzero: quantize with round(); zero: floor()
+  .Attr("nbit1: int")                      // quantization step is 2^-nbit1; < 0: no quantization
+  .Attr("nbit2: int")                      // not used by the CPU kernel's Compute in this file
+  .Attr("nbit3: int")                      // not used by the CPU kernel's Compute in this file
+  .Output("y: T");                         // tanh4(x)
+
+
+
+//- create the operator class
+//* the class must inherit the OpKernel Class
+// NVNMD activation function tanh4, applied element-wise to a 2D tensor.
+// With a = |x| and xx = x*x:
+//   y = sign(x) * (xx * (xx/16 - a/4) + a)   for a < 2
+//   y = sign(x)                              otherwise
+// When nbit1 >= 0, xx and y (before the sign is applied) are quantized to a
+// multiple of 2^-nbit1, with round() if `isround` is nonzero, else floor().
+template <typename Device, typename FPTYPE>
+class Tanh4NvnmdOp : public OpKernel {
+public:
+
+  /// Constructor: read the op attributes.
+  explicit Tanh4NvnmdOp(OpKernelConstruction* context) : OpKernel(context) {
+    OP_REQUIRES_OK(context, context->GetAttr("nbit1", &nbit1));
+    OP_REQUIRES_OK(context, context->GetAttr("nbit2", &nbit2));
+    OP_REQUIRES_OK(context, context->GetAttr("nbit3", &nbit3));
+    OP_REQUIRES_OK(context, context->GetAttr("isround", &isround));
+    // Compute() forms the scale as `1 << nbit1` in int arithmetic; a shift
+    // count above 30 overflows a 32-bit int, so reject it here.
+    OP_REQUIRES(context, nbit1 <= 30,
+                errors::InvalidArgument("nbit1 should not be larger than 30"));
+  }
+
+
+  /// Apply tanh4 to input 0 and write the result to output 0 (same shape).
+  /// param: context
+  void Compute(OpKernelContext* context) override {
+
+    /*
+     * Get input
+     * 1.check
+     * 2.get tensor
+     * 3.get shape and check
+     */
+
+    //- 1.check
+    DCHECK_EQ(1, context->num_inputs());
+
+    //- 2.get tensor
+    const Tensor& X = context->input(0);
+
+    //- 3. get shape and check
+    //* matrix<FPTYPE>() and dim_size(1) below are only valid for a rank-2
+    //* tensor, so report any other rank as an InvalidArgument status
+    const TensorShape& shX = X.shape();
+    OP_REQUIRES(context, shX.dims() == 2,
+                errors::InvalidArgument("Dim of x should be 2"));
+    const int D1 = shX.dim_size(0);
+    const int D2 = shX.dim_size(1);
+
+    /*
+     * Calculate the output
+     * 1.allocate the memory
+     * 2.calculate
+     */
+
+    //- 1.allocate the memory
+    //* output 0 (Y) has the same shape as X
+    Tensor* Y = NULL;
+    OP_REQUIRES_OK(context, context->allocate_output(0, shX, &Y));
+    auto xs = X.matrix<FPTYPE>();
+    auto ys = Y->matrix<FPTYPE>();
+
+    //- 2.calculate
+    //* the polynomial is evaluated on |x|; the sign is restored when storing y
+    FPTYPE x, xa, xx;
+    FPTYPE y;
+
+    if (this->nbit1 < 0) {
+      //* no quantization
+      for (int ii = 0; ii < D1; ii++) {
+        for (int jj = 0; jj < D2; jj++) {
+          x = xs(ii, jj);
+          xa = (x < 0) ? (-x) : x;
+          xx = x*x;
+          //
+          if (xa<2) {
+            y = xx * (xx * (FPTYPE)0.0625 - xa * (FPTYPE)0.25) + xa;
+          } else {
+            y = 1;
+          }
+          //
+          ys(ii, jj) = (x<0) ? (-y) : y;
+        }
+      }
+      return;
+    }
+
+    const FPTYPE prec = 1 << this->nbit1;
+    if (this->isround) {
+      //* quantize with round()
+      //* only xx and y are quantized; the range test uses the unquantized |x|
+      for (int ii = 0; ii < D1; ii++) {
+        for (int jj = 0; jj < D2; jj++) {
+          x = xs(ii, jj);
+          xa = (x<0) ? (-x) : x;
+          xx = x * x;
+          xx = round(xx * prec) / prec;
+          //
+          if (xa<2) {
+            y = xx * (xx * (FPTYPE)0.0625 - xa * (FPTYPE)0.25) + xa;
+          } else {
+            y = 1;
+          }
+          //
+          y = round(y * prec) / prec;
+          ys(ii, jj) = (x<0) ? (-y) : y;
+        }
+      }
+    } else {
+      //* quantize with floor()
+      for (int ii = 0; ii < D1; ii++) {
+        for (int jj = 0; jj < D2; jj++) {
+          x = xs(ii, jj);
+          xa = (x<0) ? (-x) : x;
+          xx = x * x;
+          xx = floor(xx * prec) / prec;
+          //
+          if (xa<2) {
+            y = xx * (xx * (FPTYPE)0.0625 - xa * (FPTYPE)0.25) + xa;
+          } else {
+            y = 1;
+          }
+          //
+          y = floor(y * prec) / prec;
+          ys(ii, jj) = (x<0) ? (-y) : y;
+        }
+      }
+    }
+  }
+
+//- define the private variable for calculation
+private:
+  int nbit1;    // quantization step is 2^-nbit1; < 0 disables quantization
+  int nbit2;    // read from the attributes, unused by this kernel
+  int nbit3;    // read from the attributes, unused by this kernel
+  int isround;  // nonzero: round(); zero: floor()
+};
+
+
+#define REGISTER_CPU(FP) \
+  REGISTER_KERNEL_BUILDER( \
+      Name("Tanh4Nvnmd").Device(DEVICE_CPU).TypeConstraint<FP>("T"), \
+      Tanh4NvnmdOp<CPUDevice, FP>);
+REGISTER_CPU(float);   // kernel for T = float
+REGISTER_CPU(double);  // kernel for T = double
+
diff --git a/source/tests/nvnmd/config_ref.npy b/source/tests/nvnmd/config_ref.npy
new file mode 100644
index 0000000000..1a3b25f5fa
Binary files /dev/null and b/source/tests/nvnmd/config_ref.npy differ
diff --git a/source/tests/nvnmd/map_ref.npy b/source/tests/nvnmd/map_ref.npy
new file mode 100644
index 0000000000..12ad8f2c94
Binary files /dev/null and b/source/tests/nvnmd/map_ref.npy differ
diff --git a/source/tests/nvnmd/model_ref.npy b/source/tests/nvnmd/model_ref.npy
new file mode 100644
index 0000000000..d8def608a0
Binary files /dev/null and b/source/tests/nvnmd/model_ref.npy differ
diff --git a/source/tests/nvnmd/train_ref.json b/source/tests/nvnmd/train_ref.json
new file mode 100644
index 0000000000..ab947028b8
--- /dev/null
+++ b/source/tests/nvnmd/train_ref.json
@@ -0,0 +1,40 @@
+{
+    "nvnmd":{
+        "net_size": 128,
+        "sel": [60, 60],
+        "rcut": 6.0,
+        "rcut_smth": 0.5
+    },
+    "learning_rate": {
+        "type": "exp",
+        "start_lr": 1e-3,
+        "stop_lr": 3e-8,
+        "decay_steps": 5000
+    },
+    "loss": {
+        "start_pref_e": 0.02,
+        "limit_pref_e": 1,
+        "start_pref_f": 1000,
+        "limit_pref_f": 1,
+        "start_pref_v": 0,
+        "limit_pref_v": 0
+    },
+    "training": {
+        "seed": 1,
+        "stop_batch": 200000,
+        "numb_test": 1,
+        "disp_file": "lcurve.out",
+        "disp_freq": 1000,
+        "save_ckpt": "model.ckpt",
+        "save_freq": 10000,
+        "training_data": {
+            "systems": [
+                "../data"
+            ],
+            "set_prefix": "set",
+            "batch_size": [
+                1
+            ]
+        }
+    }
+}
diff --git a/source/tests/nvnmd/train_ref2.json b/source/tests/nvnmd/train_ref2.json
new file mode 100644
index 0000000000..04aad33410
--- /dev/null
+++ b/source/tests/nvnmd/train_ref2.json
@@ -0,0 +1,77 @@
+{
+    "nvnmd": {
+        "net_size": 128,
+        "config_file": "none",
+        "weight_file": "none",
+        "map_file": "none",
+        "enable": true,
+        "restore_descriptor": false,
+        "restore_fitting_net": false,
+        "quantize_descriptor": false,
+        "quantize_fitting_net": false
+    },
+    "learning_rate": {
+        "type": "exp",
+        "start_lr": 0.001,
+        "stop_lr": 3e-08,
+        "decay_steps": 5000
+    },
+    "loss": {
+        "start_pref_e": 0.02,
+        "limit_pref_e": 1,
+        "start_pref_f": 1000,
+        "limit_pref_f": 1,
+        "start_pref_v": 0,
+        "limit_pref_v": 0
+    },
+    "training": {
+        "seed": 1,
+        "stop_batch": 200000,
+        "numb_test": 1,
+        "disp_file": "nvnmd_cnn/lcurve.out",
+        "disp_freq": 1000,
+        "save_ckpt": "nvnmd_cnn/model.ckpt",
+        "save_freq": 10000,
+        "training_data": {
+            "systems": [
+                "../data"
+            ],
+            "set_prefix": "set",
+            "batch_size": [
+                1
+            ]
+        },
+        "disp_training": true,
+        "time_training": true,
+        "profiling": false
+    },
+    "model": {
+        "descriptor": {
+            "seed": 1,
+            "type": "se_a",
+            "sel": [
+                60,
+                60
+            ],
+            "rcut": 6.0,
+            "rcut_smth": 0.5,
+            "neuron": [
+                8,
+                16,
+                32
+            ],
+            "type_one_side": true,
+            "axis_neuron": 4,
+            "resnet_dt": false
+        },
+        "fitting_net": {
+            "seed": 1,
+            "neuron": [
+                128,
+                128,
+                128
+            ],
+            "resnet_dt": false
+        }
+    }
+}
\ No newline at end of file
diff --git a/source/tests/nvnmd/train_ref3.json b/source/tests/nvnmd/train_ref3.json
new file mode 100644
index 0000000000..5eed614d6d
--- /dev/null
+++ b/source/tests/nvnmd/train_ref3.json
@@ -0,0 +1,77 @@
+{
+    "nvnmd": {
+        "net_size": 128,
+        "config_file": "none",
+        "weight_file": "none",
+        "map_file": "none",
+        "enable": true,
+        "restore_descriptor": true,
+        "restore_fitting_net": true,
+        "quantize_descriptor": true,
+        "quantize_fitting_net": true
+    },
+    "learning_rate": {
+        "type": "exp",
+        "start_lr": 0.001,
+        "stop_lr": 3e-08,
+        "decay_steps": 5000
+    },
+    "loss": {
+        "start_pref_e": 0.02,
+        "limit_pref_e": 1,
+        "start_pref_f": 1000,
+        "limit_pref_f": 1,
+        "start_pref_v": 0,
+        "limit_pref_v": 0
+    },
+    "training": {
+        "seed": 1,
+        "stop_batch": 200000,
+        "numb_test": 1,
+        "disp_file": "nvnmd_qnn/lcurve.out",
+        "disp_freq": 1000,
+        "save_ckpt": "nvnmd_qnn/model.ckpt",
+        "save_freq": 10000,
+        "training_data": {
+            "systems": [
+                "../data"
+            ],
+            "set_prefix": "set",
+            "batch_size": [
+                1
+            ]
+        },
+        "disp_training": true,
+        "time_training": true,
+        "profiling": false
+    },
+    "model": {
+        "descriptor": {
+            "seed": 1,
+            "type": "se_a",
+            "sel": [
+                60,
+                60
+            ],
+            "rcut": 6.0,
+            "rcut_smth": 0.5,
+            "neuron": [
+                8,
+                16,
+                32
+            ],
+            "type_one_side": true,
+            "axis_neuron": 4,
+            "resnet_dt": false
+        },
+        "fitting_net": {
+            "seed": 1,
+            "neuron": [
+                128,
+                128,
+                128
+            ],
+            "resnet_dt": false
+        }
+    }
+}
\ No newline at end of file
diff --git a/source/tests/nvnmd/weight_ref.npy b/source/tests/nvnmd/weight_ref.npy
new file mode 100644
index 0000000000..95ea2ea65e
Binary files /dev/null and b/source/tests/nvnmd/weight_ref.npy differ
diff --git a/source/tests/test_nvnmd_entrypoints.py b/source/tests/test_nvnmd_entrypoints.py
new file mode 100644
index 0000000000..9525c087a6
--- /dev/null
+++ b/source/tests/test_nvnmd_entrypoints.py
@@ -0,0 +1,157 @@
+import os
+import numpy as np
+import unittest
+
+from common import tests_path
+
+from deepmd.env import tf
+from deepmd.nvnmd.utils.fio import FioNpyDic, FioJsonDic, FioBin
+from deepmd.nvnmd.entrypoints.freeze import save_weight
+from deepmd.nvnmd.entrypoints.mapt import mapt
+from deepmd.nvnmd.entrypoints.train import normalized_input, normalized_input_qnn
+from deepmd.nvnmd.entrypoints.wrap import wrap
+from deepmd.nvnmd.utils.config import nvnmd_cfg
+from deepmd.nvnmd.data.data import jdata_deepmd_input
+
+
+class TestNvnmdFreeze(tf.test.TestCase):
+    def setUp(self):
+        tf.reset_default_graph()
+        config = tf.ConfigProto()
+        if int(os.environ.get("DP_AUTO_PARALLELIZATION", 0)):
+            config.graph_options.rewrite_options.custom_optimizers.add().name = "dpparallel"
+        self.sess = self.test_session(config=config).__enter__()
+
+    def test_freeze(self):
+        namelist = (
+            "descrpt_attr/t_avg",
+            "descrpt_attr/t_std",
+            "filter_type_0/matrix_0_0",
+            "filter_type_0/bias_0_0",
+            "layer_0_type_0/matrix",
+            "layer_0_type_0/bias",
+            "final_layer_type_0/matrix",
+            "final_layer_type_0/bias",
+        )
+        tvlist = []
+        save_path = str(tests_path / os.path.join("nvnmd", "weight.npy"))
+        vinit = tf.random_normal_initializer(stddev=1.0, seed=0)
+        for sname in namelist:
+            scope, name = sname.split('/')[0:2]
+            with tf.variable_scope(scope, reuse=False):
+                tv = tf.get_variable(name, [1], tf.float32, vinit)
+                tvlist.append(tv)
+        #
+        self.sess.run(tf.global_variables_initializer())
+        save_weight(self.sess, save_path)
+        weight = FioNpyDic().load(save_path)
+        namelist = [sname.replace('/', '.') for sname in namelist]
+        print(namelist)
+        print(list(weight.keys()))
+        np.testing.assert_equal(namelist, list(weight.keys()))
+        tf.reset_default_graph()
+
+
+class TestNvnmdMapt(tf.test.TestCase):
+    def setUp(self):
+        config = tf.ConfigProto()
+        if int(os.environ.get("DP_AUTO_PARALLELIZATION", 0)):
+            config.graph_options.rewrite_options.custom_optimizers.add().name = "dpparallel"
+        self.sess = self.test_session(config=config).__enter__()
+
+    def test_mapt(self):
+        nvnmd_config = str(tests_path / os.path.join("nvnmd", "config_ref.npy"))
+        nvnmd_weight = str(tests_path / os.path.join("nvnmd", "weight_ref.npy"))
+        nvnmd_map = str(tests_path / os.path.join("nvnmd", "map.npy"))
+        jdata = {
+            'nvnmd_config': nvnmd_config,
+            'nvnmd_weight': nvnmd_weight,
+            'nvnmd_map': nvnmd_map,
+        }
+        mapt(**jdata)
+        #
+        data = FioNpyDic().load(nvnmd_map)
+        #
+        nvnmd_map2 = str(tests_path / os.path.join("nvnmd", "map_ref.npy"))
+        data2 = FioNpyDic().load(nvnmd_map2)
+        keys = [
+            'r2',
+            's2',
+            's_t0_t0',
+            'sr_t0_t0',
+            'ds_dr2_t0_t0',
+            'dsr_dr2_t0_t0',
+            'G_t0_t0',
+            'dG_ds_t0_t0',
+            's_t0_t1',
+            'sr_t0_t1',
+            'ds_dr2_t0_t1',
+            'dsr_dr2_t0_t1',
+            'G_t0_t1',
+            'dG_ds_t0_t1'
+        ]
+        np.testing.assert_equal(keys, list(data.keys()))
+        np.testing.assert_almost_equal(data['G_t0_t0'], data2['G_t0_t0'])
+        tf.reset_default_graph()
+        # close NVNMD
+        jdata = jdata_deepmd_input['nvnmd']
+        jdata['config_file'] = "none"
+        jdata['weight_file'] = "none"
+        jdata['map_file'] = "none"
+        jdata['enable'] = False
+        nvnmd_cfg.init_from_jdata(jdata)
+
+class TestNvnmdTrain(tf.test.TestCase):
+    def test_train_input(self):
+        # test1
+        INPUT = str(tests_path / os.path.join("nvnmd", "train_ref.json"))
+        PATH_CNN = "nvnmd_cnn"
+        jdata = normalized_input(INPUT, PATH_CNN)
+        fn_ref = str(tests_path / os.path.join("nvnmd", "train_ref2.json"))
+        FioJsonDic().save(fn_ref, jdata)
+        # test2
+        PATH_QNN = "nvnmd_qnn"
+        CONFIG_CNN = "none"
+        WEIGHT_CNN = "none"
+        MAP_CNN = "none"
+        jdata = normalized_input_qnn(jdata, PATH_QNN, CONFIG_CNN, WEIGHT_CNN, MAP_CNN)
+        fn_ref = str(tests_path / os.path.join("nvnmd", "train_ref3.json"))
+        FioJsonDic().save(fn_ref, jdata)
+        # close NVNMD
+        jdata = jdata_deepmd_input['nvnmd']
+        jdata['config_file'] = "none"
+        jdata['weight_file'] = "none"
+        jdata['map_file'] = "none"
+        jdata['enable'] = False
+        nvnmd_cfg.init_from_jdata(jdata)
+
+class TestNvnmdWrap(tf.test.TestCase):
+    def test_wrap(self):
+        nvnmd_config = str(tests_path / os.path.join("nvnmd", "config_ref.npy"))
+        nvnmd_weight = str(tests_path / os.path.join("nvnmd", "weight_ref.npy"))
+        nvnmd_map = str(tests_path / os.path.join("nvnmd", "map.npy"))
+        nvnmd_model = str(tests_path / os.path.join("nvnmd", "model.pb"))
+        jdata = {
+            'nvnmd_config': nvnmd_config,
+            'nvnmd_weight': nvnmd_weight,
+            'nvnmd_map': nvnmd_map,
+            'nvnmd_model': nvnmd_model,
+        }
+        wrap(**jdata)
+        # test
+        data = FioBin().load(nvnmd_model)
+        nvnmd_model2 = str(tests_path / os.path.join("nvnmd", "model_ref.npy"))
+        datas = ''.join([hex(d+256).replace('0x1', '') for d in data[::256]])
+        data2 = FioNpyDic().load(nvnmd_model2)['ref']
+        np.testing.assert_equal(datas, data2)
+        # close NVNMD
+        jdata = jdata_deepmd_input['nvnmd']
+        jdata['config_file'] = "none"
+        jdata['weight_file'] = "none"
+        jdata['map_file'] = "none"
+        jdata['enable'] = False
+        nvnmd_cfg.init_from_jdata(jdata)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/source/tests/test_nvnmd_op.py b/source/tests/test_nvnmd_op.py
new file mode 100644
index 0000000000..18d30244e8
--- /dev/null
+++ b/source/tests/test_nvnmd_op.py
@@ -0,0 +1,304 @@
+import os
+import sys
+import numpy as np
+import unittest
+
+import deepmd.op
+from deepmd.env import tf
+from deepmd.env import op_module
+from deepmd.env import GLOBAL_TF_FLOAT_PRECISION
+from deepmd.env import GLOBAL_NP_FLOAT_PRECISION
+from deepmd.env import GLOBAL_ENER_FLOAT_PRECISION
+
+
+def qr(x, nbit):  # quantize x to a multiple of 2**-nbit using np.round
+    return np.round(x * 2**nbit) / (2**nbit)
+
+
+def qf(x, nbit):  # quantize x to a multiple of 2**-nbit using np.floor
+    return np.floor(x * 2**nbit) / (2**nbit)
+
+
+class TestNvnmdMapOp(tf.test.TestCase):
+    def setUp(self):
+        config = tf.ConfigProto()
+        if int(os.environ.get("DP_AUTO_PARALLELIZATION", 0)):
+            config.graph_options.rewrite_options.custom_optimizers.add().name = "dpparallel"
+        self.sess = self.test_session(config=config).__enter__()
+        self.nbit_x = 14
+        self.nbit_xk = 10
+        self.nbit_yk = 10
+        self.prec_x = 1.0/2**self.nbit_x
+        self.prec_xk = 1.0/2**self.nbit_xk
+        self.prec_yk = 1.0/2**self.nbit_yk
+
+    def gen_map_table(self):
+        n = 2**self.nbit_xk
+        #
+        t = np.arange(n+1) / n
+        v = np.power(t, 3)
+        dv = 3 * np.power(t, 2)
+        dv2 = 6 * np.power(t, 1)
+        self.v = qr(v, self.nbit_yk).reshape([-1, 1])
+        self.dv = qr(dv, self.nbit_yk).reshape([-1, 1])
+        self.dv2 = qr(dv2, self.nbit_yk).reshape([-1, 1])
+
+    def map_nvnmd_py(self, x, v, d_v, dv, d_dv, prec_xk, nbit_yk):
+        pv = []
+        for ii in range(len(x)):
+            # y = vk + dvk * dxk
+            k = np.int32(np.floor(x[ii] / prec_xk))
+            xk = k * prec_xk
+            dxk = x[ii] - xk
+            vk = v[k]
+            dvk = d_v[k]
+            pvi = vk + dvk*dxk
+            pv.append(pvi)
+        pv = np.array(pv).reshape([-1, 1])
+        return pv
+
+    def test_map_op(self):
+        self.gen_map_table()
+        x = qr(np.random.rand(100)*0.9, self.nbit_x).reshape([-1, 1])
+        y = self.map_nvnmd_py(x, self.v, self.dv, self.dv, self.dv2, self.prec_xk, self.nbit_yk)
+        ty = op_module.map_nvnmd(x, self.v, self.dv, self.dv, self.dv2, self.prec_xk, self.nbit_yk)
+        self.sess.run(tf.global_variables_initializer())
+        typ = self.sess.run(ty)
+        np.testing.assert_almost_equal(typ, y, 5)
+
+
+class TestNvnmdMatmulOp(tf.test.TestCase):
+    def setUp(self):
+        config = tf.ConfigProto()
+        if int(os.environ.get("DP_AUTO_PARALLELIZATION", 0)):
+            config.graph_options.rewrite_options.custom_optimizers.add().name = "dpparallel"
+        self.sess = self.test_session(config=config).__enter__()
+        self.nbit = 13
+
+    def nvnmd_matmul_py(self, x, w, is_round, nbit, nbit2, nbit3):
+        if nbit < 0:
+            return np.matmul(x, w)
+        else:
+            if is_round == 1:
+                return qr(np.matmul(x, w), nbit)
+            else:
+                return qf(np.matmul(x, w), nbit)
+
+    def test_nvnmd_matmul(self):
+        N = 10
+        M = 10
+        K = 10
+        x = np.random.rand(N, M)
+        w = np.random.rand(M, K)
+        x = qr(x, self.nbit)
+        w = qr(x, self.nbit)
+        y = self.nvnmd_matmul_py(x, w, 1, self.nbit, self.nbit, -1)
+        ty = op_module.matmul_nvnmd(x, w, 1, self.nbit, self.nbit, -1)
+        self.sess.run(tf.global_variables_initializer())
+        typ = self.sess.run(ty)
+        np.testing.assert_almost_equal(typ, y, 5)
+
+
+class TestNvnmdQuantizeOp(tf.test.TestCase):
+    def setUp(self):
+        config = tf.ConfigProto()
+        if int(os.environ.get("DP_AUTO_PARALLELIZATION", 0)):
+            config.graph_options.rewrite_options.custom_optimizers.add().name = "dpparallel"
+        self.sess = self.test_session(config=config).__enter__()
+        self.nbit = 13
+
+    def nvnmd_quantize_py(self, x, is_round, nbit, nbit2, nbit3):
+        if nbit < 0:
+            return x
+        else:
+            if is_round == 1:
+                return qr(x, nbit)
+            else:
+                return qf(x, nbit)
+
+    def test_nvnmd_quantize(self):
+        N = 10
+        M = 10
+        x = np.random.rand(N, M)
+        y = self.nvnmd_quantize_py(x, 1, self.nbit, self.nbit, -1)
+        ty = op_module.quantize_nvnmd(x, 1, self.nbit, self.nbit, -1)
+        self.sess.run(tf.global_variables_initializer())
+        typ = self.sess.run(ty)
+        np.testing.assert_almost_equal(typ, y, 5)
+
+
+class TestNvnmdTanh2Op(tf.test.TestCase):
+    def setUp(self):
+        config = tf.ConfigProto()
+        if int(os.environ.get("DP_AUTO_PARALLELIZATION", 0)):
+            config.graph_options.rewrite_options.custom_optimizers.add().name = "dpparallel"
+        self.sess = self.test_session(config=config).__enter__()
+        self.nbit = 13
+
+    def nvnmd_tanh2_py(self, x, is_round, nbit, nbit2, nbit3):
+        if nbit < 0:
+            x1 = np.clip(x, -2, 2)
+            x2 = np.clip(x, -4, 4)
+            x1a = np.abs(x1)
+            x2a = np.abs(x2)
+            y1 = x1a - x1a * x1a * 0.25
+            y2 = x2a * 0.03125 - x2a * x2a * 0.00390625
+            y = y1 + y2
+            y = y * np.sign(x1)
+        else:
+            if is_round:
+                x = qr(x, nbit)
+                x1 = np.clip(x, -2, 2)
+                x2 = np.clip(x, -4, 4)
+                x1a = np.abs(x1)
+                x2a = np.abs(x2)
+                y1 = x1a - x1a * x1a * 0.25
+                y2 = x2a * 0.03125 - x2a * x2a * 0.00390625
+                y = qr(y1, nbit) + qr(y2, nbit)
+                y = y * np.sign(x1)
+            else:
+                x = qf(x, nbit)
+                x1 = np.clip(x, -2, 2)
+                x2 = np.clip(x, -4, 4)
+                x1a = np.abs(x1)
+                x2a = np.abs(x2)
+                y1 = x1a - x1a * x1a * 0.25
+                y2 = x2a * 0.03125 - x2a * x2a * 0.00390625
+                y = qf(y1, nbit) + qf(y2, nbit)
+                y = y * np.sign(x1)
+        return y
+
+    def test_nvnmd_tanh2(self):
+        N = 10
+        M = 10
+        x = np.random.rand(N, M)
+        y = self.nvnmd_tanh2_py(x, 1, self.nbit, self.nbit, -1)
+        ty = op_module.tanh2_nvnmd(x, 1, self.nbit, self.nbit, -1)
+        self.sess.run(tf.global_variables_initializer())
+        typ = self.sess.run(ty)
+        np.testing.assert_almost_equal(typ, y, 5)
+
+
+class TestNvnmdTanh4Op(tf.test.TestCase):
+    """Compare the ``tanh4_nvnmd`` custom op with a NumPy reference model."""
+
+    def setUp(self):
+        """Open a default test session and fix the quantization bit width."""
+        session_config = tf.ConfigProto()
+        parallel = int(os.environ.get("DP_AUTO_PARALLELIZATION", 0))
+        if parallel:
+            optimizer = session_config.graph_options.rewrite_options.custom_optimizers.add()
+            optimizer.name = "dpparallel"
+        self.sess = self.test_session(config=session_config).__enter__()
+        self.nbit = 13
+
+    def nvnmd_tanh4_py(self, x, is_round, nbit, nbit2, nbit3):
+        """Evaluate the reference activation in NumPy.
+
+        The input is clipped to [-2, 2] and the polynomial
+        ``xx * (xx / 16 - |x| / 4) + |x|`` (``xx = x * x``) is evaluated on
+        ``|x|``; the sign of the clipped input is restored at the end.
+        A negative ``nbit`` keeps full precision.  Otherwise ``xx`` and the
+        polynomial value are quantized with ``qr`` when ``is_round`` is
+        truthy and with ``qf`` when it is not.  ``nbit2`` and ``nbit3`` are
+        accepted but not used by this reference.
+        """
+        clipped = np.clip(x, -2, 2)
+        magnitude = np.abs(clipped)
+        if nbit < 0:
+            square = clipped * clipped
+            poly = square * (square * 0.0625 - magnitude * 0.25) + magnitude
+        else:
+            quantize = qr if is_round else qf
+            square = quantize(clipped * clipped, nbit)
+            poly = quantize(square * (square * 0.0625 - magnitude * 0.25) + magnitude, nbit)
+        return poly * np.sign(clipped)
+
+    def test_nvnmd_tanh4(self):
+        """The op must match the reference to 5 decimals on random input."""
+        bits = self.nbit
+        sample = np.random.rand(10, 10)
+        expected = self.nvnmd_tanh4_py(sample, 1, bits, bits, -1)
+        op_out = op_module.tanh4_nvnmd(sample, 1, bits, bits, -1)
+        self.sess.run(tf.global_variables_initializer())
+        np.testing.assert_almost_equal(self.sess.run(op_out), expected, 5)
+
+
+class TestProdEnvMatANvnmdQuantize(tf.test.TestCase):
+    def setUp(self):
+        config = tf.ConfigProto()
+        if int(os.environ.get("DP_AUTO_PARALLELIZATION", 0)):
+            config.graph_options.rewrite_options.custom_optimizers.add().name = "dpparallel"
+        self.sess = self.test_session(config=config).__enter__()
+
+    def prod_env_mat_a_nvnmd_quantize_py(self):
+        coord = [
+            12.83, 2.56, 2.18,
+            12.09, 2.87, 2.74,
+            0.25, 3.32, 1.68,
+            3.36, 3.00, 1.81,
+            3.51, 2.51, 2.60,
+            4.27, 3.22, 1.56]
+        coord = np.reshape(np.array(coord), [1, -1])
+        # one type index per coordinate triple (6 entries), stored as a single row
+        atype = [
+            0, 1, 1,
+            0, 1, 1]
+        atype = np.reshape(np.array(atype), [1, -1])
+        # 3x3 cell flattened to 9 values: 13 on the diagonal, 0 elsewhere
+        box = [
+            13., 0., 0.,
+            0., 13., 0.,
+            0., 0., 13.]
+        box = np.reshape(np.array(box), [1, -1])
+        # atom-count vector as int32 (presumably [nloc, nall, n_type0, n_type1] -- confirm)
+        natoms = [6, 6, 3, 3]
+        natoms = np.int32(np.array(natoms))
+        # mesh input and the avg/std inputs (all-zero average, all-one std)
+        mesh = np.int32(np.array([0, 0, 0, 2, 2, 2]))
+        t_avg = np.zeros([2, 6*4])
+        t_std = np.ones([2, 6*4])
+        # stored reference for the first op output (compared with typ in the test)
+        y = [
+            1.279150390625000000e+01, 3.530029296875000000e+00, 4.399414062500000000e-01, -3.699951171875000000e-01, 0.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00, 9.572753906250000000e-01, -7.399902343750000000e-01, 3.100585937500000000e-01, 5.600585937500000000e-01, 1.004028320312500000e+00, 4.200439453125000000e-01, 7.600097656250000000e-01, -5.000000000000000000e-01, 1.372167968750000000e+01, 3.680053710937500000e+00, -5.004882812500000000e-02, 4.200439453125000000e-01, 2.053308105468750000e+01, 4.439941406250000000e+00, 6.600341796875000000e-01, -6.199951171875000000e-01,
+            9.572753906250000000e-01, 7.399902343750000000e-01, -3.100585937500000000e-01, -5.600585937500000000e-01, 1.911486816406250000e+01, 4.270019531250000000e+00, 1.300048828125000000e-01, -9.300537109375000000e-01, 2.671752929687500000e+00, 1.160034179687500000e+00, 4.499511718750000000e-01, -1.060058593750000000e+00, 1.968591308593750000e+01, 4.420043945312500000e+00, -3.599853515625000000e-01, -1.400146484375000000e-01, 2.834790039062500000e+01, 5.180053710937500000e+00, 3.499755859375000000e-01, -1.180053710937500000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00,
+            1.004028320312500000e+00, -4.200439453125000000e-01, -7.600097656250000000e-01, 5.000000000000000000e-01, 9.791259765625000000e+00, 3.109985351562500000e+00, -3.199462890625000000e-01, 1.300048828125000000e-01, 2.671752929687500000e+00, -1.160034179687500000e+00, -4.499511718750000000e-01, 1.060058593750000000e+00, 1.213024902343750000e+01, 3.260009765625000000e+00, -8.100585937500000000e-01, 9.200439453125000000e-01, 1.618493652343750000e+01, 4.020019531250000000e+00, -9.997558593750000000e-02, -1.199951171875000000e-01, 0.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00,
+            1.279150390625000000e+01, -3.530029296875000000e+00, -4.399414062500000000e-01, 3.699951171875000000e-01, 0.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00, 8.867187500000000000e-01, 1.500244140625000000e-01, -4.899902343750000000e-01, 7.900390625000000000e-01, 9.389648437500000000e-01, 9.100341796875000000e-01, 2.199707031250000000e-01, -2.500000000000000000e-01, 9.791259765625000000e+00, -3.109985351562500000e+00, 3.199462890625000000e-01, -1.300048828125000000e-01, 1.911486816406250000e+01, -4.270019531250000000e+00, -1.300048828125000000e-01, 9.300537109375000000e-01,
+            8.867187500000000000e-01, -1.500244140625000000e-01, 4.899902343750000000e-01, -7.900390625000000000e-01, 1.372167968750000000e+01, -3.680053710937500000e+00, 5.004882812500000000e-02, -4.200439453125000000e-01, 2.163330078125000000e+00, 7.600097656250000000e-01, 7.099609375000000000e-01, -1.040039062500000000e+00, 1.213024902343750000e+01, -3.260009765625000000e+00, 8.100585937500000000e-01, -9.200439453125000000e-01, 1.968591308593750000e+01, -4.420043945312500000e+00, 3.599853515625000000e-01, 1.400146484375000000e-01, 0.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00,
+            9.389648437500000000e-01, -9.100341796875000000e-01, -2.199707031250000000e-01, 2.500000000000000000e-01, 2.053308105468750000e+01, -4.439941406250000000e+00, -6.600341796875000000e-01, 6.199951171875000000e-01, 2.163330078125000000e+00, -7.600097656250000000e-01, -7.099609375000000000e-01, 1.040039062500000000e+00, 1.618493652343750000e+01, -4.020019531250000000e+00, 9.997558593750000000e-02, 1.199951171875000000e-01, 2.834790039062500000e+01, -5.180053710937500000e+00, -3.499755859375000000e-01, 1.180053710937500000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00]
+        y = np.array(y).reshape([1, -1])
+        # stored reference for the second op output (compared with tdyp in the test)
+        dy = [
+            -7.060058593750000000e+00, -8.798828125000000000e-01, 7.399902343750000000e-01, -1.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00, -1.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00, -1.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00, 1.479980468750000000e+00, -6.201171875000000000e-01, -1.120117187500000000e+00, -1.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00, -1.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00, -1.000000000000000000e+00, -8.400878906250000000e-01, -1.520019531250000000e+00, 1.000000000000000000e+00, -1.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00, -1.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00, -1.000000000000000000e+00, -7.360107421875000000e+00, 1.000976562500000000e-01, -8.400878906250000000e-01, -1.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00, -1.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00, -1.000000000000000000e+00, -8.879882812500000000e+00, -1.320068359375000000e+00, 1.239990234375000000e+00, -1.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00, -1.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00, -1.000000000000000000e+00,
+            -1.479980468750000000e+00, 6.201171875000000000e-01, 1.120117187500000000e+00, -1.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00, -1.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00, -1.000000000000000000e+00, -8.540039062500000000e+00, -2.600097656250000000e-01, 1.860107421875000000e+00, -1.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00, -1.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00, -1.000000000000000000e+00, -2.320068359375000000e+00, -8.999023437500000000e-01, 2.120117187500000000e+00, -1.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00, -1.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00, -1.000000000000000000e+00, -8.840087890625000000e+00, 7.199707031250000000e-01, 2.800292968750000000e-01, -1.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00, -1.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00, -1.000000000000000000e+00, -1.036010742187500000e+01, -6.999511718750000000e-01, 2.360107421875000000e+00, -1.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00, -1.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00, -1.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00,
+            8.400878906250000000e-01, 1.520019531250000000e+00, -1.000000000000000000e+00, -1.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00, -1.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00, -1.000000000000000000e+00, -6.219970703125000000e+00, 6.398925781250000000e-01, -2.600097656250000000e-01, -1.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00, -1.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00, -1.000000000000000000e+00, 2.320068359375000000e+00, 8.999023437500000000e-01, -2.120117187500000000e+00, -1.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00, -1.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00, -1.000000000000000000e+00, -6.520019531250000000e+00, 1.620117187500000000e+00, -1.840087890625000000e+00, -1.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00, -1.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00, -1.000000000000000000e+00, -8.040039062500000000e+00, 1.999511718750000000e-01, 2.399902343750000000e-01, -1.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00, -1.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00, -1.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00,
+            7.060058593750000000e+00, 8.798828125000000000e-01, -7.399902343750000000e-01, -1.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00, -1.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00, -1.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00, -3.000488281250000000e-01, 9.799804687500000000e-01, -1.580078125000000000e+00, -1.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00, -1.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00, -1.000000000000000000e+00, -1.820068359375000000e+00, -4.399414062500000000e-01, 5.000000000000000000e-01, -1.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00, -1.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00, -1.000000000000000000e+00, 6.219970703125000000e+00, -6.398925781250000000e-01, 2.600097656250000000e-01, -1.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00, -1.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00, -1.000000000000000000e+00, 8.540039062500000000e+00, 2.600097656250000000e-01, -1.860107421875000000e+00, -1.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00, -1.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00, -1.000000000000000000e+00,
+            3.000488281250000000e-01, -9.799804687500000000e-01, 1.580078125000000000e+00, -1.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00, -1.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00, -1.000000000000000000e+00, 7.360107421875000000e+00, -1.000976562500000000e-01, 8.400878906250000000e-01, -1.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00, -1.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00, -1.000000000000000000e+00, -1.520019531250000000e+00, -1.419921875000000000e+00, 2.080078125000000000e+00, -1.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00, -1.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00, -1.000000000000000000e+00, 6.520019531250000000e+00, -1.620117187500000000e+00, 1.840087890625000000e+00, -1.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00, -1.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00, -1.000000000000000000e+00, 8.840087890625000000e+00, -7.199707031250000000e-01, -2.800292968750000000e-01, -1.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00, -1.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00, -1.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00,
+            1.820068359375000000e+00, 4.399414062500000000e-01, -5.000000000000000000e-01, -1.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00, -1.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00, -1.000000000000000000e+00, 8.879882812500000000e+00, 1.320068359375000000e+00, -1.239990234375000000e+00, -1.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00, -1.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00, -1.000000000000000000e+00, 1.520019531250000000e+00, 1.419921875000000000e+00, -2.080078125000000000e+00, -1.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00, -1.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00, -1.000000000000000000e+00, 8.040039062500000000e+00, -1.999511718750000000e-01, -2.399902343750000000e-01, -1.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00, -1.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00, -1.000000000000000000e+00, 1.036010742187500000e+01, 6.999511718750000000e-01, -2.360107421875000000e+00, -1.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00, -1.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00, -1.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00, 0.000000000000000000e+00]
+        dy = np.array(dy).reshape([1, -1])
+        return coord, atype, natoms, box, mesh, t_avg, t_std, y, dy
+
+    def test_prod_env_mat_a_nvnmd_quantize(self):
+        coord, atype, natoms, box, mesh, t_avg, t_std, y, dy = self.prod_env_mat_a_nvnmd_quantize_py()
+        ty, tdy, trij, tnlist = op_module.prod_env_mat_a_nvnmd_quantize(
+            coord,
+            atype,
+            natoms,
+            box,
+            mesh,
+            t_avg,
+            t_std,
+            rcut_a=0,
+            rcut_r=6.0,
+            rcut_r_smth=0.5,
+            sel_a=[2, 4],
+            sel_r=[0, 0]
+        )
+        self.sess.run(tf.global_variables_initializer())
+        typ, tdyp, trijp, tnlistp = self.sess.run([ty, tdy, trij, tnlist])  # trijp and tnlistp are fetched but not checked
+        np.testing.assert_almost_equal(typ, y, 5)  # both stored references must match to 5 decimals
+        np.testing.assert_almost_equal(tdyp, dy, 5)
+
+
+if __name__ == '__main__':  # run the tests of this module when it is executed as a script
+    unittest.main()
diff --git a/source/tests/test_nvnmd_se_a.py b/source/tests/test_nvnmd_se_a.py
new file mode 100644
index 0000000000..bc5cb34c98
--- /dev/null
+++ b/source/tests/test_nvnmd_se_a.py
@@ -0,0 +1,134 @@
+
+import dpdata,os,sys,unittest
+import numpy as np
+from deepmd.env import tf
+import pickle
+from common import Data, gen_data, j_loader
+
+from deepmd.utils.data_system import DataSystem
+from deepmd.descriptor import DescrptSeA
+from deepmd.fit import EnerFitting
+from deepmd.model import EnerModel
+from deepmd.common import j_must_have
+from deepmd.utils.type_embed import embed_atom_type, TypeEmbedNet
+
+GLOBAL_ENER_FLOAT_PRECISION = tf.float64  # not referenced elsewhere in this test module
+GLOBAL_TF_FLOAT_PRECISION = tf.float64  # dtype of the i_coord and i_box placeholders
+GLOBAL_NP_FLOAT_PRECISION = np.float64  # not referenced elsewhere in this test module
+
+#
+from common import tests_path
+from deepmd.nvnmd.utils.config import nvnmd_cfg
+from deepmd.nvnmd.data.data import jdata_deepmd_input
+
+
+class TestModel(tf.test.TestCase):
+    """Check the quantized NVNMD ``se_a`` descriptor against reference values."""
+
+    def setUp(self):
+        """Generate one frame of GeTe test data.
+
+        64 sites on a 4x4x4 grid with spacing ``a`` are split by the parity
+        of ``ix + iy + iz`` into 32 atoms of type 0 and 32 atoms of type 1.
+        """
+        a = 3.01585  # crystal constant
+        natom = 64
+        # cubic cell with edge 4 * a, stored as a flattened 3x3 matrix
+        box = (np.eye(3) * (a * 4)).reshape([-1, 9])
+        # the first 32 atoms are type 0, the remaining 32 are type 1
+        types = np.zeros([natom], dtype=np.int32)
+        types[32:] = 1
+        types = types.reshape([1, natom])
+        # even-parity sites fill rows 0..31, odd-parity sites rows 32..63
+        coord = np.zeros([natom, 3])
+        next_row = [0, 32]
+        for ix, iy, iz in np.ndindex(4, 4, 4):
+            parity = (ix + iy + iz) % 2
+            coord[next_row[parity]] = np.array([ix*a, iy*a, iz*a])
+            next_row[parity] += 1
+        self.box = box
+        self.types = types
+        self.coord = coord.reshape([1, natom*3])
+        self.natoms = np.array([64, 64, 32, 32])
+        self.mesh = np.array([0, 0, 0, 2, 2, 2])
+
+    def test_descriptor_one_side_qnn(self):
+        """Test se_a of NVNMD with quantized values.
+
+        Reference:
+            test_descrpt_se_a_type.py
+
+        Note:
+            test_nvnmd_se_a.py must be run after test_nvnmd_entrypoints.py,
+            because the data file map.npy is generated by running
+            test_nvnmd_entrypoints.py.
+        """
+        tf.reset_default_graph()
+        # reference inputs are read from <tests_path>/nvnmd
+        ref_dir = tests_path / "nvnmd"
+        # open NVNMD
+        # NOTE(review): this edits the shared jdata_deepmd_input['nvnmd'] dict in
+        # place; the file entries set here stay set after the test.
+        jdata_cf = jdata_deepmd_input['nvnmd']
+        jdata_cf['config_file'] = str(ref_dir / "config_ref.npy")
+        jdata_cf['weight_file'] = str(ref_dir / "weight_ref.npy")
+        jdata_cf['map_file'] = str(ref_dir / "map.npy")
+        jdata_cf['enable'] = True
+        nvnmd_cfg.init_from_jdata(jdata_cf)
+        try:
+            nvnmd_cfg.quantize_descriptor = True
+            nvnmd_cfg.restore_descriptor = True
+            # load input
+            jdata = j_loader(str(ref_dir / "train_ref2.json"))
+            ntypes = nvnmd_cfg.dscp['ntype']
+
+            # build descriptor
+            jdata['model']['descriptor'].pop('type', None)
+            descrpt = DescrptSeA(**jdata['model']['descriptor'], uniform_seed=True)
+
+            t_coord = tf.placeholder(GLOBAL_TF_FLOAT_PRECISION, [None, None], name='i_coord')
+            t_type = tf.placeholder(tf.int32, [None, None], name='i_type')
+            t_natoms = tf.placeholder(tf.int32, [ntypes+2], name='i_natoms')
+            t_box = tf.placeholder(GLOBAL_TF_FLOAT_PRECISION, [None, 9], name='i_box')
+            t_mesh = tf.placeholder(tf.int32, [None], name='i_mesh')
+
+            dout = descrpt.build(
+                t_coord,
+                t_type,
+                t_natoms,
+                t_box,
+                t_mesh,
+                {},
+                reuse=False,
+                suffix="_se_a_nvnmd"
+            )
+            # data
+            feed_dict_test = {
+                t_coord: self.coord,
+                t_box: self.box,
+                t_type: self.types,
+                t_natoms: self.natoms,
+                t_mesh: self.mesh,
+            }
+            # run; leaving the ``with`` block releases the default session
+            with self.test_session() as sess:
+                sess.run(tf.global_variables_initializer())
+                model_dout = sess.run(dout, feed_dict=feed_dict_test)
+            model_dout = model_dout.reshape([-1])
+            # compare the first ten descriptor values with the reference
+            ref_dout = [
+                0.0067138671875, 0.0078125, -0.02587890625, 0.0081787109375, 0.0091552734375,
+                -0.0302734375, 0.0096435546875, 0.01220703125, 0.0986328125, -0.03173828125
+            ]
+            places = 10
+            np.testing.assert_almost_equal(model_dout[0:10], ref_dout, places)
+        finally:
+            # close NVNMD even when the test fails, so that the enabled global
+            # configuration does not leak into tests that run afterwards
+            jdata_cf['enable'] = False
+            nvnmd_cfg.init_from_jdata(jdata_cf)
+
+        
+if __name__ == '__main__':  # run the tests of this module when it is executed as a script
+    unittest.main()
+        
diff --git a/source/tests/test_nvnmd_utils.py b/source/tests/test_nvnmd_utils.py
new file mode 100644
index 0000000000..057dc74a27
--- /dev/null
+++ b/source/tests/test_nvnmd_utils.py
@@ -0,0 +1,57 @@
+import os
+import numpy as np
+import unittest
+
+from deepmd.env import tf
+from deepmd.nvnmd.utils.network import one_layer
+from deepmd.nvnmd.utils.config import nvnmd_cfg
+from deepmd.nvnmd.data.data import jdata_deepmd_input
+
+
+class TestNvnmdNetwork(tf.test.TestCase):
+    """Check the NVNMD ``one_layer`` network helper against reference values."""
+
+    def setUp(self):
+        """Start from an empty default graph and open a test session."""
+        tf.reset_default_graph()
+        config = tf.ConfigProto()
+        if int(os.environ.get("DP_AUTO_PARALLELIZATION", 0)):
+            config.graph_options.rewrite_options.custom_optimizers.add().name = "dpparallel"
+        self.sess = self.test_session(config=config).__enter__()
+
+    def test_onelayer(self):
+        """Compare ``one_layer`` output for fixed weights with stored values."""
+        # open NVNMD
+        jdata = jdata_deepmd_input['nvnmd']
+        jdata['config_file'] = "none"
+        jdata['weight_file'] = "none"
+        jdata['map_file'] = "none"
+        jdata['enable'] = True
+        nvnmd_cfg.init_from_jdata(jdata)
+        try:
+            # fixed matrix and bias stored in the global NVNMD configuration
+            w = np.array([-0.313429, 0.783452, -0.423276, 0.832279]).reshape(4, 1)
+            b = np.array([0.3482787]).reshape([1, 1])
+            nvnmd_cfg.weight = {"nvnmd.matrix": w, "nvnmd.bias": b}
+            nvnmd_cfg.quantize_fitting_net = True
+            nvnmd_cfg.restore_fitting_net = True
+            # build
+            x = np.array([-0.313429, 1.436861, 0.324769, -1.4823674,
+                          0.783452, -0.171208, -0.033421, -1.324673]).reshape([2, 4])
+            y = np.array([0.19897461, -0.86706543]).reshape([-1])
+            ty = one_layer(tf.constant(x), 1, name="nvnmd")
+            # run
+            self.sess.run(tf.global_variables_initializer())
+            typ = self.sess.run(ty).reshape([-1])
+            np.testing.assert_almost_equal(typ, y, 5)
+        finally:
+            # close NVNMD even if the comparison failed, so the enabled global
+            # settings and the injected weights cannot leak into later tests
+            jdata['enable'] = False
+            nvnmd_cfg.init_from_jdata(jdata)
+            nvnmd_cfg.weight = {}
+            nvnmd_cfg.quantize_fitting_net = False
+            nvnmd_cfg.restore_fitting_net = False
+
+if __name__ == '__main__':  # run the tests of this module when it is executed as a script
+    unittest.main()