diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 1922ea33f8..ca8be9147d 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,67 +1,49 @@ -# See https://pre-commit.com for more information -# See https://pre-commit.com/hooks.html for more hooks repos: -- repo: https://github.com/pre-commit/pre-commit-hooks - rev: v4.4.0 - hooks: - - id: trailing-whitespace - exclude: "^.+\\.pbtxt$" - - id: end-of-file-fixer - exclude: "^.+\\.pbtxt$" - - id: check-yaml - #- id: check-json - - id: check-added-large-files - - id: check-merge-conflict - - id: check-symlinks - - id: check-toml -# Python -- repo: https://github.com/psf/black - rev: 23.3.0 - hooks: - - id: black-jupyter -- repo: https://github.com/PyCQA/isort - rev: 5.12.0 - hooks: - - id: isort - files: \.py$ -- repo: https://github.com/charliermarsh/ruff-pre-commit - # Ruff version. - rev: v0.0.269 - hooks: - - id: ruff - args: ["--fix"] -# numpydoc -- repo: https://github.com/Carreau/velin - rev: 0.0.12 - hooks: - - id: velin - args: ["--write"] -# Python inside docs -- repo: https://github.com/asottile/blacken-docs - rev: 1.13.0 - hooks: - - id: blacken-docs -# C++ -- repo: https://github.com/pre-commit/mirrors-clang-format - rev: v16.0.4 - hooks: - - id: clang-format - exclude: ^source/3rdparty|source/lib/src/cuda/cudart/.+\.inc -# CSS -- repo: https://github.com/pre-commit/mirrors-csslint - rev: v1.0.5 - hooks: - - id: csslint -# Shell -- repo: https://github.com/scop/pre-commit-shfmt - rev: v3.6.0-2 - hooks: - - id: shfmt -# CMake -- repo: https://github.com/cheshirekow/cmake-format-precommit - rev: v0.6.13 - hooks: - - id: cmake-format - #- id: cmake-lint -ci: - autoupdate_branch: devel + - repo: https://github.com/PyCQA/isort + rev: 5.11.5 + hooks: + - id: isort + args: ["--multi-line=7", "--sl"] + + - repo: https://github.com/psf/black + rev: 22.3.0 + hooks: + - id: black + +# - repo: https://github.com/charliermarsh/ruff-pre-commit +# rev: 'v0.0.272' +# hooks: +# - id: ruff + + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: a11d9314b22d8f8c7556443875b731ef05965464 + hooks: + - id: check-merge-conflict + - id: check-symlinks + - id: detect-private-key + files: (?!.*paddle)^.*$ + - id: end-of-file-fixer + files: \.md$ + - id: trailing-whitespace + files: \.md$ + + - repo: https://github.com/Lucas-C/pre-commit-hooks + rev: v1.0.1 + hooks: + - id: forbid-crlf + files: \.md$ + - id: remove-crlf + files: \.md$ + - id: forbid-tabs + files: \.md$ + - id: remove-tabs + files: \.md$ + +# - repo: local +# hooks: +# - id: clang-format +# name: clang-format +# description: Format files with ClangFormat +# entry: bash .clang_format.hook -i +# language: system +# files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|cuh|proto)$ diff --git a/deepmd/common.py b/deepmd/common.py index d0afbf0784..2f8334f9f5 100644 --- a/deepmd/common.py +++ b/deepmd/common.py @@ -2,39 +2,29 @@ import json import warnings -from functools import ( - wraps, -) -from pathlib import ( - Path, -) -from typing import ( - TYPE_CHECKING, - Any, - Callable, - Dict, - List, - Optional, - TypeVar, - Union, -) +from functools import wraps +from pathlib import Path +from typing import TYPE_CHECKING +from typing import Any +from typing import Callable +from typing import Dict +from typing import List +from typing import Optional +from typing import TypeVar +from typing import Union import numpy as np import tensorflow import yaml -from tensorflow.python.framework import ( - tensor_util, -) - -from deepmd.env import ( - GLOBAL_NP_FLOAT_PRECISION, - 
GLOBAL_TF_FLOAT_PRECISION, - op_module, - tf, -) -from deepmd.utils.path import ( - DPPath, -) +from tensorflow.python.framework import tensor_util + +from deepmd.env import GLOBAL_NP_FLOAT_PRECISION +from deepmd.env import GLOBAL_PD_FLOAT_PRECISION +from deepmd.env import GLOBAL_TF_FLOAT_PRECISION +from deepmd.env import op_module +from deepmd.env import paddle +from deepmd.env import tf +from deepmd.utils.path import DPPath if TYPE_CHECKING: _DICT_VAL = TypeVar("_DICT_VAL") @@ -50,11 +40,11 @@ # define constants PRECISION_DICT = { - "default": GLOBAL_TF_FLOAT_PRECISION, - "float16": tf.float16, - "float32": tf.float32, - "float64": tf.float64, - "bfloat16": tf.bfloat16, + "default": GLOBAL_PD_FLOAT_PRECISION, + "float16": paddle.float16, + "float32": paddle.float32, + "float64": paddle.float64, + "bfloat16": paddle.bfloat16, } @@ -119,11 +109,11 @@ def gelu_wrapper(x): data_requirement = {} ACTIVATION_FN_DICT = { - "relu": tf.nn.relu, - "relu6": tf.nn.relu6, - "softplus": tf.nn.softplus, - "sigmoid": tf.sigmoid, - "tanh": tf.nn.tanh, + "relu": paddle.nn.functional.relu, + "relu6": paddle.nn.functional.relu6, + "softplus": paddle.nn.functional.softplus, + "sigmoid": paddle.nn.functional.sigmoid, + "tanh": paddle.nn.functional.tanh, "gelu": gelu, "gelu_tf": gelu_tf, "None": None, diff --git a/deepmd/descriptor/se_a.py b/deepmd/descriptor/se_a.py index 641210f0d1..c29b667143 100644 --- a/deepmd/descriptor/se_a.py +++ b/deepmd/descriptor/se_a.py @@ -1,68 +1,42 @@ -from typing import ( - List, - Optional, - Tuple, -) +from typing import List +from typing import Optional +from typing import Tuple import numpy as np -from deepmd.common import ( - cast_precision, - get_activation_func, - get_precision, -) -from deepmd.env import ( - GLOBAL_NP_FLOAT_PRECISION, - GLOBAL_TF_FLOAT_PRECISION, - default_tf_session_config, - op_module, - tf, -) -from deepmd.nvnmd.descriptor.se_a import ( - build_davg_dstd, - build_op_descriptor, - check_switch_range, - descrpt2r4, - filter_GR2D, - filter_lower_R42GR, -) -from deepmd.nvnmd.utils.config import ( - nvnmd_cfg, -) -from deepmd.utils.errors import ( - GraphWithoutTensorError, -) -from deepmd.utils.graph import ( - get_tensor_by_name_from_graph, -) -from deepmd.utils.network import ( - embedding_net, - embedding_net_rand_seed_shift, -) -from deepmd.utils.sess import ( - run_sess, -) -from deepmd.utils.spin import ( - Spin, -) -from deepmd.utils.tabulate import ( - DPTabulate, -) -from deepmd.utils.type_embed import ( - embed_atom_type, -) - -from .descriptor import ( - Descriptor, -) -from .se import ( - DescrptSe, -) - - -@Descriptor.register("se_e2_a") -@Descriptor.register("se_a") -class DescrptSeA(DescrptSe): +from deepmd.common import cast_precision +from deepmd.common import get_activation_func +from deepmd.common import get_precision +from deepmd.env import GLOBAL_NP_FLOAT_PRECISION +from deepmd.env import GLOBAL_PD_FLOAT_PRECISION +from deepmd.env import GLOBAL_TF_FLOAT_PRECISION +from deepmd.env import default_tf_session_config +from deepmd.env import op_module +from deepmd.env import paddle +from deepmd.env import tf +from deepmd.nvnmd.descriptor.se_a import build_davg_dstd +from deepmd.nvnmd.descriptor.se_a import build_op_descriptor +from deepmd.nvnmd.descriptor.se_a import check_switch_range +from deepmd.nvnmd.descriptor.se_a import descrpt2r4 +from deepmd.nvnmd.descriptor.se_a import filter_GR2D +from deepmd.nvnmd.descriptor.se_a import filter_lower_R42GR +from deepmd.nvnmd.utils.config import nvnmd_cfg +from deepmd.utils.errors import 
GraphWithoutTensorError +from deepmd.utils.graph import get_tensor_by_name_from_graph +from deepmd.utils.network import EmbeddingNet # embedding_net, +from deepmd.utils.network import embedding_net_rand_seed_shift +from deepmd.utils.sess import run_sess +from deepmd.utils.spin import Spin +from deepmd.utils.tabulate import DPTabulate +from deepmd.utils.type_embed import embed_atom_type + +from .descriptor import Descriptor +from .se import DescrptSe + + +# @Descriptor.register("se_e2_a") +# @Descriptor.register("se_a") +class DescrptSeA(paddle.nn.Layer): r"""DeepPot-SE constructed from all information (both angular and radial) of atomic configurations. The embedding takes the distance between atoms as input. @@ -166,6 +140,7 @@ def __init__( spin: Optional[Spin] = None, ) -> None: """Constructor.""" + super().__init__() if rcut < rcut_smth: raise RuntimeError( f"rcut_smth ({rcut_smth:f}) should be no more than rcut ({rcut:f})!" @@ -190,6 +165,7 @@ def __init__( self.exclude_types.add((tt[1], tt[0])) self.set_davg_zero = set_davg_zero self.type_one_side = type_one_side + self.type_one_side = False self.spin = spin # extend sel_a for spin system @@ -215,49 +191,71 @@ def __init__( self.useBN = False self.dstd = None self.davg = None + + self.avg_zero = paddle.zeros([self.ntypes, self.ndescrpt], dtype="float32") + self.std_ones = paddle.ones([self.ntypes, self.ndescrpt], dtype="float32") + nets = [] + for type_input in range(self.ntypes): + layer = [] + for type_i in range(self.ntypes): + layer.append( + EmbeddingNet( + self.filter_neuron, + self.filter_precision, + self.filter_activation_fn, + self.filter_resnet_dt, + self.seed, + self.trainable, + name="filter_type_" + str(type_input) + str(type_i), + ) + ) + nets.append(paddle.nn.LayerList(layer)) + + self.embedding_nets = paddle.nn.LayerList(nets) + self.compress = False self.embedding_net_variables = None self.mixed_prec = None - self.place_holders = {} + # self.place_holders = {} self.nei_type = np.repeat(np.arange(self.ntypes), self.sel_a) # like a mask - avg_zero = np.zeros([self.ntypes, self.ndescrpt]).astype( - GLOBAL_NP_FLOAT_PRECISION - ) - std_ones = np.ones([self.ntypes, self.ndescrpt]).astype( - GLOBAL_NP_FLOAT_PRECISION - ) - sub_graph = tf.Graph() - with sub_graph.as_default(): - name_pfx = "d_sea_" - for ii in ["coord", "box"]: - self.place_holders[ii] = tf.placeholder( - GLOBAL_NP_FLOAT_PRECISION, [None, None], name=name_pfx + "t_" + ii - ) - self.place_holders["type"] = tf.placeholder( - tf.int32, [None, None], name=name_pfx + "t_type" - ) - self.place_holders["natoms_vec"] = tf.placeholder( - tf.int32, [self.ntypes + 2], name=name_pfx + "t_natoms" - ) - self.place_holders["default_mesh"] = tf.placeholder( - tf.int32, [None], name=name_pfx + "t_mesh" - ) - self.stat_descrpt, descrpt_deriv, rij, nlist = op_module.prod_env_mat_a( - self.place_holders["coord"], - self.place_holders["type"], - self.place_holders["natoms_vec"], - self.place_holders["box"], - self.place_holders["default_mesh"], - tf.constant(avg_zero), - tf.constant(std_ones), - rcut_a=self.rcut_a, - rcut_r=self.rcut_r, - rcut_r_smth=self.rcut_r_smth, - sel_a=self.sel_a, - sel_r=self.sel_r, - ) - self.sub_sess = tf.Session(graph=sub_graph, config=default_tf_session_config) + # avg_zero = np.zeros([self.ntypes, self.ndescrpt]).astype( + # GLOBAL_NP_FLOAT_PRECISION + # ) + # std_ones = np.ones([self.ntypes, self.ndescrpt]).astype( + # GLOBAL_NP_FLOAT_PRECISION + # ) + # sub_graph = tf.Graph() + # with sub_graph.as_default(): + # name_pfx = "d_sea_" + # for ii in 
["coord", "box"]: + # self.place_holders[ii] = tf.placeholder( + # GLOBAL_NP_FLOAT_PRECISION, [None, None], name=name_pfx + "t_" + ii + # ) + # self.place_holders["type"] = tf.placeholder( + # tf.int32, [None, None], name=name_pfx + "t_type" + # ) + # self.place_holders["natoms_vec"] = tf.placeholder( + # tf.int32, [self.ntypes + 2], name=name_pfx + "t_natoms" + # ) + # self.place_holders["default_mesh"] = tf.placeholder( + # tf.int32, [None], name=name_pfx + "t_mesh" + # ) + # self.stat_descrpt, descrpt_deriv, rij, nlist = op_module.prod_env_mat_a( + # self.place_holders["coord"], + # self.place_holders["type"], + # self.place_holders["natoms_vec"], + # self.place_holders["box"], + # self.place_holders["default_mesh"], + # self.avg_zero, + # self.std_ones, + # rcut_a=self.rcut_a, + # rcut_r=self.rcut_r, + # rcut_r_smth=self.rcut_r_smth, + # sel_a=self.sel_a, + # sel_r=self.sel_r, + # ) + # self.sub_sess = tf.Session(graph=sub_graph, config=default_tf_session_config) self.original_sel = None self.multi_task = multi_task if multi_task: @@ -269,6 +267,20 @@ def __init__( "suma2": [], } + self.t_rcut = paddle.to_tensor( + np.max([self.rcut_r, self.rcut_a]), dtype="float32" + ) + self.t_ntypes = paddle.to_tensor(self.ntypes, dtype="int32") + self.t_ndescrpt = paddle.to_tensor(self.ndescrpt, dtype="int32") + self.t_sel = paddle.to_tensor(self.sel_a, dtype="int32") + + t_avg = paddle.to_tensor( + np.zeros([self.ntypes, self.ndescrpt]), dtype="float64" + ) + t_std = paddle.to_tensor(np.ones([self.ntypes, self.ndescrpt]), dtype="float64") + self.register_buffer("t_avg", t_avg) + self.register_buffer("t_std", t_std) + def get_rcut(self) -> float: """Returns the cut-off radius.""" return self.rcut_r @@ -285,7 +297,7 @@ def get_dim_rot_mat_1(self) -> int: """Returns the first dimension of the rotation matrix. The rotation is of shape dim_1 x 3.""" return self.filter_neuron[-1] - def get_nlist(self) -> Tuple[tf.Tensor, tf.Tensor, List[int], List[int]]: + def get_nlist(self) -> Tuple[paddle.Tensor, paddle.Tensor, List[int], List[int]]: """Returns neighbor information. Returns @@ -360,6 +372,9 @@ def compute_input_stats( self.stat_dict["sumr2"] += sumr2 self.stat_dict["suma2"] += suma2 + self.t_avg = paddle.to_tensor(self.davg, dtype="float64") + self.t_std = paddle.to_tensor(self.dstd, dtype="float64") + def merge_input_stats(self, stat_dict): """Merge the statisitcs computed from compute_input_stats to obtain the self.davg and self.dstd. @@ -498,17 +513,17 @@ def enable_mixed_precision(self, mixed_prec: Optional[dict] = None) -> None: self.mixed_prec = mixed_prec self.filter_precision = get_precision(mixed_prec["output_prec"]) - def build( + def forward( self, - coord_: tf.Tensor, - atype_: tf.Tensor, - natoms: tf.Tensor, - box_: tf.Tensor, - mesh: tf.Tensor, + coord_: paddle.Tensor, + atype_: paddle.Tensor, + natoms: paddle.Tensor, + box_: paddle.Tensor, + mesh: paddle.Tensor, input_dict: dict, reuse: Optional[bool] = None, suffix: str = "", - ) -> tf.Tensor: + ) -> paddle.Tensor: """Build the computational graph for the descriptor. 
Parameters @@ -542,73 +557,114 @@ def build( """ davg = self.davg dstd = self.dstd - if nvnmd_cfg.enable: - if nvnmd_cfg.restore_descriptor: - davg, dstd = build_davg_dstd() - check_switch_range(davg, dstd) - with tf.variable_scope("descrpt_attr" + suffix, reuse=reuse): - if davg is None: - davg = np.zeros([self.ntypes, self.ndescrpt]) - if dstd is None: - dstd = np.ones([self.ntypes, self.ndescrpt]) - t_rcut = tf.constant( - np.max([self.rcut_r, self.rcut_a]), - name="rcut", - dtype=GLOBAL_TF_FLOAT_PRECISION, - ) - t_ntypes = tf.constant(self.ntypes, name="ntypes", dtype=tf.int32) - t_ndescrpt = tf.constant(self.ndescrpt, name="ndescrpt", dtype=tf.int32) - t_sel = tf.constant(self.sel_a, name="sel", dtype=tf.int32) - t_original_sel = tf.constant( - self.original_sel if self.original_sel is not None else self.sel_a, - name="original_sel", - dtype=tf.int32, - ) - self.t_avg = tf.get_variable( - "t_avg", - davg.shape, - dtype=GLOBAL_TF_FLOAT_PRECISION, - trainable=False, - initializer=tf.constant_initializer(davg), - ) - self.t_std = tf.get_variable( - "t_std", - dstd.shape, - dtype=GLOBAL_TF_FLOAT_PRECISION, - trainable=False, - initializer=tf.constant_initializer(dstd), - ) - - with tf.control_dependencies([t_sel, t_original_sel]): - coord = tf.reshape(coord_, [-1, natoms[1] * 3]) - box = tf.reshape(box_, [-1, 9]) - atype = tf.reshape(atype_, [-1, natoms[1]]) - - op_descriptor = ( - build_op_descriptor() if nvnmd_cfg.enable else op_module.prod_env_mat_a - ) - self.descrpt, self.descrpt_deriv, self.rij, self.nlist = op_descriptor( + # if nvnmd_cfg.enable: + # if nvnmd_cfg.restore_descriptor: + # davg, dstd = build_davg_dstd() + # check_switch_range(davg, dstd) + # with tf.variable_scope("descrpt_attr" + suffix, reuse=reuse): + if davg is None: + davg = np.zeros([self.ntypes, self.ndescrpt]) + if dstd is None: + dstd = np.ones([self.ntypes, self.ndescrpt]) + # t_rcut = tf.constant( + # np.max([self.rcut_r, self.rcut_a]), + # name="rcut", + # dtype=GLOBAL_TF_FLOAT_PRECISION, + # ) + # t_ntypes = tf.constant(self.ntypes, name="ntypes", dtype=tf.int32) + # t_ndescrpt = tf.constant(self.ndescrpt, name="ndescrpt", dtype=tf.int32) + # t_sel = tf.constant(self.sel_a, name="sel", dtype=tf.int32) + # t_original_sel = paddle.to_tensor( + # self.original_sel if self.original_sel is not None else self.sel_a, + # ) + # self.t_avg = tf.get_variable( + # "t_avg", + # davg.shape, + # dtype=GLOBAL_TF_FLOAT_PRECISION, + # trainable=False, + # initializer=tf.constant_initializer(davg), + # ) + # self.t_std = tf.get_variable( + # "t_std", + # dstd.shape, + # dtype=GLOBAL_TF_FLOAT_PRECISION, + # trainable=False, + # initializer=tf.constant_initializer(dstd), + # ) + + coord = paddle.reshape(coord_, [-1, natoms[1] * 3]) + box = paddle.reshape(box_, [-1, 9]) + atype = paddle.reshape(atype_, [-1, natoms[1]]) + # op_descriptor = ( + # build_op_descriptor() if nvnmd_cfg.enable else op_module.prod_env_mat_a + # ) + # print(coord.dtype) # paddle.float64 + # print(atype.dtype) # paddle.int32 + # print(box.dtype) # paddle.float64 + # print(mesh.dtype) # paddle.int32 + # print(self.t_avg.dtype) # paddle.float32 + # print(self.t_std.dtype) # paddle.float32 + # print(natoms) + # exit() + ( + self.descrpt, + self.descrpt_deriv, + self.rij, + self.nlist, + ) = op_module.prod_env_mat_a( coord, atype, - natoms, box, mesh, self.t_avg, self.t_std, + natoms, rcut_a=self.rcut_a, rcut_r=self.rcut_r, rcut_r_smth=self.rcut_r_smth, sel_a=self.sel_a, sel_r=self.sel_r, ) + # np.save( + # "/workspace/hesensen/deepmd_backend/" + # 
"deepmd-kit/examples/water/se_e2_a/align_input/pred_descrpt", + # self.descrpt, + # ) + # np.save( + # "/workspace/hesensen/deepmd_backend/" + # "deepmd-kit/examples/water/se_e2_a/align_input/pred_descrpt_deriv", + # self.descrpt_deriv, + # ) + # np.save( + # "/workspace/hesensen/deepmd_backend/" + # "deepmd-kit/examples/water/se_e2_a/align_input/pred_rij", + # self.rij, + # ) + # np.save( + # "/workspace/hesensen/deepmd_backend/" + # "deepmd-kit/examples/water/se_e2_a/align_input/pred_nlist", + # self.nlist, + # ) + # np.save( + # "/workspace/hesensen/deepmd_backend/" + # "deepmd-kit/examples/water/se_e2_a/align_input/pred_nlist", + # self.nlist, + # ) + # np.save( + # "/workspace/hesensen/deepmd_backend/" + # "deepmd-kit/examples/water/se_e2_a/align_input/pred_nlist", + # self.nlist, + # ) + # exit() + # self.descrpt.shape = [1, 105984] # only used when tensorboard was set as true - tf.summary.histogram("descrpt", self.descrpt) - tf.summary.histogram("rij", self.rij) - tf.summary.histogram("nlist", self.nlist) - - self.descrpt_reshape = tf.reshape(self.descrpt, [-1, self.ndescrpt]) - self._identity_tensors(suffix=suffix) - + # tf.summary.histogram("descrpt", self.descrpt) + # tf.summary.histogram("rij", self.rij) + # tf.summary.histogram("nlist", self.nlist) + self.descrpt_reshape = paddle.reshape(self.descrpt, [-1, self.ndescrpt]) + # [1, 105984] --> [192, 552] + self.descrpt_reshape.stop_gradient = False + # self._identity_tensors(suffix=suffix) self.dout, self.qmat = self._pass_filter( self.descrpt_reshape, atype, @@ -618,18 +674,32 @@ def build( reuse=reuse, trainable=self.trainable, ) + # np.save( + # "/workspace/hesensen/deepmd_backend/" + # "deepmd-kit/examples/water/se_e2_a/align_input/pred_dout", + # self.dout, + # ) + # np.save( + # "/workspace/hesensen/deepmd_backend/" + # "deepmd-kit/examples/water/se_e2_a/align_input/pred_qmat", + # self.qmat, + # ) + # exit() # only used when tensorboard was set as true - tf.summary.histogram("embedding_net_output", self.dout) + # tf.summary.histogram("embedding_net_output", self.dout) + # print(self.dout.shape) + # np.save(f"/workspace/hesensen/deepmd_backend/infer_align/dout_pd.npy", self.dout) + # exit() return self.dout - def get_rot_mat(self) -> tf.Tensor: + def get_rot_mat(self) -> paddle.Tensor: """Get rotational matrix.""" return self.qmat def prod_force_virial( - self, atom_ener: tf.Tensor, natoms: tf.Tensor - ) -> Tuple[tf.Tensor, tf.Tensor, tf.Tensor]: + self, atom_ener: paddle.Tensor, natoms: paddle.Tensor + ) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor]: """Compute force and virial. 
Parameters @@ -651,11 +721,11 @@ def prod_force_virial( atom_virial The atomic virial """ - [net_deriv] = tf.gradients(atom_ener, self.descrpt_reshape) - tf.summary.histogram("net_derivative", net_deriv) - net_deriv_reshape = tf.reshape( + net_deriv = paddle.grad(atom_ener, self.descrpt_reshape, create_graph=True)[0] + # tf.summary.histogram("net_derivative", net_deriv) + net_deriv_reshape = paddle.reshape( net_deriv, - [np.cast["int64"](-1), natoms[0] * np.cast["int64"](self.ndescrpt)], + [-1, natoms[0] * self.ndescrpt], ) force = op_module.prod_force_se_a( net_deriv_reshape, @@ -674,29 +744,43 @@ def prod_force_virial( n_a_sel=self.nnei_a, n_r_sel=self.nnei_r, ) - tf.summary.histogram("force", force) - tf.summary.histogram("virial", virial) - tf.summary.histogram("atom_virial", atom_virial) + # tf.summary.histogram("force", force) + # tf.summary.histogram("virial", virial) + # tf.summary.histogram("atom_virial", atom_virial) return force, virial, atom_virial def _pass_filter( self, inputs, atype, natoms, input_dict, reuse=None, suffix="", trainable=True ): + # natoms = [192, 192, 64 , 128] if input_dict is not None: type_embedding = input_dict.get("type_embedding", None) else: type_embedding = None start_index = 0 - inputs = tf.reshape(inputs, [-1, natoms[0], self.ndescrpt]) + # print(inputs.shape) # [192, 552] + inputs = paddle.reshape(inputs, [-1, int(natoms[0].item()), int(self.ndescrpt)]) + # print(inputs.shape) # [1, 192, 552] + # exit() output = [] output_qmat = [] + # print(self.type_one_side, type_embedding) + # exit() if not self.type_one_side and type_embedding is None: + # print("here", self.ntypes) for type_i in range(self.ntypes): - inputs_i = tf.slice( - inputs, [0, start_index, 0], [-1, natoms[2 + type_i], -1] - ) - inputs_i = tf.reshape(inputs_i, [-1, self.ndescrpt]) + inputs_i = paddle.slice( + inputs, + [0, 1, 2], + [0, start_index, 0], + [ + inputs.shape[0], + start_index + natoms[2 + type_i], + inputs.shape[2], + ], + ) # [1, 192, 552] --> [1, 64, 552] + inputs_i = paddle.reshape(inputs_i, [-1, self.ndescrpt]) # [64, 552] filter_name = "filter_type_" + str(type_i) + suffix layer, qmat = self._filter( inputs_i, @@ -707,13 +791,13 @@ def _pass_filter( trainable=trainable, activation_fn=self.filter_activation_fn, ) - layer = tf.reshape( - layer, [tf.shape(inputs)[0], natoms[2 + type_i], self.get_dim_out()] + layer = paddle.reshape( + layer, [inputs.shape[0], natoms[2 + type_i], self.get_dim_out()] ) - qmat = tf.reshape( + qmat = paddle.reshape( qmat, [ - tf.shape(inputs)[0], + inputs.shape[0], natoms[2 + type_i], self.get_dim_rot_mat_1() * 3, ], @@ -722,61 +806,97 @@ def _pass_filter( output_qmat.append(qmat) start_index += natoms[2 + type_i] else: - inputs_i = inputs - inputs_i = tf.reshape(inputs_i, [-1, self.ndescrpt]) - type_i = -1 - if nvnmd_cfg.enable and nvnmd_cfg.quantize_descriptor: - inputs_i = descrpt2r4(inputs_i, natoms) - if len(self.exclude_types): - atype_nloc = tf.reshape( - tf.slice(atype, [0, 0], [-1, natoms[0]]), [-1] - ) # when nloc != nall, pass nloc to mask - mask = self.build_type_exclude_mask( - self.exclude_types, - self.ntypes, - self.sel_a, - self.ndescrpt, - atype_nloc, - tf.shape(inputs_i)[0], - ) - inputs_i *= mask - - layer, qmat = self._filter( - inputs_i, - type_i, - name="filter_type_all" + suffix, - natoms=natoms, - reuse=reuse, - trainable=trainable, - activation_fn=self.filter_activation_fn, - type_embedding=type_embedding, - ) - layer = tf.reshape( - layer, [tf.shape(inputs)[0], natoms[0], self.get_dim_out()] - ) - qmat = tf.reshape( 
- qmat, [tf.shape(inputs)[0], natoms[0], self.get_dim_rot_mat_1() * 3] - ) - output.append(layer) - output_qmat.append(qmat) - output = tf.concat(output, axis=1) - output_qmat = tf.concat(output_qmat, axis=1) + ... + # inputs_i = inputs + # inputs_i = paddle.reshape(inputs_i, [-1, self.ndescrpt]) + # type_i = -1 + # if nvnmd_cfg.enable and nvnmd_cfg.quantize_descriptor: + # inputs_i = descrpt2r4(inputs_i, natoms) + # if len(self.exclude_types): + # atype_nloc = paddle.reshape( + # paddle.slice(atype, [0, 0], [-1, natoms[0]]), [-1] + # ) # when nloc != nall, pass nloc to mask + # mask = self.build_type_exclude_mask( + # self.exclude_types, + # self.ntypes, + # self.sel_a, + # self.ndescrpt, + # atype_nloc, + # paddle.shape(inputs_i)[0], + # ) + # inputs_i *= mask + + # layer, qmat = self._filter( + # inputs_i, + # type_i, + # name="filter_type_all" + suffix, + # natoms=natoms, + # reuse=reuse, + # trainable=trainable, + # activation_fn=self.filter_activation_fn, + # type_embedding=type_embedding, + # ) + # layer = paddle.reshape( + # layer, [inputs.shape[0], natoms[0], self.get_dim_out()] + # ) + # qmat = paddle.reshape( + # qmat, [inputs.shape[0], natoms[0], self.get_dim_rot_mat_1() * 3] + # ) + # output.append(layer) + # output_qmat.append(qmat) + # print(f"len(output) = {len(output)}") + output = paddle.concat(output, axis=1) + output_qmat = paddle.concat(output_qmat, axis=1) return output, output_qmat def _compute_dstats_sys_smth( self, data_coord, data_box, data_atype, natoms_vec, mesh ): - dd_all = run_sess( - self.sub_sess, - self.stat_descrpt, - feed_dict={ - self.place_holders["coord"]: data_coord, - self.place_holders["type"]: data_atype, - self.place_holders["natoms_vec"]: natoms_vec, - self.place_holders["box"]: data_box, - self.place_holders["default_mesh"]: mesh, - }, + input_dict = {} + # dd_all = run_sess( + # self.sub_sess, + # self.stat_descrpt, + # feed_dict={ + # self.place_holders["coord"]: data_coord, + # self.place_holders["type"]: data_atype, + # self.place_holders["natoms_vec"]: natoms_vec, + # self.place_holders["box"]: data_box, + # self.place_holders["default_mesh"]: mesh, + # }, + # ) + input_dict["coord"] = paddle.to_tensor(data_coord, dtype="float32") + input_dict["box"] = paddle.to_tensor(data_box, dtype="float32") + input_dict["type"] = paddle.to_tensor(data_atype, dtype="int32") + input_dict["natoms_vec"] = paddle.to_tensor( + natoms_vec, dtype="int32", place="cpu" ) + input_dict["default_mesh"] = paddle.to_tensor(mesh, dtype="int32") + + # print(input_dict["coord"].dtype) # fp64 + # print(input_dict["type"].dtype) # int32 + # print(input_dict["natoms_vec"].dtype) # int32 + # print(input_dict["box"].dtype) # fp64 + # print(input_dict["default_mesh"].dtype) # int32 + # print(self.avg_zero) + # print(self.std_ones) + # print(self.sel_a) + # print(self.sel_r) + self.stat_descrpt, descrpt_deriv, rij, nlist = op_module.prod_env_mat_a( + input_dict["coord"], # fp32 + input_dict["type"], # int32 + input_dict["box"], # fp32 + input_dict["default_mesh"], # int32 + self.avg_zero, + self.std_ones, + input_dict["natoms_vec"], # int32 + rcut_a=self.rcut_a, + rcut_r=self.rcut_r, + rcut_r_smth=self.rcut_r_smth, + sel_a=self.sel_a, + sel_r=self.sel_r, + ) + + dd_all = self.stat_descrpt.numpy() natoms = natoms_vec dd_all = np.reshape(dd_all, [-1, self.ndescrpt * natoms[0]]) start_index = 0 @@ -840,29 +960,30 @@ def _concat_type_embedding( embedding: environment of each atom represented by embedding. 
""" - te_out_dim = type_embedding.get_shape().as_list()[-1] - self.t_nei_type = tf.constant(self.nei_type, dtype=tf.int32) - nei_embed = tf.nn.embedding_lookup( - type_embedding, tf.cast(self.t_nei_type, dtype=tf.int32) + te_out_dim = type_embedding.shape[-1] + self.t_nei_type = paddle.to_tensor(self.nei_type, dtype=paddle.int32) + nei_embed = paddle.nn.functional.embedding( + paddle.cast(self.t_nei_type, dtype=paddle.int32), + type_embedding, ) # shape is [self.nnei, 1+te_out_dim] - nei_embed = tf.tile( + nei_embed = paddle.tile( nei_embed, (nframes * natoms[0], 1) ) # shape is [nframes*natoms[0]*self.nnei, te_out_dim] - nei_embed = tf.reshape(nei_embed, [-1, te_out_dim]) - embedding_input = tf.concat( + nei_embed = paddle.reshape(nei_embed, [-1, te_out_dim]) + embedding_input = paddle.concat( [xyz_scatter, nei_embed], 1 ) # shape is [nframes*natoms[0]*self.nnei, 1+te_out_dim] if not self.type_one_side: atm_embed = embed_atom_type( self.ntypes, natoms, type_embedding ) # shape is [natoms[0], te_out_dim] - atm_embed = tf.tile( + atm_embed = paddle.tile( atm_embed, (nframes, self.nnei) ) # shape is [nframes*natoms[0], self.nnei*te_out_dim] - atm_embed = tf.reshape( + atm_embed = paddle.reshape( atm_embed, [-1, te_out_dim] ) # shape is [nframes*natoms[0]*self.nnei, te_out_dim] - embedding_input = tf.concat( + embedding_input = paddle.concat( [embedding_input, atm_embed], 1 ) # shape is [nframes*natoms[0]*self.nnei, 1+te_out_dim+te_out_dim] return embedding_input @@ -888,13 +1009,42 @@ def _filter_lower( outputs_size = [1] + self.filter_neuron # cut-out inputs # with natom x (nei_type_i x 4) - inputs_i = tf.slice(inputs, [0, start_index * 4], [-1, incrs_index * 4]) - shape_i = inputs_i.get_shape().as_list() - natom = tf.shape(inputs_i)[0] + # if not hasattr(self, "debug_inputs"): + # self.debug_inputs = inputs + # paddle.save(self.debug_inputs, "/workspace/hesensen/deepmd_backend/small_case/debug_inputs.pddata") + # print(__file__, "inputs.shape", inputs.shape) + + inputs_i = paddle.slice( + inputs, + [0, 1], + [0, start_index * 4], + [inputs.shape[0], start_index * 4 + incrs_index * 4], + ) + # if not hasattr(self, "debug_inputs_i"): + # self.debug_inputs_i = inputs_i + # paddle.save(self.debug_inputs_i, "/workspace/hesensen/deepmd_backend/small_case/debug_inputs_i.pddata") + # print(__file__, "inputs_i.shape", inputs_i.shape) + + shape_i = inputs_i.shape + natom = inputs_i.shape[0] + # with (natom x nei_type_i) x 4 - inputs_reshape = tf.reshape(inputs_i, [-1, 4]) + inputs_reshape = paddle.reshape(inputs_i, [-1, 4]) + # if not hasattr(self, "debug_inputs_reshape"): + # self.debug_inputs_reshape = inputs_reshape + # paddle.save(self.debug_inputs_reshape, "/workspace/hesensen/deepmd_backend/small_case/debug_inputs_reshape.pddata") + # print(__file__, "inputs_reshape.shape", inputs_reshape.shape) + # with (natom x nei_type_i) x 1 - xyz_scatter = tf.reshape(tf.slice(inputs_reshape, [0, 0], [-1, 1]), [-1, 1]) + xyz_scatter = paddle.reshape( + paddle.slice(inputs_reshape, [0, 1], [0, 0], [inputs_reshape.shape[0], 1]), + [-1, 1], + ) + # if not hasattr(self, "debug_xyz_scatter"): + # self.debug_xyz_scatter = xyz_scatter + # paddle.save(self.debug_xyz_scatter, "/workspace/hesensen/deepmd_backend/small_case/debug_xyz_scatter.pddata") + # print(__file__, "xyz_scatter.shape", xyz_scatter.shape) + if type_embedding is not None: xyz_scatter = self._concat_type_embedding( xyz_scatter, nframes, natoms, type_embedding @@ -904,25 +1054,25 @@ def _filter_lower( "compression of type embedded descriptor is not 
supported at the moment" ) # natom x 4 x outputs_size - if nvnmd_cfg.enable: - return filter_lower_R42GR( - type_i, - type_input, - inputs_i, - is_exclude, - activation_fn, - bavg, - stddev, - trainable, - suffix, - self.seed, - self.seed_shift, - self.uniform_seed, - self.filter_neuron, - self.filter_precision, - self.filter_resnet_dt, - self.embedding_net_variables, - ) + # if nvnmd_cfg.enable: + # return filter_lower_R42GR( + # type_i, + # type_input, + # inputs_i, + # is_exclude, + # activation_fn, + # bavg, + # stddev, + # trainable, + # suffix, + # self.seed, + # self.seed_shift, + # self.uniform_seed, + # self.filter_neuron, + # self.filter_precision, + # self.filter_resnet_dt, + # self.embedding_net_variables, + # ) if self.compress and (not is_exclude): if self.type_one_side: net = "filter_-1_net_" + str(type_i) @@ -937,70 +1087,85 @@ def _filter_lower( self.table_config[3], ] return op_module.tabulate_fusion_se_a( - tf.cast(self.table.data[net], self.filter_precision), + paddle.cast(self.table.data[net], self.filter_precision), info, xyz_scatter, - tf.reshape(inputs_i, [natom, shape_i[1] // 4, 4]), + paddle.reshape(inputs_i, [natom, shape_i[1] // 4, 4]), last_layer_size=outputs_size[-1], ) else: if not is_exclude: # with (natom x nei_type_i) x out_size - xyz_scatter = embedding_net( - xyz_scatter, - self.filter_neuron, - self.filter_precision, - activation_fn=activation_fn, - resnet_dt=self.filter_resnet_dt, - name_suffix=suffix, - stddev=stddev, - bavg=bavg, - seed=self.seed, - trainable=trainable, - uniform_seed=self.uniform_seed, - initial_variables=self.embedding_net_variables, - mixed_prec=self.mixed_prec, - ) + # if not hasattr(self, "xyz_scatter_input"): + # self.debug_xyz_scatter_input = xyz_scatter + # paddle.save(self.xyz_scatter_input, "/workspace/hesensen/deepmd_backend/small_case/embd_net_0_0_input.pddata") + # paddle.save(self.embedding_nets[type_input][type_i].state_dict(), "/workspace/hesensen/deepmd_backend/small_case/embd_net_0_0.pdparams") + # print(__file__, "saved") + xyz_scatter_out = self.embedding_nets[type_input][type_i](xyz_scatter) + # print(__file__, "xyz_scatter.shape", xyz_scatter.shape) + # if not hasattr(self, "xyz_scatter_output"): + # self.debug_xyz_scatter_output = xyz_scatter_out + # paddle.save(self.xyz_scatter_output, "/workspace/hesensen/deepmd_backend/small_case/embd_net_0_0_output.pddata") + # print(__file__, "saved") + + # xyz_scatter = embedding_net( + # xyz_scatter, + # self.filter_neuron, + # self.filter_precision, + # activation_fn=activation_fn, + # resnet_dt=self.filter_resnet_dt, + # name_suffix=suffix, + # stddev=stddev, + # bavg=bavg, + # seed=self.seed, + # trainable=trainable, + # uniform_seed=self.uniform_seed, + # initial_variables=self.embedding_net_variables, + # mixed_prec=self.mixed_prec, + # ) + # xyz_scatter = paddle.reshape(xyz_scatter, (-1, shape_i[1]//4, outputs_size[-1])) if (not self.uniform_seed) and (self.seed is not None): self.seed += self.seed_shift else: # we can safely return the final xyz_scatter filled with zero directly - return tf.cast( - tf.fill((natom, 4, outputs_size[-1]), 0.0), self.filter_precision + return paddle.cast( + paddle.fill((natom, 4, outputs_size[-1]), 0.0), + self.filter_precision, ) # natom x nei_type_i x out_size - xyz_scatter = tf.reshape( - xyz_scatter, (-1, shape_i[1] // 4, outputs_size[-1]) + xyz_scatter_out = paddle.reshape( + xyz_scatter_out, (-1, shape_i[1] // 4, outputs_size[-1]) ) - # When using tf.reshape(inputs_i, [-1, shape_i[1]//4, 4]) below + # When using 
paddle.reshape(inputs_i, [-1, shape_i[1]//4, 4]) below # [588 24] -> [588 6 4] correct # but if sel is zero # [588 0] -> [147 0 4] incorrect; the correct one is [588 0 4] - # So we need to explicitly assign the shape to tf.shape(inputs_i)[0] instead of -1 + # So we need to explicitly assign the shape to paddle.shape(inputs_i)[0] instead of -1 # natom x 4 x outputs_size - return tf.matmul( - tf.reshape(inputs_i, [natom, shape_i[1] // 4, 4]), - xyz_scatter, - transpose_a=True, + return paddle.matmul( + paddle.reshape(inputs_i, [natom, shape_i[1] // 4, 4]), + xyz_scatter_out, + transpose_x=True, ) - @cast_precision + # @cast_precision def _filter( self, inputs, type_input, natoms, type_embedding=None, - activation_fn=tf.nn.tanh, + activation_fn=paddle.nn.functional.tanh, stddev=1.0, bavg=0.0, name="linear", reuse=None, trainable=True, ): - nframes = tf.shape(tf.reshape(inputs, [-1, natoms[0], self.ndescrpt]))[0] + # nframes = paddle.shape(paddle.reshape(inputs, [-1, natoms[0], self.ndescrpt]))[0] + nframes = 1 # natom x (nei x 4) - shape = inputs.get_shape().as_list() + shape = inputs.shape outputs_size = [1] + self.filter_neuron outputs_size_2 = self.n_axis_neuron all_excluded = all( @@ -1009,101 +1174,112 @@ def _filter( for type_i in range(self.ntypes) ] ) + # print(__file__, all_excluded) if all_excluded: # all types are excluded so result and qmat should be zeros # we can safaly return a zero matrix... # See also https://stackoverflow.com/a/34725458/9567349 # result: natom x outputs_size x outputs_size_2 # qmat: natom x outputs_size x 3 - natom = tf.shape(inputs)[0] - result = tf.cast( - tf.fill((natom, outputs_size_2, outputs_size[-1]), 0.0), - GLOBAL_TF_FLOAT_PRECISION, + natom = paddle.shape(inputs)[0] + result = paddle.cast( + paddle.full((natom, outputs_size_2, outputs_size[-1]), 0.0), + GLOBAL_PD_FLOAT_PRECISION, ) - qmat = tf.cast( - tf.fill((natom, outputs_size[-1], 3), 0.0), GLOBAL_TF_FLOAT_PRECISION + qmat = paddle.cast( + paddle.full((natom, outputs_size[-1], 3), 0.0), + GLOBAL_PD_FLOAT_PRECISION, ) return result, qmat - with tf.variable_scope(name, reuse=reuse): - start_index = 0 - type_i = 0 - # natom x 4 x outputs_size - if type_embedding is None: - rets = [] - for type_i in range(self.ntypes): - ret = self._filter_lower( - type_i, - type_input, - start_index, - self.sel_a[type_i], - inputs, - nframes, - natoms, - type_embedding=type_embedding, - is_exclude=(type_input, type_i) in self.exclude_types, - activation_fn=activation_fn, - stddev=stddev, - bavg=bavg, - trainable=trainable, - suffix="_" + str(type_i), - ) - if (type_input, type_i) not in self.exclude_types: - # add zero is meaningless; skip - rets.append(ret) - start_index += self.sel_a[type_i] - # faster to use accumulate_n than multiple add - xyz_scatter_1 = tf.accumulate_n(rets) - else: - xyz_scatter_1 = self._filter_lower( + # with tf.variable_scope(name, reuse=reuse): + start_index = 0 + type_i = 0 + # natom x 4 x outputs_size + if type_embedding is None: + rets = [] + for type_i in range(self.ntypes): + ret = self._filter_lower( type_i, type_input, start_index, - np.cumsum(self.sel_a)[-1], + self.sel_a[type_i], inputs, nframes, natoms, type_embedding=type_embedding, - is_exclude=False, + is_exclude=(type_input, type_i) in self.exclude_types, activation_fn=activation_fn, stddev=stddev, bavg=bavg, trainable=trainable, + suffix="_" + str(type_i), ) - if nvnmd_cfg.enable: - return filter_GR2D(xyz_scatter_1) - # natom x nei x outputs_size - # xyz_scatter = tf.concat(xyz_scatter_total, axis=1) - # natom x nei x 4 - 
# inputs_reshape = tf.reshape(inputs, [-1, shape[1]//4, 4]) - # natom x 4 x outputs_size - # xyz_scatter_1 = tf.matmul(inputs_reshape, xyz_scatter, transpose_a = True) - if self.original_sel is None: - # shape[1] = nnei * 4 - nnei = shape[1] / 4 - else: - nnei = tf.cast( - tf.Variable( - np.sum(self.original_sel), - dtype=tf.int32, - trainable=False, - name="nnei", - ), - self.filter_precision, - ) - xyz_scatter_1 = xyz_scatter_1 / nnei - # natom x 4 x outputs_size_2 - xyz_scatter_2 = tf.slice(xyz_scatter_1, [0, 0, 0], [-1, -1, outputs_size_2]) - # # natom x 3 x outputs_size_2 - # qmat = tf.slice(xyz_scatter_2, [0,1,0], [-1, 3, -1]) - # natom x 3 x outputs_size_1 - qmat = tf.slice(xyz_scatter_1, [0, 1, 0], [-1, 3, -1]) - # natom x outputs_size_1 x 3 - qmat = tf.transpose(qmat, perm=[0, 2, 1]) - # natom x outputs_size x outputs_size_2 - result = tf.matmul(xyz_scatter_1, xyz_scatter_2, transpose_a=True) - # natom x (outputs_size x outputs_size_2) - result = tf.reshape(result, [-1, outputs_size_2 * outputs_size[-1]]) + if (type_input, type_i) not in self.exclude_types: + # add zero is meaningless; skip + rets.append(ret) + start_index += self.sel_a[type_i] + # faster to use accumulate_n than multiple add + xyz_scatter_1 = paddle.add_n(rets) + else: + xyz_scatter_1 = self._filter_lower( + type_i, + type_input, + start_index, + np.cumsum(self.sel_a)[-1], + inputs, + nframes, + natoms, + type_embedding=type_embedding, + is_exclude=False, + activation_fn=activation_fn, + stddev=stddev, + bavg=bavg, + trainable=trainable, + ) + # if nvnmd_cfg.enable: + # return filter_GR2D(xyz_scatter_1) + # natom x nei x outputs_size + # xyz_scatter = tf.concat(xyz_scatter_total, axis=1) + # natom x nei x 4 + # inputs_reshape = tf.reshape(inputs, [-1, shape[1]//4, 4]) + # natom x 4 x outputs_size + # xyz_scatter_1 = tf.matmul(inputs_reshape, xyz_scatter, transpose_a = True) + if self.original_sel is None: + # shape[1] = nnei * 4 + nnei = shape[1] / 4 + else: + nnei = paddle.cast( + paddle.to_tensor( + np.sum(self.original_sel), + dtype=paddle.int32, + stop_gradient=True, + ), + self.filter_precision, + ) + xyz_scatter_1 = xyz_scatter_1 / nnei + # natom x 4 x outputs_size_2 + xyz_scatter_2 = paddle.slice( + xyz_scatter_1, + [0, 1, 2], + [0, 0, 0], + [xyz_scatter_1.shape[0], xyz_scatter_1.shape[1], outputs_size_2], + ) + # # natom x 3 x outputs_size_2 + # qmat = tf.slice(xyz_scatter_2, [0,1,0], [-1, 3, -1]) + # natom x 3 x outputs_size_1 + qmat = paddle.slice( + xyz_scatter_1, + [0, 1, 2], + [0, 1, 0], + [xyz_scatter_1.shape[0], 1 + 3, xyz_scatter_1.shape[2]], + ) + # natom x outputs_size_1 x 3 + qmat = paddle.transpose(qmat, perm=[0, 2, 1]) + # natom x outputs_size x outputs_size_2 + result = paddle.matmul(xyz_scatter_1, xyz_scatter_2, transpose_x=True) + # natom x (outputs_size x outputs_size_2) + result = paddle.reshape(result, [-1, outputs_size_2 * outputs_size[-1]]) return result, qmat diff --git a/deepmd/entrypoints/freeze.py b/deepmd/entrypoints/freeze.py index 9f6547998f..121b6c77a6 100755 --- a/deepmd/entrypoints/freeze.py +++ b/deepmd/entrypoints/freeze.py @@ -8,36 +8,22 @@ import json import logging -from os.path import ( - abspath, -) -from typing import ( - List, - Optional, - Union, -) +from os.path import abspath +from typing import List +from typing import Optional +from typing import Union import google.protobuf.message # load grad of force module import deepmd.op # noqa: F401 -from deepmd.env import ( - FITTING_NET_PATTERN, - REMOVE_SUFFIX_DICT, - tf, -) -from deepmd.nvnmd.entrypoints.freeze 
import ( - save_weight, -) -from deepmd.utils.errors import ( - GraphTooLargeError, -) -from deepmd.utils.graph import ( - get_pattern_nodes_from_graph_def, -) -from deepmd.utils.sess import ( - run_sess, -) +from deepmd.env import FITTING_NET_PATTERN +from deepmd.env import REMOVE_SUFFIX_DICT +from deepmd.env import tf +from deepmd.nvnmd.entrypoints.freeze import save_weight +from deepmd.utils.errors import GraphTooLargeError +from deepmd.utils.graph import get_pattern_nodes_from_graph_def +from deepmd.utils.sess import run_sess __all__ = ["freeze"] @@ -320,14 +306,16 @@ def _make_node_names( def freeze_graph( - sess, - input_graph, - input_node, - freeze_type, - modifier, - out_graph_name, - node_names=None, - out_suffix="", + model_file: str, + output: str, + # sess, + # input_graph, + # input_node, + # freeze_type, + # modifier, + # out_graph_name, + # node_names=None, + # out_suffix="", ): """Freeze the single graph with chosen out_suffix. @@ -350,40 +338,94 @@ def freeze_graph( out_suffix : str The chosen suffix to freeze in the input_graph. """ - output_node = _make_node_names( - freeze_type, modifier, out_suffix=out_suffix, node_names=node_names - ) - different_set = set(output_node) - set(input_node) - if different_set: - log.warning( - "The following nodes are not in the graph: %s. " - "Skip freezeing these nodes. You may be freezing " - "a checkpoint generated by an old version." % different_set - ) - # use intersection as output list - output_node = list(set(output_node) & set(input_node)) - log.info(f"The following nodes will be frozen: {output_node}") - # We use a built-in TF helper to export variables to constants - output_graph_def = tf.graph_util.convert_variables_to_constants( - sess, # The session is used to retrieve the weights - input_graph, # The graph_def is used to retrieve the nodes - output_node, # The output node names are used to select the usefull nodes + # output_node = _make_node_names( + # freeze_type, modifier, out_suffix=out_suffix, node_names=node_names + # ) + # different_set = set(output_node) - set(input_node) + # if different_set: + # log.warning( + # "The following nodes are not in the graph: %s. " + # "Skip freezeing these nodes. You may be freezing " + # "a checkpoint generated by an old version." 
% different_set
+    #     )
+    #     # use intersection as output list
+    #     output_node = list(set(output_node) & set(input_node))
+    # log.info(f"The following nodes will be frozen: {output_node}")
+    # # We use a built-in TF helper to export variables to constants
+    # output_graph_def = tf.graph_util.convert_variables_to_constants(
+    #     sess,  # The session is used to retrieve the weights
+    #     input_graph,  # The graph_def is used to retrieve the nodes
+    #     output_node,  # The output node names are used to select the useful nodes
+    # )
+    # # if multi-task, change fitting_net suffix and model_type
+    # if out_suffix != "":
+    #     output_graph_def = _modify_model_suffix(
+    #         output_graph_def, out_suffix, freeze_type
+    #     )

+    # # If we need to transfer the fitting net variables
+    # output_graph_def = _transfer_fitting_net_trainable_variables(
+    #     sess, output_graph_def, input_graph
+    # )

+    # # Finally we serialize and dump the output graph to the filesystem
+    # with tf.gfile.GFile(out_graph_name, "wb") as f:
+    #     f.write(output_graph_def.SerializeToString())
+    # log.info(f"{len(output_graph_def.node):d} ops in the final graph.")
+    import paddle

+    from deepmd.infer import DeepPot

+    dp = DeepPot(
+        model_file,
+        load_prefix="load",
+        default_tf_graph=False,
    )
-    # if multi-task, change fitting_net suffix and model_type
-    if out_suffix != "":
-        output_graph_def = _modify_model_suffix(
-            output_graph_def, out_suffix, freeze_type
-        )
-
-    # If we need to transfer the fitting net variables
-    output_graph_def = _transfer_fitting_net_trainable_variables(
-        sess, output_graph_def, input_graph
+    # print(dp.model.descrpt.embedding_nets[0][0].weight[0])
+    # for w in dp.model.descrpt.embedding_nets[0][0].weight:
+    #     print(f"w {w.shape} {w.mean().item()} {w.var().item()}")
+    # print("print the parameters loaded from state_dict")
+    # for k, v in dp.model.state_dict().items():
+    #     print(f"{k} {v.shape} {v.dtype} {v.mean().item()} {v.var().item()}")
+    # exit()
+    # for b in dp.model.descrpt.embedding_nets[0][0].bias:
+    #     print(f"b {b.shape} {b.mean().item()} {b.var().item()}")
+    dp.model.eval()
+    from paddle.static import InputSpec

+    st_model = paddle.jit.to_static(
+        dp.model,
+        input_spec=[
+            InputSpec(shape=[None], dtype="float64"),  # coord_
+            InputSpec(shape=[None], dtype="int32"),  # atype_
+            InputSpec(shape=[4], dtype="int32"),  # natoms
+            InputSpec(shape=[None], dtype="float64"),  # box
+            InputSpec(shape=[6], dtype="int32"),  # mesh
+            {
+                # "coord": InputSpec(
+                #     shape=[2880],
+                #     dtype="float64"
+                # ),
+                # "type": InputSpec(
+                #     shape=[960],
+                #     dtype="int32"
+                # ),
+                # "natoms_vec": InputSpec(
+                #     shape=[4],
+                #     dtype="int32"
+                # ),
+                "box": InputSpec(shape=[None], dtype="float64"),
+                # "default_mesh": InputSpec(
+                #     shape=[6],
+                #     dtype="int32"
+                # ),
+            },
+            "",
+            False,
+        ],
    )
-
-    # Finally we serialize and dump the output graph to the filesystem
-    with tf.gfile.GFile(out_graph_name, "wb") as f:
-        f.write(output_graph_def.SerializeToString())
-    log.info(f"{len(output_graph_def.node):d} ops in the final graph.")
+    paddle.jit.save(st_model, output)
+    print(f"Saved to path: {output}")


 def freeze_graph_multi(
@@ -464,11 +506,12 @@ def freeze_graph_multi(

 def freeze(
     *,
-    checkpoint_folder: str,
+    # checkpoint_folder: str,
+    input_file: str,
     output: str,
-    node_names: Optional[str] = None,
-    nvnmd_weight: Optional[str] = None,
-    united_model: bool = False,
+    # node_names: Optional[str] = None,
+    # nvnmd_weight: Optional[str] = None,
+    # united_model: bool = False,
     **kwargs,
 ):
-    """Freeze the graph in supplied folder.
+    """Freeze the graph from the supplied model file.
@@ -489,75 +532,77 @@ def freeze( other arguments """ # We retrieve our checkpoint fullpath - checkpoint = tf.train.get_checkpoint_state(checkpoint_folder) - input_checkpoint = checkpoint.model_checkpoint_path - - # expand the output file to full path - output_graph = abspath(output) - - # Before exporting our graph, we need to precise what is our output node - # This is how TF decides what part of the Graph he has to keep - # and what part it can dump - # NOTE: this variable is plural, because you can have multiple output nodes - # node_names = "energy_test,force_test,virial_test,t_rcut" - - # We clear devices to allow TensorFlow to control - # on which device it will load operations - clear_devices = True - - # We import the meta graph and retrieve a Saver - try: - # In case paralle training - import horovod.tensorflow as _ # noqa: F401 - except ImportError: - pass - saver = tf.train.import_meta_graph( - f"{input_checkpoint}.meta", clear_devices=clear_devices + # checkpoint = tf.train.get_checkpoint_state(checkpoint_folder) + # input_checkpoint = checkpoint.model_checkpoint_path + + # # expand the output file to full path + # output_graph = abspath(output) + + # # Before exporting our graph, we need to precise what is our output node + # # This is how TF decides what part of the Graph he has to keep + # # and what part it can dump + # # NOTE: this variable is plural, because you can have multiple output nodes + # # node_names = "energy_test,force_test,virial_test,t_rcut" + + # # We clear devices to allow TensorFlow to control + # # on which device it will load operations + # clear_devices = True + + # # We import the meta graph and retrieve a Saver + # try: + # # In case paralle training + # import horovod.tensorflow as _ # noqa: F401 + # except ImportError: + # pass + # saver = tf.train.import_meta_graph( + # f"{input_checkpoint}.meta", clear_devices=clear_devices + # ) + + # # We retrieve the protobuf graph definition + # graph = tf.get_default_graph() + # try: + # input_graph_def = graph.as_graph_def() + # except google.protobuf.message.DecodeError as e: + # raise GraphTooLargeError( + # "The graph size exceeds 2 GB, the hard limitation of protobuf." + # " Then a DecodeError was raised by protobuf. You should " + # "reduce the size of your model." + # ) from e + # nodes = [n.name for n in input_graph_def.node] + + # # We start a session and restore the graph weights + # with tf.Session() as sess: + # saver.restore(sess, input_checkpoint) + # model_type = run_sess(sess, "model_attr/model_type:0", feed_dict={}).decode( + # "utf-8" + # ) + # if "modifier_attr/type" in nodes: + # modifier_type = run_sess(sess, "modifier_attr/type:0", feed_dict={}).decode( + # "utf-8" + # ) + # else: + # modifier_type = None + # if nvnmd_weight is not None: + # save_weight(sess, nvnmd_weight) # nvnmd + # if model_type != "multi_task": + freeze_graph( + input_file, + output, + # sess, + # input_graph_def, + # nodes, + # model_type, + # modifier_type, + # output_graph, + # node_names, ) - - # We retrieve the protobuf graph definition - graph = tf.get_default_graph() - try: - input_graph_def = graph.as_graph_def() - except google.protobuf.message.DecodeError as e: - raise GraphTooLargeError( - "The graph size exceeds 2 GB, the hard limitation of protobuf." - " Then a DecodeError was raised by protobuf. You should " - "reduce the size of your model." 
-            ) from e
-        nodes = [n.name for n in input_graph_def.node]
-
-        # We start a session and restore the graph weights
-        with tf.Session() as sess:
-            saver.restore(sess, input_checkpoint)
-            model_type = run_sess(sess, "model_attr/model_type:0", feed_dict={}).decode(
-                "utf-8"
-            )
-            if "modifier_attr/type" in nodes:
-                modifier_type = run_sess(sess, "modifier_attr/type:0", feed_dict={}).decode(
-                    "utf-8"
-                )
-            else:
-                modifier_type = None
-            if nvnmd_weight is not None:
-                save_weight(sess, nvnmd_weight)  # nvnmd
-            if model_type != "multi_task":
-                freeze_graph(
-                    sess,
-                    input_graph_def,
-                    nodes,
-                    model_type,
-                    modifier_type,
-                    output_graph,
-                    node_names,
-                )
-            else:
-                freeze_graph_multi(
-                    sess,
-                    input_graph_def,
-                    nodes,
-                    modifier_type,
-                    output_graph,
-                    node_names,
-                    united_model=united_model,
-                )
+    # else:
+    #     freeze_graph_multi(
+    #         sess,
+    #         input_graph_def,
+    #         nodes,
+    #         modifier_type,
+    #         output_graph,
+    #         node_names,
+    #         united_model=united_model,
+    #     )
diff --git a/deepmd/entrypoints/main.py b/deepmd/entrypoints/main.py
index 587bdaace7..ba008f9908 100644
--- a/deepmd/entrypoints/main.py
+++ b/deepmd/entrypoints/main.py
@@ -3,37 +3,23 @@
 import argparse
 import logging
 import textwrap
-from pathlib import (
-    Path,
-)
-from typing import (
-    List,
-    Optional,
-)
-
-from deepmd import (
-    __version__,
-)
-from deepmd.common import (
-    clear_session,
-)
-from deepmd.entrypoints import (
-    compress,
-    convert,
-    doc_train_input,
-    freeze,
-    make_model_devi,
-    neighbor_stat,
-    test,
-    train_dp,
-    transfer,
-)
-from deepmd.loggers import (
-    set_log_handles,
-)
-from deepmd.nvnmd.entrypoints.train import (
-    train_nvnmd,
-)
+from pathlib import Path
+from typing import List
+from typing import Optional
+
+from deepmd import __version__
+from deepmd.common import clear_session
+from deepmd.entrypoints import compress
+from deepmd.entrypoints import convert
+from deepmd.entrypoints import doc_train_input
+from deepmd.entrypoints import freeze
+from deepmd.entrypoints import make_model_devi
+from deepmd.entrypoints import neighbor_stat
+from deepmd.entrypoints import test
+from deepmd.entrypoints import train_dp
+from deepmd.entrypoints import transfer
+from deepmd.loggers import set_log_handles
+from deepmd.nvnmd.entrypoints.train import train_nvnmd

 __all__ = ["main", "parse_args", "get_ll", "main_parser"]

@@ -217,8 +203,8 @@ def main_parser() -> argparse.ArgumentParser:
         ),
     )
     parser_frz.add_argument(
-        "-c",
-        "--checkpoint-folder",
+        "-i",
+        "--input_file",
         type=str,
         default=".",
-        help="path to checkpoint folder",
+        help="path to the input model file",
@@ -230,26 +216,26 @@
         default="frozen_model.pb",
         help="name of graph, will output to the checkpoint folder",
     )
-    parser_frz.add_argument(
-        "-n",
-        "--node-names",
-        type=str,
-        default=None,
-        help="the frozen nodes, if not set, determined from the model type",
-    )
-    parser_frz.add_argument(
-        "-w",
-        "--nvnmd-weight",
-        type=str,
-        default=None,
-        help="the name of weight file (.npy), if set, save the model's weight into the file",
-    )
-    parser_frz.add_argument(
-        "--united-model",
-        action="store_true",
-        default=False,
-        help="When in multi-task mode, freeze all nodes into one united model",
-    )
+    # parser_frz.add_argument(
+    #     "-n",
+    #     "--node-names",
+    #     type=str,
+    #     default=None,
+    #     help="the frozen nodes, if not set, determined from the model type",
+    # )
+    # parser_frz.add_argument(
+    #     "-w",
+    #     "--nvnmd-weight",
+    #     type=str,
+    #     default=None,
+    #     help="the name of weight file (.npy), if set, save the model's weight into 
the file", + # ) + # parser_frz.add_argument( + # "--united-model", + # action="store_true", + # default=False, + # help="When in multi-task mode, freeze all nodes into one united model", + # ) # * test script ******************************************************************** parser_tst = subparsers.add_parser( diff --git a/deepmd/entrypoints/test.py b/deepmd/entrypoints/test.py index 2ecc52ebe4..fde5139e1e 100644 --- a/deepmd/entrypoints/test.py +++ b/deepmd/entrypoints/test.py @@ -1,43 +1,27 @@ """Test trained DeePMD model.""" import logging -from pathlib import ( - Path, -) -from typing import ( - TYPE_CHECKING, - Dict, - List, - Optional, - Tuple, -) +from pathlib import Path +from typing import TYPE_CHECKING +from typing import Dict +from typing import List +from typing import Optional +from typing import Tuple import numpy as np -from deepmd import ( - DeepPotential, -) -from deepmd.common import ( - expand_sys_str, -) +from deepmd import DeepPotential +from deepmd.common import expand_sys_str from deepmd.utils import random as dp_random -from deepmd.utils.data import ( - DeepmdData, -) -from deepmd.utils.weight_avg import ( - weighted_average, -) +from deepmd.utils.data import DeepmdData +from deepmd.utils.weight_avg import weighted_average if TYPE_CHECKING: - from deepmd.infer import ( - DeepDipole, - DeepDOS, - DeepPolar, - DeepPot, - DeepWFC, - ) - from deepmd.infer.deep_tensor import ( - DeepTensor, - ) + from deepmd.infer import DeepDipole + from deepmd.infer import DeepDOS + from deepmd.infer import DeepPolar + from deepmd.infer import DeepPot + from deepmd.infer import DeepWFC + from deepmd.infer.deep_tensor import DeepTensor __all__ = ["test"] @@ -260,7 +244,7 @@ def test_ener( data.add("energy", 1, atomic=False, must=False, high_prec=True) data.add("force", 3, atomic=True, must=False, high_prec=False) data.add("virial", 9, atomic=False, must=False, high_prec=False) - if dp.has_efield: + if dp.has_efield: # False data.add("efield", 3, atomic=True, must=True, high_prec=False) if has_atom_ener: data.add("atom_ener", 1, atomic=True, must=True, high_prec=False) @@ -298,6 +282,13 @@ def test_ener( else: aparam = None + # print(type(coord)) + # print(type(box)) + # print(type(atype)) + # np.save("/workspace/hesensen/deepmd_backend/infer_align/coord_pd.npy", coord) + # np.save("/workspace/hesensen/deepmd_backend/infer_align/box_pd.npy", box) + # np.save("/workspace/hesensen/deepmd_backend/infer_align/atype_pd.npy", atype) + # exit() ret = dp.eval( coord, box, @@ -341,6 +332,40 @@ def test_ener( )[1] diff_e = energy - test_data["energy"][:numb_test].reshape([-1, 1]) + # print(energy) + """ + [[-29857.71310608] + [-29863.80820815] + [-29860.15135615] + [-29854.51192426] + [-29863.13812543] + [-29855.93205087] + [-29855.50978599] + [-29865.49989375] + [-29859.1466963 ] + [-29857.09336879] + [-29862.98884167] + [-29859.11198703] + [-29861.66000458] + [-29861.923259 ] + [-29865.03699558] + [-29860.04100619] + [-29858.07084488] + [-29865.77369217] + [-29856.55031266] + [-29856.55155207] + [-29855.50095994] + [-29855.1020719 ] + [-29855.39086308] + [-29863.13015616] + [-29858.15176772] + [-29860.35238411] + [-29855.99364597] + [-29862.08350903] + [-29861.07073953] + [-29862.65406131]] + """ + # exit() mae_e = mae(diff_e) rmse_e = rmse(diff_e) diff_f = force - test_data["force"][:numb_test] diff --git a/deepmd/entrypoints/train.py b/deepmd/entrypoints/train.py index c806fb3804..05c1af4b8b 100755 --- a/deepmd/entrypoints/train.py +++ b/deepmd/entrypoints/train.py @@ -6,57 +6,31 @@ import 
diff --git a/deepmd/entrypoints/train.py b/deepmd/entrypoints/train.py
index c806fb3804..05c1af4b8b 100755
--- a/deepmd/entrypoints/train.py
+++ b/deepmd/entrypoints/train.py
@@ -6,57 +6,31 @@
 import json
 import logging
 import time
-from typing import (
-    Any,
-    Dict,
-    Optional,
-)
-
-from deepmd.common import (
-    data_requirement,
-    expand_sys_str,
-    j_loader,
-    j_must_have,
-)
-from deepmd.env import (
-    GLOBAL_ENER_FLOAT_PRECISION,
-    reset_default_tf_session_config,
-    tf,
-)
-from deepmd.infer.data_modifier import (
-    DipoleChargeModifier,
-)
-from deepmd.train.run_options import (
-    BUILD,
-    CITATION,
-    WELCOME,
-    RunOptions,
-)
-from deepmd.train.trainer import (
-    DPTrainer,
-)
+from typing import Any
+from typing import Dict
+from typing import Optional
+
+from deepmd.common import data_requirement
+from deepmd.common import expand_sys_str
+from deepmd.common import j_loader
+from deepmd.common import j_must_have
+from deepmd.env import GLOBAL_ENER_FLOAT_PRECISION
+from deepmd.env import reset_default_tf_session_config
+from deepmd.env import tf
+from deepmd.infer.data_modifier import DipoleChargeModifier
+from deepmd.train.run_options import BUILD
+from deepmd.train.run_options import CITATION
+from deepmd.train.run_options import WELCOME
+from deepmd.train.run_options import RunOptions
+from deepmd.train.trainer import DPTrainer
 from deepmd.utils import random as dp_random
-from deepmd.utils.argcheck import (
-    normalize,
-)
-from deepmd.utils.compat import (
-    update_deepmd_input,
-)
-from deepmd.utils.data_system import (
-    DeepmdDataSystem,
-)
-from deepmd.utils.finetune import (
-    replace_model_params_with_pretrained_model,
-)
-from deepmd.utils.multi_init import (
-    replace_model_params_with_frz_multi_model,
-)
-from deepmd.utils.neighbor_stat import (
-    NeighborStat,
-)
-from deepmd.utils.path import (
-    DPPath,
-)
+from deepmd.utils.argcheck import normalize
+from deepmd.utils.compat import update_deepmd_input
+from deepmd.utils.data_system import DeepmdDataSystem
+from deepmd.utils.finetune import replace_model_params_with_pretrained_model
+from deepmd.utils.multi_init import replace_model_params_with_frz_multi_model
+from deepmd.utils.neighbor_stat import NeighborStat
+from deepmd.utils.path import DPPath

 __all__ = ["train"]
@@ -270,12 +244,13 @@ def _do_work(jdata: Dict[str, Any], run_opt: RunOptions, is_compress: bool = Fal
         origin_type_map = get_data(
             jdata["training"]["training_data"], rcut, None, modifier
         ).get_type_map()
         model.build(train_data, stop_batch, origin_type_map=origin_type_map)

     if not is_compress:
         # train the model with the provided systems in a cyclic way
         start_time = time.time()
-        model.train(train_data, valid_data)
+        model.train(train_data, valid_data, stop_batch)
         end_time = time.time()
         log.info("finished training")
         log.info(f"wall time: {(end_time - start_time):.3f} s")
@@ -411,17 +386,20 @@ def get_nbor_stat(jdata, rcut, one_type: bool = False):

     neistat = NeighborStat(ntypes, rcut, one_type=one_type)

     min_nbor_dist, max_nbor_size = neistat.get_stat(train_data)

     # moved from trainer.py as duplicated
     # TODO: this is a simple fix but we should have a clear
     # architecture to call neighbor stat
-    tf.constant(
-        min_nbor_dist,
-        name="train_attr/min_nbor_dist",
-        dtype=GLOBAL_ENER_FLOAT_PRECISION,
-    )
-    tf.constant(max_nbor_size, name="train_attr/max_nbor_size", dtype=tf.int32)
+    # tf.constant(
+    #     min_nbor_dist,
+    #     name="train_attr/min_nbor_dist",
+    #     dtype=GLOBAL_ENER_FLOAT_PRECISION,
+    # )
+    # tf.constant(max_nbor_size, name="train_attr/max_nbor_size", dtype=tf.int32)

     return min_nbor_dist, max_nbor_size
@@ -467,8 +445,10 @@ def update_one_sel(jdata, descriptor):
     if descriptor["type"] == "loc_frame":
         return descriptor
     rcut = descriptor["rcut"]
-    tmp_sel = get_sel(jdata, rcut, one_type=descriptor["type"] in ("se_atten",))
-    sel = descriptor["sel"]
+    tmp_sel = get_sel(
+        jdata, rcut, one_type=descriptor["type"] in ("se_atten",)
+    )  # e.g. [38 72]: the largest neighbor count per type within each atom's cutoff radius
+    sel = descriptor["sel"]  # e.g. [46, 92]
     if isinstance(sel, int):
         # convert to list and finally convert back to int
         sel = [sel]
@@ -486,6 +466,25 @@ def update_one_sel(jdata, descriptor):
                 "not less than %d, but you set it to %d. The accuracy"
                 " of your model may get worse." % (ii, tt, dd)
             )
+    """
+    descriptor:
+    {
+        'type': 'se_e2_a',
+        'sel': [46, 92],
+        'rcut_smth': 0.5,
+        'rcut': 6.0,
+        'neuron': [25, 50, 100],
+        'resnet_dt': False,
+        'axis_neuron': 16,
+        'seed': 1,
+        'activation_function': 'tanh',
+        'type_one_side': False,
+        'precision': 'default',
+        'trainable': True,
+        'exclude_types': [],
+        'set_davg_zero': False
+    }
+    """
     if descriptor["type"] in ("se_atten",):
         descriptor["sel"] = sel = sum(sel)
     return descriptor
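The `sel` check annotated above compares the user-configured neighbor sizes against the maxima observed in the training data. A condensed sketch of that comparison, assuming `sel` and `tmp_sel` are the per-type lists shown in the comments and reusing the warning text from train.py; `warn` is a stand-in for the module logger:

    def check_sel(sel, tmp_sel, warn=print):
        # warn whenever a configured size falls below the observed maximum
        for ii, (tt, dd) in enumerate(zip(tmp_sel, sel)):
            if dd < tt:
                warn(
                    "sel of type %d is not enough! The expected value is "
                    "not less than %d, but you set it to %d. The accuracy"
                    " of your model may get worse." % (ii, tt, dd)
                )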
diff --git a/deepmd/env.py b/deepmd/env.py
index 2917fff1e8..a34fa32897 100644
--- a/deepmd/env.py
+++ b/deepmd/env.py
@@ -4,32 +4,20 @@
 import logging
 import os
 import platform
-from configparser import (
-    ConfigParser,
-)
-from importlib import (
-    import_module,
-    reload,
-)
-from pathlib import (
-    Path,
-)
-from typing import (
-    TYPE_CHECKING,
-    Any,
-    Dict,
-    Tuple,
-)
+from configparser import ConfigParser
+from importlib import import_module
+from importlib import reload
+from pathlib import Path
+from typing import TYPE_CHECKING
+from typing import Any
+from typing import Dict
+from typing import Tuple

 import numpy as np
-from packaging.version import (
-    Version,
-)
+from packaging.version import Version

 if TYPE_CHECKING:
-    from types import (
-        ModuleType,
-    )
+    from types import ModuleType


 def dlopen_library(module: str, filename: str):
@@ -67,6 +55,7 @@ def dlopen_library(module: str, filename: str):
 # import tensorflow v1 compatibility
 try:
+    import paddle
     import tensorflow.compat.v1 as tf

     tf.disable_v2_behavior()
@@ -105,6 +94,7 @@ def dlopen_library(module: str, filename: str):
 # Python library version
 try:
     tf_py_version = tf.version.VERSION
+    pd_py_version = paddle.version.commit
 except AttributeError:
     tf_py_version = tf.__version__
@@ -370,7 +360,26 @@ def get_module(module_name: str) -> "ModuleType":
         raise FileNotFoundError(f"module {module_name} does not exist")
     else:
         try:
-            module = tf.load_op_library(str(module_file))
+            # module = tf.load_op_library(str(module_file))
+            import paddle_deepmd_lib
+
+            module = paddle_deepmd_lib
         except tf.errors.NotFoundError as e:
             # check CXX11_ABI_FLAG compatibility
             # see https://gcc.gnu.org/onlinedocs/libstdc++/manual/using_dual_abi.html
@@ -452,9 +461,9 @@ def _get_package_constants(
 GLOBAL_CONFIG = _get_package_constants()
-MODEL_VERSION = GLOBAL_CONFIG["model_version"]
-TF_VERSION = GLOBAL_CONFIG["tf_version"]
-TF_CXX11_ABI_FLAG = int(GLOBAL_CONFIG["tf_cxx11_abi_flag"])
+MODEL_VERSION = 0
+TF_VERSION = 0
+TF_CXX11_ABI_FLAG = 0

 op_module = get_module("deepmd_op")
 op_grads_module = get_module("op_grads")
@@ -464,11 +473,13 @@ def _get_package_constants(
 if dp_float_prec in ("high", ""):
     # default is high
     GLOBAL_TF_FLOAT_PRECISION = tf.float64
+    GLOBAL_PD_FLOAT_PRECISION = paddle.float64
     GLOBAL_NP_FLOAT_PRECISION = np.float64
     GLOBAL_ENER_FLOAT_PRECISION = np.float64
     global_float_prec = "double"
 elif dp_float_prec == "low":
     GLOBAL_TF_FLOAT_PRECISION = tf.float32
+    GLOBAL_PD_FLOAT_PRECISION = paddle.float32
     GLOBAL_NP_FLOAT_PRECISION = np.float32
     GLOBAL_ENER_FLOAT_PRECISION = np.float64
     global_float_prec = "float"
@@ -496,17 +507,33 @@ def global_cvt_2_tf_float(xx: tf.Tensor) -> tf.Tensor:
     return tf.cast(xx, GLOBAL_TF_FLOAT_PRECISION)


-def global_cvt_2_ener_float(xx: tf.Tensor) -> tf.Tensor:
+def global_cvt_2_pd_float(xx: paddle.Tensor) -> paddle.Tensor:
+    """Cast tensor to the globally set Paddle precision.
+
+    Parameters
+    ----------
+    xx : paddle.Tensor
+        input tensor
+
+    Returns
+    -------
+    paddle.Tensor
+        output tensor cast to `GLOBAL_PD_FLOAT_PRECISION`
+    """
+    return paddle.cast(xx, GLOBAL_PD_FLOAT_PRECISION)
+
+
+def global_cvt_2_ener_float(xx: paddle.Tensor) -> paddle.Tensor:
     """Cast tensor to globally set energy precision.
    Parameters
    ----------
-    xx : tf.Tensor
+    xx : paddle.Tensor
        input tensor

    Returns
    -------
-    tf.Tensor
+    paddle.Tensor
        output tensor cast to `GLOBAL_ENER_FLOAT_PRECISION`
    """
-    return tf.cast(xx, GLOBAL_ENER_FLOAT_PRECISION)
+    return paddle.cast(xx, GLOBAL_ENER_FLOAT_PRECISION)
diff --git a/deepmd/fit/ener.py b/deepmd/fit/ener.py
index f482173495..036bdc54f9 100644
--- a/deepmd/fit/ener.py
+++ b/deepmd/fit/ener.py
@@ -1,54 +1,37 @@
 import logging
-from typing import (
-    List,
-    Optional,
-)
+from typing import List
+from typing import Optional

 import numpy as np
-
-from deepmd.common import (
-    add_data_requirement,
-    cast_precision,
-    get_activation_func,
-    get_precision,
-)
-from deepmd.env import (
-    GLOBAL_TF_FLOAT_PRECISION,
-    global_cvt_2_tf_float,
-    tf,
-)
-from deepmd.fit.fitting import (
-    Fitting,
-)
-from deepmd.infer import (
-    DeepPotential,
-)
-from deepmd.nvnmd.fit.ener import (
-    one_layer_nvnmd,
-)
-from deepmd.nvnmd.utils.config import (
-    nvnmd_cfg,
-)
-from deepmd.utils.errors import (
-    GraphWithoutTensorError,
-)
-from deepmd.utils.graph import (
-    get_fitting_net_variables_from_graph_def,
-    get_tensor_by_name_from_graph,
-)
+from paddle import nn
+
+from deepmd.common import add_data_requirement
+from deepmd.common import cast_precision
+from deepmd.common import get_activation_func
+from deepmd.common import get_precision
+from deepmd.env import GLOBAL_PD_FLOAT_PRECISION
+from deepmd.env import GLOBAL_TF_FLOAT_PRECISION
+from deepmd.env import global_cvt_2_pd_float
+from deepmd.env import global_cvt_2_tf_float
+from deepmd.env import paddle
+from deepmd.env import tf
+from deepmd.fit.fitting import Fitting
+from deepmd.infer import DeepPotential
+from deepmd.nvnmd.fit.ener import one_layer_nvnmd
+from deepmd.nvnmd.utils.config import nvnmd_cfg
+from deepmd.utils.errors import GraphWithoutTensorError
+from deepmd.utils.graph import get_fitting_net_variables_from_graph_def
+from deepmd.utils.graph import get_tensor_by_name_from_graph
+from deepmd.utils.network import OneLayer as OneLayer_deepmd
 from deepmd.utils.network import one_layer as one_layer_deepmd
-from deepmd.utils.network import (
-    one_layer_rand_seed_shift,
-)
-from deepmd.utils.spin import (
-    Spin,
-)
+from deepmd.utils.network import one_layer_rand_seed_shift
+from deepmd.utils.spin import Spin

 log = logging.getLogger(__name__)


-@Fitting.register("ener")
-class EnerFitting(Fitting):
+# @Fitting.register("ener")
+class EnerFitting(nn.Layer):
     r"""Fitting the energy of the system. The force and the virial can also be trained.
The potential energy :math:`E` is a fitting network function of the descriptor :math:`\mathcal{D}`: @@ -121,7 +104,7 @@ class EnerFitting(Fitting): def __init__( self, - descrpt: tf.Tensor, + descrpt: paddle.Tensor, neuron: List[int] = [120, 120, 120], resnet_dt: bool = True, numb_fparam: int = 0, @@ -138,6 +121,7 @@ def __init__( use_aparam_as_mask: bool = False, spin: Optional[Spin] = None, ) -> None: + super().__init__(name_scope="EnerFitting") """Constructor.""" # model param self.ntypes = descrpt.get_ntypes() @@ -180,13 +164,15 @@ def __init__( self.atom_ener_v = atom_ener for at, ae in enumerate(atom_ener): if ae is not None: - self.atom_ener.append( - tf.constant(ae, GLOBAL_TF_FLOAT_PRECISION, name="atom_%d_ener" % at) - ) + self.atom_ener.append(paddle.to_tensor(ae, GLOBAL_PD_FLOAT_PRECISION)) else: self.atom_ener.append(None) self.useBN = False self.bias_atom_e = np.zeros(self.ntypes, dtype=np.float64) + self.register_buffer( + "t_bias_atom_e", + paddle.to_tensor(self.bias_atom_e), + ) # data requirement if self.numb_fparam > 0: add_data_requirement( @@ -212,6 +198,96 @@ def __init__( len(self.layer_name) == len(self.n_neuron) + 1 ), "length of layer_name should be that of n_neuron + 1" + type_suffix = "" + suffix = "" + self.one_layers = nn.LayerList() + self.final_layers = nn.LayerList() + ntypes_atom = self.ntypes - self.ntypes_spin + for type_i in range(0, ntypes_atom): + type_i_layers = nn.LayerList() + for ii in range(0, len(self.n_neuron)): + if self.layer_name is not None and self.layer_name[ii] is not None: + layer_suffix = "share_" + self.layer_name[ii] + type_suffix + else: + layer_suffix = "layer_" + str(ii) + type_suffix + suffix + + if ii >= 1 and self.n_neuron[ii] == self.n_neuron[ii - 1]: + type_i_layers.append( + OneLayer_deepmd( + self.n_neuron[ii - 1], + self.n_neuron[ii], + activation_fn=self.fitting_activation_fn, + precision=self.fitting_precision, + name=layer_suffix, + seed=self.seed, + use_timestep=self.resnet_dt, + trainable=self.trainable[ii], + ) + ) + else: + type_i_layers.append( + OneLayer_deepmd( + self.dim_descrpt + self.numb_fparam + self.numb_aparam, + self.n_neuron[ii], + activation_fn=self.fitting_activation_fn, + precision=self.fitting_precision, + name=layer_suffix, + seed=self.seed, + trainable=self.trainable[ii], + ) + ) + if (not self.uniform_seed) and (self.seed is not None): + self.seed += self.seed_shift + + self.one_layers.append(type_i_layers) + self.final_layers.append( + OneLayer_deepmd( + self.n_neuron[-1], + 1, + activation_fn=None, + precision=self.fitting_precision, + bavg=self.bias_atom_e, + name=layer_suffix, + seed=self.seed, + trainable=self.trainable[-1], + ) + ) + + # print("create bias_atom_e", self.bias_atom_e.shape, self.bias_atom_e) + # self.register_buffer( + # "t_bias_atom_e", + # paddle.to_tensor(self.bias_atom_e), + # ) + if self.numb_fparam > 0: + if self.fparam_avg is None: + self.fparam_avg = 0.0 + if self.fparam_inv_std is None: + self.fparam_inv_std = 1.0 + if self.numb_aparam > 0: + if self.aparam_avg is None: + self.aparam_avg = 0.0 + if self.aparam_inv_std is None: + self.aparam_inv_std = 1.0 + + if self.numb_fparam > 0: + self.register_buffer( + "t_fparam_avg", + paddle.to_tensor(self.fparam_avg), + ) + self.register_buffer( + "t_fparam_istd", + paddle.to_tensor(self.fparam_inv_std), + ) + if self.numb_aparam > 0: + self.register_buffer( + "t_aparam_avg", + paddle.to_tensor(self.aparam_avg), + ) + self.register_buffer( + "t_aparam_istd", + paddle.to_tensor(self.aparam_inv_std), + ) + def 
get_numb_fparam(self) -> int: """Get the number of frame parameters.""" return self.numb_fparam @@ -237,6 +313,11 @@ def compute_output_stats(self, all_stat: dict, mixed_type: bool = False) -> None self.bias_atom_e = self._compute_output_stats( all_stat, rcond=self.rcond, mixed_type=mixed_type ) + paddle.assign(self.bias_atom_e, self.t_bias_atom_e) + # self.register_buffer( + # "t_bias_atom_e", + # paddle.to_tensor(self.bias_atom_e), + # ) def _compute_output_stats(self, all_stat, rcond=1e-3, mixed_type=False): data = all_stat["energy"] @@ -335,7 +416,7 @@ def compute_input_stats(self, all_stat: dict, protection: float = 1e-2) -> None: def _compute_std(self, sumv2, sumv, sumn): return np.sqrt(sumv2 / sumn - np.multiply(sumv / sumn, sumv / sumn)) - @cast_precision + # @cast_precision def _build_lower( self, start_index, @@ -346,103 +427,91 @@ def _build_lower( bias_atom_e=0.0, type_suffix="", suffix="", - reuse=None, + # reuse=None, + type_i=None, ): # cut-out inputs - inputs_i = tf.slice(inputs, [0, start_index, 0], [-1, natoms, -1]) - inputs_i = tf.reshape(inputs_i, [-1, self.dim_descrpt]) + inputs_i = paddle.slice( + inputs, + [0, 1, 2], + [0, start_index, 0], + [inputs.shape[0], start_index + natoms, inputs.shape[2]], + ) + inputs_i = paddle.reshape(inputs_i, [-1, self.dim_descrpt]) layer = inputs_i if fparam is not None: - ext_fparam = tf.tile(fparam, [1, natoms]) - ext_fparam = tf.reshape(ext_fparam, [-1, self.numb_fparam]) - ext_fparam = tf.cast(ext_fparam, self.fitting_precision) - layer = tf.concat([layer, ext_fparam], axis=1) + ext_fparam = paddle.tile(fparam, [1, natoms]) + ext_fparam = paddle.reshape(ext_fparam, [-1, self.numb_fparam]) + ext_fparam = paddle.cast(ext_fparam, self.fitting_precision) + layer = paddle.concat([layer, ext_fparam], axis=1) if aparam is not None: - ext_aparam = tf.slice( + ext_aparam = paddle.slice( aparam, + [0, 1], [0, start_index * self.numb_aparam], - [-1, natoms * self.numb_aparam], + [ + aparam.shape[0], + start_index * self.numb_aparam + natoms * self.numb_aparam, + ], ) - ext_aparam = tf.reshape(ext_aparam, [-1, self.numb_aparam]) - ext_aparam = tf.cast(ext_aparam, self.fitting_precision) - layer = tf.concat([layer, ext_aparam], axis=1) - - if nvnmd_cfg.enable: - one_layer = one_layer_nvnmd - else: - one_layer = one_layer_deepmd + ext_aparam = paddle.reshape(ext_aparam, [-1, self.numb_aparam]) + ext_aparam = paddle.cast(ext_aparam, self.fitting_precision) + layer = paddle.concat([layer, ext_aparam], axis=1) + + # if nvnmd_cfg.enable: + # one_layer = one_layer_nvnmd + # else: + # one_layer = one_layer_deepmd for ii in range(0, len(self.n_neuron)): - if self.layer_name is not None and self.layer_name[ii] is not None: - layer_suffix = "share_" + self.layer_name[ii] + type_suffix - layer_reuse = tf.AUTO_REUSE - else: - layer_suffix = "layer_" + str(ii) + type_suffix + suffix - layer_reuse = reuse + # if self.layer_name is not None and self.layer_name[ii] is not None: + # layer_suffix = "share_" + self.layer_name[ii] + type_suffix + # layer_reuse = tf.AUTO_REUSE + # else: + # layer_suffix = "layer_" + str(ii) + type_suffix + suffix + # layer_reuse = reuse if ii >= 1 and self.n_neuron[ii] == self.n_neuron[ii - 1]: - layer += one_layer( - layer, - self.n_neuron[ii], - name=layer_suffix, - reuse=layer_reuse, - seed=self.seed, - use_timestep=self.resnet_dt, - activation_fn=self.fitting_activation_fn, - precision=self.fitting_precision, - trainable=self.trainable[ii], - uniform_seed=self.uniform_seed, - initial_variables=self.fitting_net_variables, - 
mixed_prec=self.mixed_prec, - ) + layer += self.one_layers[type_i][ii](layer) else: - layer = one_layer( - layer, - self.n_neuron[ii], - name=layer_suffix, - reuse=layer_reuse, - seed=self.seed, - activation_fn=self.fitting_activation_fn, - precision=self.fitting_precision, - trainable=self.trainable[ii], - uniform_seed=self.uniform_seed, - initial_variables=self.fitting_net_variables, - mixed_prec=self.mixed_prec, - ) + layer = self.one_layers[type_i][ii](layer) + # print(f"use {ii} of {len(self.one_layers)}_{type_i}") + # if (not self.uniform_seed) and (self.seed is not None): + # self.seed += self.seed_shift + # if self.layer_name is not None and self.layer_name[-1] is not None: + # layer_suffix = "share_" + self.layer_name[-1] + type_suffix + # layer_reuse = tf.AUTO_REUSE + # else: + # layer_suffix = "final_layer" + type_suffix + suffix + # layer_reuse = reuse if (not self.uniform_seed) and (self.seed is not None): self.seed += self.seed_shift - if self.layer_name is not None and self.layer_name[-1] is not None: - layer_suffix = "share_" + self.layer_name[-1] + type_suffix - layer_reuse = tf.AUTO_REUSE - else: - layer_suffix = "final_layer" + type_suffix + suffix - layer_reuse = reuse - final_layer = one_layer( + final_layer = self.final_layers[type_i]( layer, - 1, - activation_fn=None, - bavg=bias_atom_e, - name=layer_suffix, - reuse=layer_reuse, - seed=self.seed, - precision=self.fitting_precision, - trainable=self.trainable[-1], - uniform_seed=self.uniform_seed, - initial_variables=self.fitting_net_variables, - mixed_prec=self.mixed_prec, - final_layer=True, + # 1, + # activation_fn=None, + # bavg=bias_atom_e, + # name=layer_suffix, + # reuse=layer_reuse, + # seed=self.seed, + # precision=self.fitting_precision, + # trainable=self.trainable[-1], + # uniform_seed=self.uniform_seed, + # initial_variables=self.fitting_net_variables, + # mixed_prec=self.mixed_prec, + # final_layer=True, ) if (not self.uniform_seed) and (self.seed is not None): self.seed += self.seed_shift return final_layer - def build( + def forward( self, - inputs: tf.Tensor, - natoms: tf.Tensor, + inputs: paddle.Tensor, + natoms: paddle.Tensor, input_dict: Optional[dict] = None, reuse: Optional[bool] = None, suffix: str = "", - ) -> tf.Tensor: + ) -> paddle.Tensor: """Build the computational graph for fitting net. 
Parameters @@ -504,59 +573,18 @@ def build( self.bias_atom_e[type_i] = self.bias_atom_e[type_i] self.bias_atom_e = self.bias_atom_e[:ntypes_atom] - with tf.variable_scope("fitting_attr" + suffix, reuse=reuse): - t_dfparam = tf.constant(self.numb_fparam, name="dfparam", dtype=tf.int32) - t_daparam = tf.constant(self.numb_aparam, name="daparam", dtype=tf.int32) - self.t_bias_atom_e = tf.get_variable( - "t_bias_atom_e", - self.bias_atom_e.shape, - dtype=GLOBAL_TF_FLOAT_PRECISION, - trainable=False, - initializer=tf.constant_initializer(self.bias_atom_e), - ) - if self.numb_fparam > 0: - t_fparam_avg = tf.get_variable( - "t_fparam_avg", - self.numb_fparam, - dtype=GLOBAL_TF_FLOAT_PRECISION, - trainable=False, - initializer=tf.constant_initializer(self.fparam_avg), - ) - t_fparam_istd = tf.get_variable( - "t_fparam_istd", - self.numb_fparam, - dtype=GLOBAL_TF_FLOAT_PRECISION, - trainable=False, - initializer=tf.constant_initializer(self.fparam_inv_std), - ) - if self.numb_aparam > 0: - t_aparam_avg = tf.get_variable( - "t_aparam_avg", - self.numb_aparam, - dtype=GLOBAL_TF_FLOAT_PRECISION, - trainable=False, - initializer=tf.constant_initializer(self.aparam_avg), - ) - t_aparam_istd = tf.get_variable( - "t_aparam_istd", - self.numb_aparam, - dtype=GLOBAL_TF_FLOAT_PRECISION, - trainable=False, - initializer=tf.constant_initializer(self.aparam_inv_std), - ) - - inputs = tf.reshape(inputs, [-1, natoms[0], self.dim_descrpt]) + inputs = paddle.reshape(inputs, [-1, natoms[0], self.dim_descrpt]) if len(self.atom_ener): # only for atom_ener nframes = input_dict.get("nframes") if nframes is not None: # like inputs, but we don't want to add a dependency on inputs - inputs_zero = tf.zeros( + inputs_zero = paddle.zeros( (nframes, natoms[0], self.dim_descrpt), - dtype=GLOBAL_TF_FLOAT_PRECISION, + dtype=GLOBAL_PD_FLOAT_PRECISION, ) else: - inputs_zero = tf.zeros_like(inputs, dtype=GLOBAL_TF_FLOAT_PRECISION) + inputs_zero = paddle.zeros_like(inputs, dtype=GLOBAL_PD_FLOAT_PRECISION) if bias_atom_e is not None: assert len(bias_atom_e) == self.ntypes @@ -564,37 +592,42 @@ def build( fparam = None if self.numb_fparam > 0: fparam = input_dict["fparam"] - fparam = tf.reshape(fparam, [-1, self.numb_fparam]) - fparam = (fparam - t_fparam_avg) * t_fparam_istd + fparam = paddle.reshape(fparam, [-1, self.numb_fparam]) + fparam = (fparam - self.t_fparam_avg) * self.t_fparam_istd aparam = None if not self.use_aparam_as_mask: if self.numb_aparam > 0: aparam = input_dict["aparam"] - aparam = tf.reshape(aparam, [-1, self.numb_aparam]) - aparam = (aparam - t_aparam_avg) * t_aparam_istd - aparam = tf.reshape(aparam, [-1, self.numb_aparam * natoms[0]]) + aparam = paddle.reshape(aparam, [-1, self.numb_aparam]) + aparam = (aparam - self.t_aparam_avg) * self.t_aparam_istd + aparam = paddle.reshape(aparam, [-1, self.numb_aparam * natoms[0]]) - atype_nall = tf.reshape(atype, [-1, natoms[1]]) - self.atype_nloc = tf.slice( - atype_nall, [0, 0], [-1, natoms[0]] + atype_nall = paddle.reshape(atype, [-1, natoms[1]]) + self.atype_nloc = paddle.slice( + atype_nall, [0, 1], [0, 0], [atype_nall.shape[0], natoms[0]] ) ## lammps will make error - atype_filter = tf.cast(self.atype_nloc >= 0, GLOBAL_TF_FLOAT_PRECISION) - self.atype_nloc = tf.reshape(self.atype_nloc, [-1]) + atype_filter = paddle.cast(self.atype_nloc >= 0, GLOBAL_PD_FLOAT_PRECISION) + self.atype_nloc = paddle.reshape(self.atype_nloc, [-1]) # prevent embedding_lookup error, # but the filter will be applied anyway - self.atype_nloc = tf.clip_by_value(self.atype_nloc, 0, self.ntypes 
- 1)
+        self.atype_nloc = paddle.clip(self.atype_nloc, 0, self.ntypes - 1)

         ## if spin is used
         if self.spin is not None:
-            self.atype_nloc = tf.slice(
-                atype_nall, [0, 0], [-1, tf.reduce_sum(natoms[2 : 2 + ntypes_atom])]
+            self.atype_nloc = paddle.slice(
+                atype_nall,
+                [0, 1],
+                [0, 0],
+                [-1, paddle.sum(natoms[2 : 2 + ntypes_atom]).item()],
             )
-            atype_filter = tf.cast(self.atype_nloc >= 0, GLOBAL_TF_FLOAT_PRECISION)
-            self.atype_nloc = tf.reshape(self.atype_nloc, [-1])
+            atype_filter = paddle.cast(self.atype_nloc >= 0, GLOBAL_PD_FLOAT_PRECISION)
+            self.atype_nloc = paddle.reshape(self.atype_nloc, [-1])

         if type_embedding is not None:
-            atype_embed = tf.nn.embedding_lookup(type_embedding, self.atype_nloc)
+            atype_embed = paddle.nn.functional.embedding(
+                self.atype_nloc, type_embedding
+            )
         else:
             atype_embed = None
@@ -613,7 +658,8 @@
                 bias_atom_e=0.0,
                 type_suffix="_type_" + str(type_i),
                 suffix=suffix,
-                reuse=reuse,
+                # reuse=reuse,
+                type_i=type_i,
             )
             # concat the results
             if type_i < len(self.atom_ener) and self.atom_ener[type_i] is not None:
@@ -626,82 +672,80 @@
                     bias_atom_e=0.0,
                     type_suffix="_type_" + str(type_i),
                     suffix=suffix,
-                    reuse=True,
+                    # reuse=True,
+                    type_i=type_i,
                 )
                 final_layer -= zero_layer
-            final_layer = tf.reshape(
-                final_layer, [tf.shape(inputs)[0], natoms[2 + type_i]]
+            final_layer = paddle.reshape(
+                final_layer, [paddle.shape(inputs)[0], natoms[2 + type_i]]
             )
             outs_list.append(final_layer)
             start_index += natoms[2 + type_i]
         # concat the results
         # concat once may be faster than multiple concat
-        outs = tf.concat(outs_list, axis=1)
+        outs = paddle.concat(outs_list, axis=1)
         # with type embedding
         else:
-            atype_embed = tf.cast(atype_embed, GLOBAL_TF_FLOAT_PRECISION)
-            type_shape = atype_embed.get_shape().as_list()
-            inputs = tf.concat(
-                [tf.reshape(inputs, [-1, self.dim_descrpt]), atype_embed], axis=1
+            atype_embed = paddle.cast(atype_embed, GLOBAL_PD_FLOAT_PRECISION)
+            type_shape = atype_embed.shape
+            inputs = paddle.concat(
+                [paddle.reshape(inputs, [-1, self.dim_descrpt]), atype_embed], axis=1
             )
             original_dim_descrpt = self.dim_descrpt
             self.dim_descrpt = self.dim_descrpt + type_shape[1]
-            inputs = tf.reshape(inputs, [-1, natoms[0], self.dim_descrpt])
-            final_layer = self._build_lower(
-                0,
-                natoms[0],
-                inputs,
-                fparam,
-                aparam,
-                bias_atom_e=0.0,
-                suffix=suffix,
-                reuse=reuse,
-            )
+            inputs = paddle.reshape(inputs, [-1, natoms[0], self.dim_descrpt])
+            final_layer = inputs
+            for layer_j in range(0 * ntypes_atom, (0 + 1) * ntypes_atom):
+                final_layer = self.one_layers[layer_j](final_layer)
+            final_layer = self.final_layers[0](final_layer)
             if len(self.atom_ener):
                 # remove contribution in vacuum
-                inputs_zero = tf.concat(
-                    [tf.reshape(inputs_zero, [-1, original_dim_descrpt]), atype_embed],
+                inputs_zero = paddle.concat(
+                    [
+                        paddle.reshape(inputs_zero, [-1, original_dim_descrpt]),
+                        atype_embed,
+                    ],
                     axis=1,
                 )
-                inputs_zero = tf.reshape(inputs_zero, [-1, natoms[0], self.dim_descrpt])
-                zero_layer = self._build_lower(
-                    0,
-                    natoms[0],
-                    inputs_zero,
-                    fparam,
-                    aparam,
-                    bias_atom_e=0.0,
-                    suffix=suffix,
-                    reuse=True,
-                )
-                # atomic energy will be stored in `self.t_bias_atom_e` which is not trainable
+                inputs_zero = paddle.reshape(
+                    inputs_zero, [-1, natoms[0], self.dim_descrpt]
+                )
+                zero_layer = inputs_zero
+                for layer_j in range(0 * ntypes_atom, (0 + 1) * ntypes_atom):
+                    zero_layer = self.one_layers[layer_j](zero_layer)
+                zero_layer = self.final_layers[0](zero_layer)
                 final_layer -= zero_layer
-            outs = tf.reshape(final_layer, [tf.shape(inputs)[0], natoms[0]])
+            outs = paddle.reshape(final_layer, [paddle.shape(inputs)[0], natoms[0]])
         # add bias
         self.atom_ener_before = outs * atype_filter
-        self.add_type = tf.reshape(
-            tf.nn.embedding_lookup(self.t_bias_atom_e, self.atype_nloc),
-            [tf.shape(inputs)[0], tf.reduce_sum(natoms[2 : 2 + ntypes_atom])],
+        self.add_type = paddle.reshape(
+            paddle.nn.functional.embedding(
+                self.atype_nloc, self.t_bias_atom_e.reshape([2, -1])
+            ),
+            [paddle.shape(inputs)[0], paddle.sum(natoms[2 : 2 + ntypes_atom]).item()],
         )
         outs = outs + self.add_type
         outs *= atype_filter
         self.atom_ener_after = outs

         if self.tot_ener_zero:
             force_tot_ener = 0.0
-            outs = tf.reshape(outs, [-1, tf.reduce_sum(natoms[2 : 2 + ntypes_atom])])
-            outs_mean = tf.reshape(tf.reduce_mean(outs, axis=1), [-1, 1])
-            outs_mean = outs_mean - tf.ones_like(
-                outs_mean, dtype=GLOBAL_TF_FLOAT_PRECISION
+            outs = paddle.reshape(
+                outs, [-1, paddle.sum(natoms[2 : 2 + ntypes_atom]).item()]
+            )
+            outs_mean = paddle.reshape(paddle.mean(outs, axis=1), [-1, 1])
+            outs_mean = outs_mean - paddle.ones_like(
+                outs_mean, dtype=GLOBAL_PD_FLOAT_PRECISION
             ) * (
                 force_tot_ener
-                / global_cvt_2_tf_float(tf.reduce_sum(natoms[2 : 2 + ntypes_atom]))
+                / global_cvt_2_pd_float(paddle.sum(natoms[2 : 2 + ntypes_atom]))
             )
             outs = outs - outs_mean
-            outs = tf.reshape(outs, [-1])
-
-        tf.summary.histogram("fitting_net_output", outs)
-        return tf.reshape(outs, [-1])
+            outs = paddle.reshape(outs, [-1])
+        return paddle.reshape(outs, [-1])

     def init_variables(
         self,
diff --git a/deepmd/fit/ener_tf.py b/deepmd/fit/ener_tf.py
new file mode 100644
index 0000000000..1f77d2fdb6
--- /dev/null
+++ b/deepmd/fit/ener_tf.py
@@ -0,0 +1,888 @@
+# SPDX-License-Identifier: LGPL-3.0-or-later
+import logging
+from typing import List
+from typing import Optional
+
+import numpy as np
+
+from deepmd.common import add_data_requirement
+from deepmd.common import cast_precision
+from deepmd.common import get_activation_func
+from deepmd.common import get_precision
+from deepmd.env import GLOBAL_TF_FLOAT_PRECISION
+from deepmd.env import global_cvt_2_tf_float
+from deepmd.env import tf
+from deepmd.fit.fitting import Fitting
+from deepmd.infer import DeepPotential
+from deepmd.loss.ener import EnerDipoleLoss
+from deepmd.loss.ener import EnerSpinLoss
+from deepmd.loss.ener import EnerStdLoss
+from deepmd.loss.loss import Loss
+from deepmd.nvnmd.fit.ener import one_layer_nvnmd
+from deepmd.nvnmd.utils.config import nvnmd_cfg
+from deepmd.utils.errors import GraphWithoutTensorError
+from deepmd.utils.graph import get_fitting_net_variables_from_graph_def
+from deepmd.utils.graph import get_tensor_by_name_from_graph
+from deepmd.utils.network import one_layer as one_layer_deepmd
+from deepmd.utils.network import
one_layer_rand_seed_shift +from deepmd.utils.spin import Spin + +log = logging.getLogger(__name__) + + +@Fitting.register("ener") +class EnerFitting(Fitting): + r"""Fitting the energy of the system. The force and the virial can also be trained. + + The potential energy :math:`E` is a fitting network function of the descriptor :math:`\mathcal{D}`: + + .. math:: + E(\mathcal{D}) = \mathcal{L}^{(n)} \circ \mathcal{L}^{(n-1)} + \circ \cdots \circ \mathcal{L}^{(1)} \circ \mathcal{L}^{(0)} + + The first :math:`n` hidden layers :math:`\mathcal{L}^{(0)}, \cdots, \mathcal{L}^{(n-1)}` are given by + + .. math:: + \mathbf{y}=\mathcal{L}(\mathbf{x};\mathbf{w},\mathbf{b})= + \boldsymbol{\phi}(\mathbf{x}^T\mathbf{w}+\mathbf{b}) + + where :math:`\mathbf{x} \in \mathbb{R}^{N_1}` is the input vector and :math:`\mathbf{y} \in \mathbb{R}^{N_2}` + is the output vector. :math:`\mathbf{w} \in \mathbb{R}^{N_1 \times N_2}` and + :math:`\mathbf{b} \in \mathbb{R}^{N_2}` are weights and biases, respectively, + both of which are trainable if `trainable[i]` is `True`. :math:`\boldsymbol{\phi}` + is the activation function. + + The output layer :math:`\mathcal{L}^{(n)}` is given by + + .. math:: + \mathbf{y}=\mathcal{L}^{(n)}(\mathbf{x};\mathbf{w},\mathbf{b})= + \mathbf{x}^T\mathbf{w}+\mathbf{b} + + where :math:`\mathbf{x} \in \mathbb{R}^{N_{n-1}}` is the input vector and :math:`\mathbf{y} \in \mathbb{R}` + is the output scalar. :math:`\mathbf{w} \in \mathbb{R}^{N_{n-1}}` and + :math:`\mathbf{b} \in \mathbb{R}` are weights and bias, respectively, + both of which are trainable if `trainable[n]` is `True`. + + Parameters + ---------- + descrpt + The descrptor :math:`\mathcal{D}` + neuron + Number of neurons :math:`N` in each hidden layer of the fitting net + resnet_dt + Time-step `dt` in the resnet construction: + :math:`y = x + dt * \phi (Wx + b)` + numb_fparam + Number of frame parameter + numb_aparam + Number of atomic parameter + rcond + The condition number for the regression of atomic energy. + tot_ener_zero + Force the total energy to zero. Useful for the charge fitting. + trainable + If the weights of fitting net are trainable. + Suppose that we have :math:`N_l` hidden layers in the fitting net, + this list is of length :math:`N_l + 1`, specifying if the hidden layers and the output layer are trainable. + seed + Random seed for initializing the network parameters. + atom_ener + Specifying atomic energy contribution in vacuum. The `set_davg_zero` key in the descrptor should be set. + activation_function + The activation function :math:`\boldsymbol{\phi}` in the embedding net. Supported options are |ACTIVATION_FN| + precision + The precision of the embedding net parameters. Supported options are |PRECISION| + uniform_seed + Only for the purpose of backward compatibility, retrieves the old behavior of using the random seed + layer_name : list[Optional[str]], optional + The name of the each layer. If two layers, either in the same fitting or different fittings, + have the same name, they will share the same neural network parameters. + use_aparam_as_mask: bool, optional + If True, the atomic parameters will be used as a mask that determines the atom is real/virtual. + And the aparam will not be used as the atomic parameters for embedding. 
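+    spin : Spin, optional
+        The settings for systems with spin; ``None`` if spin is not used.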
+ """ + + def __init__( + self, + descrpt: tf.Tensor, + neuron: List[int] = [120, 120, 120], + resnet_dt: bool = True, + numb_fparam: int = 0, + numb_aparam: int = 0, + rcond: Optional[float] = None, + tot_ener_zero: bool = False, + trainable: Optional[List[bool]] = None, + seed: Optional[int] = None, + atom_ener: List[float] = [], + activation_function: str = "tanh", + precision: str = "default", + uniform_seed: bool = False, + layer_name: Optional[List[Optional[str]]] = None, + use_aparam_as_mask: bool = False, + spin: Optional[Spin] = None, + **kwargs, + ) -> None: + """Constructor.""" + # model param + self.ntypes = descrpt.get_ntypes() + self.dim_descrpt = descrpt.get_dim_out() + self.use_aparam_as_mask = use_aparam_as_mask + # args = ()\ + # .add('numb_fparam', int, default = 0)\ + # .add('numb_aparam', int, default = 0)\ + # .add('neuron', list, default = [120,120,120], alias = 'n_neuron')\ + # .add('resnet_dt', bool, default = True)\ + # .add('rcond', float, default = 1e-3) \ + # .add('tot_ener_zero', bool, default = False) \ + # .add('seed', int) \ + # .add('atom_ener', list, default = [])\ + # .add("activation_function", str, default = "tanh")\ + # .add("precision", str, default = "default")\ + # .add("trainable", [list, bool], default = True) + self.numb_fparam = numb_fparam + self.numb_aparam = numb_aparam + self.n_neuron = neuron + self.resnet_dt = resnet_dt + self.rcond = rcond + self.seed = seed + self.uniform_seed = uniform_seed + self.spin = spin + self.ntypes_spin = self.spin.get_ntypes_spin() if self.spin is not None else 0 + self.seed_shift = one_layer_rand_seed_shift() + self.tot_ener_zero = tot_ener_zero + self.fitting_activation_fn = get_activation_func(activation_function) + self.fitting_precision = get_precision(precision) + self.trainable = trainable + if self.trainable is None: + self.trainable = [True for ii in range(len(self.n_neuron) + 1)] + if isinstance(self.trainable, bool): + self.trainable = [self.trainable] * (len(self.n_neuron) + 1) + assert ( + len(self.trainable) == len(self.n_neuron) + 1 + ), "length of trainable should be that of n_neuron + 1" + self.atom_ener = [] + self.atom_ener_v = atom_ener + for at, ae in enumerate(atom_ener): + if ae is not None: + self.atom_ener.append( + tf.constant(ae, GLOBAL_TF_FLOAT_PRECISION, name="atom_%d_ener" % at) + ) + else: + self.atom_ener.append(None) + self.useBN = False + self.bias_atom_e = np.zeros(self.ntypes, dtype=np.float64) + # data requirement + if self.numb_fparam > 0: + add_data_requirement( + "fparam", self.numb_fparam, atomic=False, must=True, high_prec=False + ) + self.fparam_avg = None + self.fparam_std = None + self.fparam_inv_std = None + if self.numb_aparam > 0: + add_data_requirement( + "aparam", self.numb_aparam, atomic=True, must=True, high_prec=False + ) + self.aparam_avg = None + self.aparam_std = None + self.aparam_inv_std = None + + self.fitting_net_variables = None + self.mixed_prec = None + self.layer_name = layer_name + if self.layer_name is not None: + assert isinstance(self.layer_name, list), "layer_name should be a list" + assert ( + len(self.layer_name) == len(self.n_neuron) + 1 + ), "length of layer_name should be that of n_neuron + 1" + + def get_numb_fparam(self) -> int: + """Get the number of frame parameters.""" + return self.numb_fparam + + def get_numb_aparam(self) -> int: + """Get the number of atomic parameters.""" + return self.numb_fparam + + def compute_output_stats(self, all_stat: dict, mixed_type: bool = False) -> None: + """Compute the ouput statistics. 
+ + Parameters + ---------- + all_stat + must have the following components: + all_stat['energy'] of shape n_sys x n_batch x n_frame + can be prepared by model.make_stat_input + mixed_type + Whether to perform the mixed_type mode. + If True, the input data has the mixed_type format (see doc/model/train_se_atten.md), + in which frames in a system may have different natoms_vec(s), with the same nloc. + """ + self.bias_atom_e = self._compute_output_stats( + all_stat, rcond=self.rcond, mixed_type=mixed_type + ) + + def _compute_output_stats(self, all_stat, rcond=1e-3, mixed_type=False): + data = all_stat["energy"] + # data[sys_idx][batch_idx][frame_idx] + sys_ener = [] + for ss in range(len(data)): + sys_data = [] + for ii in range(len(data[ss])): + for jj in range(len(data[ss][ii])): + sys_data.append(data[ss][ii][jj]) + sys_data = np.concatenate(sys_data) + sys_ener.append(np.average(sys_data)) + sys_ener = np.array(sys_ener) + sys_tynatom = [] + if mixed_type: + data = all_stat["real_natoms_vec"] + nsys = len(data) + for ss in range(len(data)): + tmp_tynatom = [] + for ii in range(len(data[ss])): + for jj in range(len(data[ss][ii])): + tmp_tynatom.append(data[ss][ii][jj].astype(np.float64)) + tmp_tynatom = np.average(np.array(tmp_tynatom), axis=0) + sys_tynatom.append(tmp_tynatom) + else: + data = all_stat["natoms_vec"] + nsys = len(data) + for ss in range(len(data)): + sys_tynatom.append(data[ss][0].astype(np.float64)) + sys_tynatom = np.array(sys_tynatom) + sys_tynatom = np.reshape(sys_tynatom, [nsys, -1]) + sys_tynatom = sys_tynatom[:, 2:] + if len(self.atom_ener) > 0: + # Atomic energies stats are incorrect if atomic energies are assigned. + # In this situation, we directly use these assigned energies instead of computing stats. + # This will make the loss decrease quickly + assigned_atom_ener = np.array( + [ee for ee in self.atom_ener_v if ee is not None] + ) + assigned_ener_idx = [ + ii for ii, ee in enumerate(self.atom_ener_v) if ee is not None + ] + # np.dot out size: nframe + sys_ener -= np.dot(sys_tynatom[:, assigned_ener_idx], assigned_atom_ener) + sys_tynatom[:, assigned_ener_idx] = 0.0 + energy_shift, resd, rank, s_value = np.linalg.lstsq( + sys_tynatom, sys_ener, rcond=rcond + ) + if len(self.atom_ener) > 0: + for ii in assigned_ener_idx: + energy_shift[ii] = self.atom_ener_v[ii] + return energy_shift + + def compute_input_stats(self, all_stat: dict, protection: float = 1e-2) -> None: + """Compute the input statistics. 
+ + Parameters + ---------- + all_stat + if numb_fparam > 0 must have all_stat['fparam'] + if numb_aparam > 0 must have all_stat['aparam'] + can be prepared by model.make_stat_input + protection + Divided-by-zero protection + """ + # stat fparam + if self.numb_fparam > 0: + cat_data = np.concatenate(all_stat["fparam"], axis=0) + cat_data = np.reshape(cat_data, [-1, self.numb_fparam]) + self.fparam_avg = np.average(cat_data, axis=0) + self.fparam_std = np.std(cat_data, axis=0) + for ii in range(self.fparam_std.size): + if self.fparam_std[ii] < protection: + self.fparam_std[ii] = protection + self.fparam_inv_std = 1.0 / self.fparam_std + # stat aparam + if self.numb_aparam > 0: + sys_sumv = [] + sys_sumv2 = [] + sys_sumn = [] + for ss_ in all_stat["aparam"]: + ss = np.reshape(ss_, [-1, self.numb_aparam]) + sys_sumv.append(np.sum(ss, axis=0)) + sys_sumv2.append(np.sum(np.multiply(ss, ss), axis=0)) + sys_sumn.append(ss.shape[0]) + sumv = np.sum(sys_sumv, axis=0) + sumv2 = np.sum(sys_sumv2, axis=0) + sumn = np.sum(sys_sumn) + self.aparam_avg = (sumv) / sumn + self.aparam_std = self._compute_std(sumv2, sumv, sumn) + for ii in range(self.aparam_std.size): + if self.aparam_std[ii] < protection: + self.aparam_std[ii] = protection + self.aparam_inv_std = 1.0 / self.aparam_std + + def _compute_std(self, sumv2, sumv, sumn): + return np.sqrt(sumv2 / sumn - np.multiply(sumv / sumn, sumv / sumn)) + + @cast_precision + def _build_lower( + self, + start_index, + natoms, + inputs, + fparam=None, + aparam=None, + bias_atom_e=0.0, + type_suffix="", + suffix="", + reuse=None, + ): + # cut-out inputs + inputs_i = tf.slice(inputs, [0, start_index, 0], [-1, natoms, -1]) + inputs_i = tf.reshape(inputs_i, [-1, self.dim_descrpt]) + layer = inputs_i + if fparam is not None: + ext_fparam = tf.tile(fparam, [1, natoms]) + ext_fparam = tf.reshape(ext_fparam, [-1, self.numb_fparam]) + ext_fparam = tf.cast(ext_fparam, self.fitting_precision) + layer = tf.concat([layer, ext_fparam], axis=1) + if aparam is not None: + ext_aparam = tf.slice( + aparam, + [0, start_index * self.numb_aparam], + [-1, natoms * self.numb_aparam], + ) + ext_aparam = tf.reshape(ext_aparam, [-1, self.numb_aparam]) + ext_aparam = tf.cast(ext_aparam, self.fitting_precision) + layer = tf.concat([layer, ext_aparam], axis=1) + + if nvnmd_cfg.enable: + one_layer = one_layer_nvnmd + else: + one_layer = one_layer_deepmd + for ii in range(0, len(self.n_neuron)): + if self.layer_name is not None and self.layer_name[ii] is not None: + layer_suffix = "share_" + self.layer_name[ii] + type_suffix + layer_reuse = tf.AUTO_REUSE + else: + layer_suffix = "layer_" + str(ii) + type_suffix + suffix + layer_reuse = reuse + if ii >= 1 and self.n_neuron[ii] == self.n_neuron[ii - 1]: + layer += one_layer( + layer, + self.n_neuron[ii], + name=layer_suffix, + reuse=layer_reuse, + seed=self.seed, + use_timestep=self.resnet_dt, + activation_fn=self.fitting_activation_fn, + precision=self.fitting_precision, + trainable=self.trainable[ii], + uniform_seed=self.uniform_seed, + initial_variables=self.fitting_net_variables, + mixed_prec=self.mixed_prec, + ) + else: + layer = one_layer( + layer, + self.n_neuron[ii], + name=layer_suffix, + reuse=layer_reuse, + seed=self.seed, + activation_fn=self.fitting_activation_fn, + precision=self.fitting_precision, + trainable=self.trainable[ii], + uniform_seed=self.uniform_seed, + initial_variables=self.fitting_net_variables, + mixed_prec=self.mixed_prec, + ) + if (not self.uniform_seed) and (self.seed is not None): + self.seed += self.seed_shift 
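+        # NOTE: in the loop above, consecutive hidden layers of equal width
+        # are combined through a ResNet-style skip connection
+        # (`layer += one_layer(...)`, optionally scaled by a learnable
+        # timestep when `resnet_dt` is set); otherwise the new output
+        # simply replaces `layer`.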
+ if self.layer_name is not None and self.layer_name[-1] is not None: + layer_suffix = "share_" + self.layer_name[-1] + type_suffix + layer_reuse = tf.AUTO_REUSE + else: + layer_suffix = "final_layer" + type_suffix + suffix + layer_reuse = reuse + final_layer = one_layer( + layer, + 1, + activation_fn=None, + bavg=bias_atom_e, + name=layer_suffix, + reuse=layer_reuse, + seed=self.seed, + precision=self.fitting_precision, + trainable=self.trainable[-1], + uniform_seed=self.uniform_seed, + initial_variables=self.fitting_net_variables, + mixed_prec=self.mixed_prec, + final_layer=True, + ) + if (not self.uniform_seed) and (self.seed is not None): + self.seed += self.seed_shift + + return final_layer + + def build( + self, + inputs: tf.Tensor, + natoms: tf.Tensor, + input_dict: Optional[dict] = None, + reuse: Optional[bool] = None, + suffix: str = "", + ) -> tf.Tensor: + """Build the computational graph for fitting net. + + Parameters + ---------- + inputs + The input descriptor + input_dict + Additional dict for inputs. + if numb_fparam > 0, should have input_dict['fparam'] + if numb_aparam > 0, should have input_dict['aparam'] + natoms + The number of atoms. This tensor has the length of Ntypes + 2 + natoms[0]: number of local atoms + natoms[1]: total number of atoms held by this processor + natoms[i]: 2 <= i < Ntypes+2, number of type i atoms + reuse + The weights in the networks should be reused when get the variable. + suffix + Name suffix to identify this descriptor + + Returns + ------- + ener + The system energy + """ + if input_dict is None: + input_dict = {} + bias_atom_e = self.bias_atom_e + type_embedding = input_dict.get("type_embedding", None) + atype = input_dict.get("atype", None) + if self.numb_fparam > 0: + if self.fparam_avg is None: + self.fparam_avg = 0.0 + if self.fparam_inv_std is None: + self.fparam_inv_std = 1.0 + if self.numb_aparam > 0: + if self.aparam_avg is None: + self.aparam_avg = 0.0 + if self.aparam_inv_std is None: + self.aparam_inv_std = 1.0 + + ntypes_atom = self.ntypes - self.ntypes_spin + if self.spin is not None: + for type_i in range(ntypes_atom): + if self.bias_atom_e.shape[0] != self.ntypes: + self.bias_atom_e = np.pad( + self.bias_atom_e, + (0, self.ntypes_spin), + "constant", + constant_values=(0, 0), + ) + bias_atom_e = self.bias_atom_e + if self.spin.use_spin[type_i]: + self.bias_atom_e[type_i] = ( + self.bias_atom_e[type_i] + + self.bias_atom_e[type_i + ntypes_atom] + ) + else: + self.bias_atom_e[type_i] = self.bias_atom_e[type_i] + self.bias_atom_e = self.bias_atom_e[:ntypes_atom] + + with tf.variable_scope("fitting_attr" + suffix, reuse=reuse): + # t_dfparam = tf.constant(self.numb_fparam, name="dfparam", dtype=tf.int32) + # t_daparam = tf.constant(self.numb_aparam, name="daparam", dtype=tf.int32) + self.t_bias_atom_e = tf.get_variable( + "t_bias_atom_e", + self.bias_atom_e.shape, + dtype=GLOBAL_TF_FLOAT_PRECISION, + trainable=False, + initializer=tf.constant_initializer(self.bias_atom_e), + ) + if self.numb_fparam > 0: + t_fparam_avg = tf.get_variable( + "t_fparam_avg", + self.numb_fparam, + dtype=GLOBAL_TF_FLOAT_PRECISION, + trainable=False, + initializer=tf.constant_initializer(self.fparam_avg), + ) + t_fparam_istd = tf.get_variable( + "t_fparam_istd", + self.numb_fparam, + dtype=GLOBAL_TF_FLOAT_PRECISION, + trainable=False, + initializer=tf.constant_initializer(self.fparam_inv_std), + ) + if self.numb_aparam > 0: + t_aparam_avg = tf.get_variable( + "t_aparam_avg", + self.numb_aparam, + dtype=GLOBAL_TF_FLOAT_PRECISION, + trainable=False, + 
initializer=tf.constant_initializer(self.aparam_avg), + ) + t_aparam_istd = tf.get_variable( + "t_aparam_istd", + self.numb_aparam, + dtype=GLOBAL_TF_FLOAT_PRECISION, + trainable=False, + initializer=tf.constant_initializer(self.aparam_inv_std), + ) + + inputs = tf.reshape(inputs, [-1, natoms[0], self.dim_descrpt]) + if len(self.atom_ener): + # only for atom_ener + nframes = input_dict.get("nframes") + if nframes is not None: + # like inputs, but we don't want to add a dependency on inputs + inputs_zero = tf.zeros( + (nframes, natoms[0], self.dim_descrpt), + dtype=GLOBAL_TF_FLOAT_PRECISION, + ) + else: + inputs_zero = tf.zeros_like(inputs, dtype=GLOBAL_TF_FLOAT_PRECISION) + + if bias_atom_e is not None: + assert len(bias_atom_e) == self.ntypes + + fparam = None + if self.numb_fparam > 0: + fparam = input_dict["fparam"] + fparam = tf.reshape(fparam, [-1, self.numb_fparam]) + fparam = (fparam - t_fparam_avg) * t_fparam_istd + + aparam = None + if not self.use_aparam_as_mask: + if self.numb_aparam > 0: + aparam = input_dict["aparam"] + aparam = tf.reshape(aparam, [-1, self.numb_aparam]) + aparam = (aparam - t_aparam_avg) * t_aparam_istd + aparam = tf.reshape(aparam, [-1, self.numb_aparam * natoms[0]]) + + atype_nall = tf.reshape(atype, [-1, natoms[1]]) + self.atype_nloc = tf.slice( + atype_nall, [0, 0], [-1, natoms[0]] + ) ## lammps will make error + atype_filter = tf.cast(self.atype_nloc >= 0, GLOBAL_TF_FLOAT_PRECISION) + self.atype_nloc = tf.reshape(self.atype_nloc, [-1]) + # prevent embedding_lookup error, + # but the filter will be applied anyway + self.atype_nloc = tf.clip_by_value(self.atype_nloc, 0, self.ntypes - 1) + + ## if spin is used + if self.spin is not None: + self.atype_nloc = tf.slice( + atype_nall, [0, 0], [-1, tf.reduce_sum(natoms[2 : 2 + ntypes_atom])] + ) + atype_filter = tf.cast(self.atype_nloc >= 0, GLOBAL_TF_FLOAT_PRECISION) + self.atype_nloc = tf.reshape(self.atype_nloc, [-1]) + if ( + nvnmd_cfg.enable + and nvnmd_cfg.quantize_descriptor + and nvnmd_cfg.restore_descriptor + and (nvnmd_cfg.version == 1) + ): + type_embedding = nvnmd_cfg.map["t_ebd"] + if type_embedding is not None: + atype_embed = tf.nn.embedding_lookup(type_embedding, self.atype_nloc) + else: + atype_embed = None + + self.atype_embed = atype_embed + + if atype_embed is None: + start_index = 0 + outs_list = [] + for type_i in range(ntypes_atom): + final_layer = self._build_lower( + start_index, + natoms[2 + type_i], + inputs, + fparam, + aparam, + bias_atom_e=0.0, + type_suffix="_type_" + str(type_i), + suffix=suffix, + reuse=reuse, + ) + # concat the results + if type_i < len(self.atom_ener) and self.atom_ener[type_i] is not None: + zero_layer = self._build_lower( + start_index, + natoms[2 + type_i], + inputs_zero, + fparam, + aparam, + bias_atom_e=0.0, + type_suffix="_type_" + str(type_i), + suffix=suffix, + reuse=True, + ) + final_layer -= zero_layer + final_layer = tf.reshape( + final_layer, [tf.shape(inputs)[0], natoms[2 + type_i]] + ) + outs_list.append(final_layer) + start_index += natoms[2 + type_i] + # concat the results + # concat once may be faster than multiple concat + outs = tf.concat(outs_list, axis=1) + # with type embedding + else: + atype_embed = tf.cast(atype_embed, GLOBAL_TF_FLOAT_PRECISION) + type_shape = atype_embed.get_shape().as_list() + inputs = tf.concat( + [tf.reshape(inputs, [-1, self.dim_descrpt]), atype_embed], axis=1 + ) + original_dim_descrpt = self.dim_descrpt + self.dim_descrpt = self.dim_descrpt + type_shape[1] + inputs = tf.reshape(inputs, [-1, natoms[0], 
self.dim_descrpt]) + final_layer = self._build_lower( + 0, + natoms[0], + inputs, + fparam, + aparam, + bias_atom_e=0.0, + suffix=suffix, + reuse=reuse, + ) + if len(self.atom_ener): + # remove contribution in vacuum + inputs_zero = tf.concat( + [tf.reshape(inputs_zero, [-1, original_dim_descrpt]), atype_embed], + axis=1, + ) + inputs_zero = tf.reshape(inputs_zero, [-1, natoms[0], self.dim_descrpt]) + zero_layer = self._build_lower( + 0, + natoms[0], + inputs_zero, + fparam, + aparam, + bias_atom_e=0.0, + suffix=suffix, + reuse=True, + ) + # atomic energy will be stored in `self.t_bias_atom_e` which is not trainable + final_layer -= zero_layer + outs = tf.reshape(final_layer, [tf.shape(inputs)[0], natoms[0]]) + # add bias + self.atom_ener_before = outs * atype_filter + # atomic bias energy from data statistics + self.atom_bias_ener = tf.reshape( + tf.nn.embedding_lookup(self.t_bias_atom_e, self.atype_nloc), + [tf.shape(inputs)[0], tf.reduce_sum(natoms[2 : 2 + ntypes_atom])], + ) + outs = outs + self.atom_bias_ener + outs *= atype_filter + self.atom_bias_ener *= atype_filter + self.atom_ener_after = outs + + if self.tot_ener_zero: + force_tot_ener = 0.0 + outs = tf.reshape(outs, [-1, tf.reduce_sum(natoms[2 : 2 + ntypes_atom])]) + outs_mean = tf.reshape(tf.reduce_mean(outs, axis=1), [-1, 1]) + outs_mean = outs_mean - tf.ones_like( + outs_mean, dtype=GLOBAL_TF_FLOAT_PRECISION + ) * ( + force_tot_ener + / global_cvt_2_tf_float(tf.reduce_sum(natoms[2 : 2 + ntypes_atom])) + ) + outs = outs - outs_mean + outs = tf.reshape(outs, [-1]) + + tf.summary.histogram("fitting_net_output", outs) + return tf.reshape(outs, [-1]) + + def init_variables( + self, + graph: tf.Graph, + graph_def: tf.GraphDef, + suffix: str = "", + ) -> None: + """Init the fitting net variables with the given dict. + + Parameters + ---------- + graph : tf.Graph + The input frozen model graph + graph_def : tf.GraphDef + The input frozen model graph_def + suffix : str + suffix to name scope + """ + self.fitting_net_variables = get_fitting_net_variables_from_graph_def( + graph_def, suffix=suffix + ) + if self.layer_name is not None: + # shared variables have no suffix + shared_variables = get_fitting_net_variables_from_graph_def( + graph_def, suffix="" + ) + self.fitting_net_variables.update(shared_variables) + if self.numb_fparam > 0: + self.fparam_avg = get_tensor_by_name_from_graph( + graph, "fitting_attr%s/t_fparam_avg" % suffix + ) + self.fparam_inv_std = get_tensor_by_name_from_graph( + graph, "fitting_attr%s/t_fparam_istd" % suffix + ) + if self.numb_aparam > 0: + self.aparam_avg = get_tensor_by_name_from_graph( + graph, "fitting_attr%s/t_aparam_avg" % suffix + ) + self.aparam_inv_std = get_tensor_by_name_from_graph( + graph, "fitting_attr%s/t_aparam_istd" % suffix + ) + try: + self.bias_atom_e = get_tensor_by_name_from_graph( + graph, "fitting_attr%s/t_bias_atom_e" % suffix + ) + except GraphWithoutTensorError: + # for compatibility, old models has no t_bias_atom_e + pass + + def change_energy_bias( + self, + data, + frozen_model, + origin_type_map, + full_type_map, + bias_shift="delta", + ntest=10, + ) -> None: + """Change the energy bias according to the input data and the pretrained model. + + Parameters + ---------- + data : DeepmdDataSystem + The training data. + frozen_model : str + The path file of frozen model. + origin_type_map : list + The original type_map in dataset, they are targets to change the energy bias. 
+ full_type_map : str + The full type_map in pretrained model + bias_shift : str + The mode for changing energy bias : ['delta', 'statistic'] + 'delta' : perform predictions on energies of target dataset, + and do least sqaure on the errors to obtain the target shift as bias. + 'statistic' : directly use the statistic energy bias in the target dataset. + ntest : int + The number of test samples in a system to change the energy bias. + """ + type_numbs = [] + energy_ground_truth = [] + energy_predict = [] + sorter = np.argsort(full_type_map) + idx_type_map = sorter[ + np.searchsorted(full_type_map, origin_type_map, sorter=sorter) + ] + mixed_type = data.mixed_type + numb_type = len(full_type_map) + dp = None + if bias_shift == "delta": + # init model + dp = DeepPotential(frozen_model) + for sys in data.data_systems: + test_data = sys.get_test() + nframes = test_data["box"].shape[0] + numb_test = min(nframes, ntest) + if mixed_type: + atype = test_data["type"][:numb_test].reshape([numb_test, -1]) + else: + atype = test_data["type"][0] + assert np.array( + [i in idx_type_map for i in list(set(atype.reshape(-1)))] + ).all(), "Some types are not in 'type_map'!" + energy_ground_truth.append( + test_data["energy"][:numb_test].reshape([numb_test, 1]) + ) + if mixed_type: + type_numbs.append( + np.array( + [(atype == i).sum(axis=-1) for i in idx_type_map], + dtype=np.int32, + ).T + ) + else: + type_numbs.append( + np.tile( + np.bincount(atype, minlength=numb_type)[idx_type_map], + (numb_test, 1), + ) + ) + if bias_shift == "delta": + coord = test_data["coord"][:numb_test].reshape([numb_test, -1]) + if sys.pbc: + box = test_data["box"][:numb_test] + else: + box = None + ret = dp.eval(coord, box, atype, mixed_type=mixed_type) + energy_predict.append(ret[0].reshape([numb_test, 1])) + type_numbs = np.concatenate(type_numbs) + energy_ground_truth = np.concatenate(energy_ground_truth) + old_bias = self.bias_atom_e[idx_type_map] + if bias_shift == "delta": + energy_predict = np.concatenate(energy_predict) + bias_diff = energy_ground_truth - energy_predict + delta_bias = np.linalg.lstsq(type_numbs, bias_diff, rcond=None)[0] + unbias_e = energy_predict + type_numbs @ delta_bias + atom_numbs = type_numbs.sum(-1) + rmse_ae = ( + np.sqrt(np.square(unbias_e - energy_ground_truth)) / atom_numbs + ).mean() + self.bias_atom_e[idx_type_map] += delta_bias.reshape(-1) + log.info( + f"RMSE of atomic energy after linear regression is: {rmse_ae} eV/atom." + ) + elif bias_shift == "statistic": + statistic_bias = np.linalg.lstsq( + type_numbs, energy_ground_truth, rcond=None + )[0] + self.bias_atom_e[idx_type_map] = statistic_bias.reshape(-1) + else: + raise RuntimeError("Unknown bias_shift mode: " + bias_shift) + log.info( + "Change energy bias of {} from {} to {}.".format( + str(origin_type_map), str(old_bias), str(self.bias_atom_e[idx_type_map]) + ) + ) + + def enable_mixed_precision(self, mixed_prec: Optional[dict] = None) -> None: + """Reveive the mixed precision setting. + + Parameters + ---------- + mixed_prec + The mixed precision setting used in the embedding net + """ + self.mixed_prec = mixed_prec + self.fitting_precision = get_precision(mixed_prec["output_prec"]) + + def get_loss(self, loss: dict, lr) -> Loss: + """Get the loss function. + + Parameters + ---------- + loss : dict + The loss function parameters. + lr : LearningRateExp + The learning rate. + + Returns + ------- + Loss + The loss function. 
+ """ + _loss_type = loss.pop("type", "ener") + loss["starter_learning_rate"] = lr.start_lr() + if _loss_type == "ener": + return EnerStdLoss(**loss) + elif _loss_type == "ener_dipole": + return EnerDipoleLoss(**loss) + elif _loss_type == "ener_spin": + return EnerSpinLoss(**loss, use_spin=self.spin.use_spin) + else: + raise RuntimeError("unknown loss type") diff --git a/deepmd/infer/deep_eval.py b/deepmd/infer/deep_eval.py index 799cd6fd3b..6d41b91506 100644 --- a/deepmd/infer/deep_eval.py +++ b/deepmd/infer/deep_eval.py @@ -1,31 +1,28 @@ -from functools import ( - lru_cache, -) -from typing import ( - TYPE_CHECKING, - List, - Optional, - Union, -) - +from functools import lru_cache +from typing import TYPE_CHECKING +from typing import List +from typing import Optional +from typing import Union + +# from deepmd.descriptor.descriptor import ( +# Descriptor, +# ) import numpy as np -from deepmd.env import ( - MODEL_VERSION, - default_tf_session_config, - tf, -) -from deepmd.utils.batch_size import ( - AutoBatchSize, -) -from deepmd.utils.sess import ( - run_sess, -) +from deepmd.common import data_requirement +from deepmd.common import expand_sys_str +from deepmd.common import j_loader +from deepmd.common import j_must_have +from deepmd.env import MODEL_VERSION +from deepmd.env import default_tf_session_config +from deepmd.env import paddle +from deepmd.env import tf +from deepmd.model import EnerModel +from deepmd.utils.batch_size import AutoBatchSize +from deepmd.utils.sess import run_sess if TYPE_CHECKING: - from pathlib import ( - Path, - ) + from pathlib import Path class DeepEval: @@ -53,18 +50,71 @@ def __init__( default_tf_graph: bool = False, auto_batch_size: Union[bool, int, AutoBatchSize] = False, ): - self.graph = self._load_graph( - model_file, prefix=load_prefix, default_tf_graph=default_tf_graph + jdata = j_loader("input.json") + model_param = j_must_have(jdata, "model") + + descrpt_param = j_must_have(model_param, "descriptor") + from deepmd.descriptor import DescrptSeA + + descrpt_param.pop("type", None) + descrpt_param.pop("_comment", None) + self.spin = None + descrpt_param["spin"] = self.spin + self.descrpt = DescrptSeA(**descrpt_param) + + self.multi_task_mode = "fitting_net_dict" in model_param + fitting_param = ( + j_must_have(model_param, "fitting_net") + if not self.multi_task_mode + else j_must_have(model_param, "fitting_net_dict") + ) + from deepmd.fit import EnerFitting + + # fitting_param.pop("type", None) + fitting_param.pop("_comment", None) + fitting_param["descrpt"] = self.descrpt + self.fitting = EnerFitting(**fitting_param) + + self.typeebd = None + + self.model = EnerModel( + self.descrpt, + self.fitting, + self.typeebd, + model_param.get("type_map"), + model_param.get("data_stat_nbatch", 10), + model_param.get("data_stat_protect", 1e-2), + model_param.get("use_srtab"), + model_param.get("smin_alpha"), + model_param.get("sw_rmin"), + model_param.get("sw_rmax"), + self.spin, ) + load_state_dict = paddle.load(str(model_file)) + for k, v in load_state_dict.items(): + if k in self.model.state_dict(): + if load_state_dict[k].dtype != self.model.state_dict()[k].dtype: + # print(f"convert dtype from {load_state_dict[k].dtype} to {self.model.state_dict()[k].dtype}") + load_state_dict[k] = load_state_dict[k].astype( + self.model.state_dict()[k].dtype + ) + if list(load_state_dict[k].shape) != list( + self.model.state_dict()[k].shape + ): + # print(f"convert shape from {load_state_dict[k].shape} to {self.model.state_dict()[k].shape}") + load_state_dict[k] = 
load_state_dict[k].reshape( + self.model.state_dict()[k].shape + ) + self.model.set_state_dict(load_state_dict) self.load_prefix = load_prefix # graph_compatable should be called after graph and prefix are set - if not self._graph_compatable(): - raise RuntimeError( - f"model in graph (version {self.model_version}) is incompatible" - f"with the model (version {MODEL_VERSION}) supported by the current code." - "See https://deepmd.rtfd.io/compatability/ for details." - ) + # if not self._graph_compatable(): + # raise RuntimeError( + # f"model in graph (version {self.model_version}) is incompatible" + # f"with the model (version {MODEL_VERSION}) supported by the current code." + # "See https://deepmd.rtfd.io/compatability/ for details." + # ) # set default to False, as subclasses may not support if isinstance(auto_batch_size, bool): @@ -82,13 +132,15 @@ def __init__( @property @lru_cache(maxsize=None) def model_type(self) -> str: + return "ener" """Get type of model. :type:str """ - t_mt = self._get_tensor("model_attr/model_type:0") - [mt] = run_sess(self.sess, [t_mt], feed_dict={}) - return mt.decode("utf-8") + # t_mt = self._get_tensor("model_attr/model_type:0") + # [mt] = run_sess(self.sess, [t_mt], feed_dict={}) + # return mt.decode("utf-8") + self._model_type = self.model.t_mt @property @lru_cache(maxsize=None) @@ -100,6 +152,7 @@ def model_version(self) -> str: str version of model """ + return "0.1.0" try: t_mt = self._get_tensor("model_attr/model_version:0") except KeyError: @@ -117,6 +170,7 @@ def sess(self) -> tf.Session: return tf.Session(graph=self.graph, config=default_tf_session_config) def _graph_compatable(self) -> bool: + return True """Check the model compatability. Returns @@ -135,7 +189,7 @@ def _graph_compatable(self) -> bool: else: return True - def _get_tensor( + def _get_value( self, tensor_name: str, attr_name: Optional[str] = None ) -> tf.Tensor: """Get TF graph tensor and assign it to class namespace. 
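# ---------------------------------------------------------------------------
# [Editor's sketch, not part of the diff] The constructor above casts a loaded
# Paddle state dict to the model's dtypes and shapes before set_state_dict.
# A self-contained toy version; the Linear layer and checkpoint name are
# hypothetical stand-ins for the real model and file:
import paddle

model = paddle.nn.Linear(4, 2)
paddle.save(model.state_dict(), "toy.pdparams")  # pretend checkpoint
state = paddle.load("toy.pdparams")
for k, v in state.items():
    target = model.state_dict().get(k)
    if target is None:
        continue  # key absent from the model; skip it
    if v.dtype != target.dtype:
        v = v.astype(target.dtype)  # e.g. cast float32 -> float64
    if list(v.shape) != list(target.shape):
        v = v.reshape(target.shape)  # e.g. reshape [n] -> [1, n]
    state[k] = v
model.set_state_dict(state)
# ---------------------------------------------------------------------------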
@@ -154,8 +208,10 @@ def _get_tensor( loaded tensor """ # do not use os.path.join as it doesn't work on Windows - tensor_path = "/".join((self.load_prefix, tensor_name)) - tensor = self.graph.get_tensor_by_name(tensor_path) + value = None + for name, tensor in self.model.named_buffers(): + if tensor_name in name: + value = tensor.numpy()[0] if tensor.shape == [1] else tensor.numpy() if attr_name: setattr(self, attr_name, tensor) return tensor @@ -194,6 +250,16 @@ def _load_graph( name=prefix, producer_op_list=None, ) + # with tf.Session() as sess: + # constant_ops = [op for op in graph.get_operations() if op.type == "Const"] + # for constant_op in constant_ops: + # param = sess.run(constant_op.outputs[0]) + # # print(type(param)) + # if hasattr(param, 'shape'): + # # print(param.shape) + # if param.shape == (2,): + # print(constant_op.outputs[0], param) + # exit() return graph diff --git a/deepmd/infer/deep_pot.py b/deepmd/infer/deep_pot.py index 10fed52497..909a9d23ac 100644 --- a/deepmd/infer/deep_pot.py +++ b/deepmd/infer/deep_pot.py @@ -1,35 +1,22 @@ import logging -from typing import ( - TYPE_CHECKING, - Callable, - List, - Optional, - Tuple, - Union, -) +from typing import TYPE_CHECKING +from typing import Callable +from typing import List +from typing import Optional +from typing import Tuple +from typing import Union import numpy as np -from deepmd.common import ( - make_default_mesh, -) -from deepmd.infer.data_modifier import ( - DipoleChargeModifier, -) -from deepmd.infer.deep_eval import ( - DeepEval, -) -from deepmd.utils.batch_size import ( - AutoBatchSize, -) -from deepmd.utils.sess import ( - run_sess, -) +from deepmd.common import make_default_mesh +from deepmd.env import paddle +from deepmd.infer.data_modifier import DipoleChargeModifier +from deepmd.infer.deep_eval import DeepEval +from deepmd.utils.batch_size import AutoBatchSize +from deepmd.utils.sess import run_sess if TYPE_CHECKING: - from pathlib import ( - Path, - ) + from pathlib import Path log = logging.getLogger(__name__) @@ -81,125 +68,173 @@ def __init__( self.tensors = dict( { # descrpt attrs - "t_ntypes": "descrpt_attr/ntypes:0", - "t_rcut": "descrpt_attr/rcut:0", + "ntypes": "descrpt.ntypes", + "rcut": "descrpt.rcut", # fitting attrs - "t_dfparam": "fitting_attr/dfparam:0", - "t_daparam": "fitting_attr/daparam:0", - # model attrs - "t_tmap": "model_attr/tmap:0", - # inputs - "t_coord": "t_coord:0", - "t_type": "t_type:0", - "t_natoms": "t_natoms:0", - "t_box": "t_box:0", - "t_mesh": "t_mesh:0", - # add output tensors - "t_energy": "o_energy:0", - "t_force": "o_force:0", - "t_virial": "o_virial:0", - "t_ae": "o_atom_energy:0", - "t_av": "o_atom_virial:0", - "t_descriptor": "o_descriptor:0", + "dfparam": "fitting.t_dfparam", + "daparam": "fitting.t_daparam", + # # fitting attrs + # "t_dfparam": "fitting_attr/dfparam:0", + # "t_daparam": "fitting_attr/daparam:0", + # # model attrs + # "t_tmap": "model_attr/tmap:0", + # # inputs + # "t_coord": "t_coord:0", + # "t_type": "t_type:0", + # "t_natoms": "t_natoms:0", + # "t_box": "t_box:0", + # "t_mesh": "t_mesh:0", + # # add output tensors + # "t_energy": "o_energy:0", + # "t_force": "o_force:0", + # "t_virial": "o_virial:0", + # "t_ae": "o_atom_energy:0", + # "t_av": "o_atom_virial:0", + # "t_descriptor": "o_descriptor:0", }, ) DeepEval.__init__( self, model_file, load_prefix=load_prefix, - default_tf_graph=default_tf_graph, + # default_tf_graph=default_tf_graph, auto_batch_size=auto_batch_size, ) - # load optional tensors - operations = [op.name for op in 
self.graph.get_operations()] - # check if the graph has these operations: - # if yes add them - if "t_efield" in operations: - self._get_tensor("t_efield:0", "t_efield") - self.has_efield = True - else: - log.debug("Could not get tensor 't_efield:0'") - self.t_efield = None - self.has_efield = False - - if "load/t_fparam" in operations: - self.tensors.update({"t_fparam": "t_fparam:0"}) - self.has_fparam = True - else: - log.debug("Could not get tensor 't_fparam:0'") - self.t_fparam = None - self.has_fparam = False - - if "load/t_aparam" in operations: - self.tensors.update({"t_aparam": "t_aparam:0"}) - self.has_aparam = True - else: - log.debug("Could not get tensor 't_aparam:0'") - self.t_aparam = None - self.has_aparam = False - - if "load/spin_attr/ntypes_spin" in operations: - self.tensors.update({"t_ntypes_spin": "spin_attr/ntypes_spin:0"}) - self.has_spin = True - else: - self.ntypes_spin = 0 - self.has_spin = False + # # load optional tensors + # operations = [op.name for op in self.graph.get_operations()] + # # check if the graph has these operations: + # # if yes add them + # if "t_efield" in operations: + # # self._get_tensor("t_efield:0", "t_efield") + # if self._get_value("t_efield") is not None: + # self._get_value("t_efield", "t_efield") + # self.has_efield = True + # else: + # log.debug("Could not get tensor 't_efield'") + # self.t_efield = None + self.has_efield = False + + # if self._get_value("load/t_fparam") is not None: + # self.tensors.update({"t_fparam": "t_fparam"}) + # self.has_fparam = True + # else: + # log.debug("Could not get tensor 't_fparam'") + # self.t_fparam = None + self.has_fparam = False + + # if self._get_value("load/t_aparam") is not None: + # self.tensors.update({"t_aparam": "t_aparam"}) + # self.has_aparam = True + # else: + # log.debug("Could not get tensor 't_aparam'") + # self.t_aparam = None + self.has_aparam = False + + # if self._get_value("load/spin_attr/ntypes_spin") is not None: + # self.tensors.update({"t_ntypes_spin": "spin_attr/ntypes_spin"}) + # self.has_spin = True + # else: + self.ntypes_spin = 0 + self.has_spin = False # now load tensors to object attributes for attr_name, tensor_name in self.tensors.items(): try: - self._get_tensor(tensor_name, attr_name) + self._get_value(tensor_name, attr_name) except KeyError: if attr_name != "t_descriptor": raise - self._run_default_sess() - self.tmap = self.tmap.decode("UTF-8").split() + # self._run_default_sess() + # self.tmap = self.tmap.decode("UTF-8").split() + self.ntypes = 2 + self.rcut = 6.0 + self.dfparam = 0 + self.daparam = 0 + # self.t_tmap = self.model.t_tmap.split() + self.t_tmap = ["O", "H"] # setup modifier try: - t_modifier_type = self._get_tensor("modifier_attr/type:0") - self.modifier_type = run_sess(self.sess, t_modifier_type).decode("UTF-8") + # t_modifier_type = self._get_tensor("modifier_attr/type:0") + # self.modifier_type = run_sess(self.sess, t_modifier_type).decode("UTF-8") + self.modifier_type = self._get_value("modifier_attr.type") except (ValueError, KeyError): self.modifier_type = None - - try: - t_jdata = self._get_tensor("train_attr/training_script:0") - jdata = run_sess(self.sess, t_jdata).decode("UTF-8") - import json - - jdata = json.loads(jdata) - self.descriptor_type = jdata["model"]["descriptor"]["type"] - except (ValueError, KeyError): - self.descriptor_type = None - - if self.modifier_type == "dipole_charge": - t_mdl_name = self._get_tensor("modifier_attr/mdl_name:0") - t_mdl_charge_map = self._get_tensor("modifier_attr/mdl_charge_map:0") - 
t_sys_charge_map = self._get_tensor("modifier_attr/sys_charge_map:0")
-            t_ewald_h = self._get_tensor("modifier_attr/ewald_h:0")
-            t_ewald_beta = self._get_tensor("modifier_attr/ewald_beta:0")
-            [mdl_name, mdl_charge_map, sys_charge_map, ewald_h, ewald_beta] = run_sess(
-                self.sess,
-                [
-                    t_mdl_name,
-                    t_mdl_charge_map,
-                    t_sys_charge_map,
-                    t_ewald_h,
-                    t_ewald_beta,
-                ],
-            )
-            mdl_name = mdl_name.decode("UTF-8")
-            mdl_charge_map = [int(ii) for ii in mdl_charge_map.decode("UTF-8").split()]
-            sys_charge_map = [int(ii) for ii in sys_charge_map.decode("UTF-8").split()]
-            self.dm = DipoleChargeModifier(
-                mdl_name,
-                mdl_charge_map,
-                sys_charge_map,
-                ewald_h=ewald_h,
-                ewald_beta=ewald_beta,
+            self.modifier_type = None
+        self.descriptor_type = "se_e2_a"
+
+        # try:
+        #     t_jdata = self._get_tensor("train_attr/training_script")
+        #     jdata = run_sess(self.sess, t_jdata).decode("UTF-8")
+        #     import json
+
+        #     jdata = json.loads(jdata)
+        #     self.descriptor_type = jdata["model"]["descriptor"]["type"]
+        # except (ValueError, KeyError):
+        #     self.descriptor_type = None
+
+        # if self.modifier_type == "dipole_charge":
+        #     t_mdl_name = self._get_tensor("modifier_attr/mdl_name:0")
+        #     t_mdl_charge_map = self._get_tensor("modifier_attr/mdl_charge_map:0")
+        #     t_sys_charge_map = self._get_tensor("modifier_attr/sys_charge_map:0")
+        #     t_ewald_h = self._get_tensor("modifier_attr/ewald_h:0")
+        #     t_ewald_beta = self._get_tensor("modifier_attr/ewald_beta:0")
+        #     [mdl_name, mdl_charge_map, sys_charge_map, ewald_h, ewald_beta] = run_sess(
+        #         self.sess,
+        #         [
+        #             t_mdl_name,
+        #             t_mdl_charge_map,
+        #             t_sys_charge_map,
+        #             t_ewald_h,
+        #             t_ewald_beta,
+        #         ],
+        #     )
+        #     mdl_name = mdl_name.decode("UTF-8")
+        #     mdl_charge_map = [int(ii) for ii in mdl_charge_map.decode("UTF-8").split()]
+        #     sys_charge_map = [int(ii) for ii in sys_charge_map.decode("UTF-8").split()]
+        #     self.dm = DipoleChargeModifier(
+        #         mdl_name,
+        #         mdl_charge_map,
+        #         sys_charge_map,
+        #         ewald_h=ewald_h,
+        #         ewald_beta=ewald_beta,
+        #     )
+
+        # NOTE: run inference with the static-graph (jit-saved) model
+        if not hasattr(self, "st_model"):
+            self.st_model = paddle.jit.load(
+                "/workspace/hesensen/deepmd_backend/deepmd-kit/examples/water/se_e2_a/Model_1000000"
             )
+        # for k, v in self.st_model.named_parameters():
+        #     print(f"{k} {v.shape} {v.mean().item()} {v.var().item()}")
+        # """
+        # param_0 [1, 25] 0.9498768667019655 0.7340928425051493
+        # param_1 [1, 50] 1.1214760345730344 0.9621536430386503
+        # param_2 [1, 100] 1.168418946306086 1.0411743399117217
+        # param_3 [1, 25] 0.002546645920014433 0.27806176560439083
+        # param_4 [25, 50] -0.015372691466039676 0.10679961485782502
+        # param_5 [50, 100] -0.0010681208730640539 0.09950205346985407
+        # param_6 [1, 25] 1.0639599744616117 0.917256936729768
+        # param_7 [1, 50] 1.142691803888668 0.9639366693005659
+        # param_8 [1, 100] 1.1471394365452061 1.0091294911290036
+        # param_9 [1, 25] 0.019013792716200625 0.1450311660373793
+        # param_10 [25, 50] -0.006747145320748169 0.028971429954693633
+        # param_11 [50, 100] -0.03750622755877242 0.04714041793007081
+        # param_12 [1, 25] 1.0380588819220322 0.8904020425094114
+        # param_13 [1, 50] 1.1245407895732316 0.9234643810098301
+        # param_14 [1, 100] 1.1430567514092813 0.9876968977916372
+        # param_15 [1, 25] 0.03272738992064966 0.1751917732380509
+        # param_16 [25, 50] -0.017871745658352124 0.0384813911462805
+        # param_17 [50, 100] -0.07345191324160481 0.1768254187693918
+        # param_18 [1, 25] 1.0147830400771964 0.9070964180637516
+        # param_19 [1, 50] 1.1198266551333698 1.034746190888665
+        # param_20 [1, 100] 1.1410748813679754 1.0428001731414345
+        # param_21 [1, 25] -0.022862385119536602 0.18038150422614693
+        # param_22 [25, 50] -0.024970130750642985 0.07176423978220656
+        # param_23 [50, 100] -0.012309303874398866 0.07227932085917015
+        # """
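# ---------------------------------------------------------------------------
# [Editor's sketch, not part of the diff] The hard-coded paddle.jit.load above
# expects a model exported earlier with paddle.jit.save; the layer, path and
# InputSpec below are hypothetical stand-ins for EnerModel and its inputs:
import paddle

net = paddle.nn.Linear(3, 1)
paddle.jit.save(
    net,
    "./toy_model",
    input_spec=[paddle.static.InputSpec(shape=[None, 3], dtype="float32")],
)
st_model = paddle.jit.load("./toy_model")  # a callable TranslatedLayer
out = st_model(paddle.rand([2, 3]))  # outputs come back in export order
# ---------------------------------------------------------------------------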

     def _run_default_sess(self):
         if self.has_spin is True:
@@ -247,7 +282,7 @@ def get_rcut(self) -> float:
 
     def get_type_map(self) -> List[str]:
         """Get the type map (element name of the atom types) of this model."""
-        return self.tmap
+        return self.t_tmap
 
     def get_sel_type(self) -> List[int]:
         """Unsupported in this model."""
@@ -259,11 +294,11 @@ def get_descriptor_type(self) -> List[int]:
 
     def get_dim_fparam(self) -> int:
         """Get the number (dimension) of frame parameters of this DP."""
-        return self.dfparam
+        return self.model.fitting.numb_fparam
 
     def get_dim_aparam(self) -> int:
         """Get the number (dimension) of atomic parameters of this DP."""
-        return self.daparam
+        return self.model.fitting.numb_aparam
 
     def _eval_func(self, inner_func: Callable, numb_test: int, natoms: int) -> Callable:
         """Wrapper method with auto batch size.
@@ -369,7 +404,7 @@ def eval(
         # reshape coords before getting shape
         natoms, numb_test = self._get_natoms_and_nframes(
             coords, atom_types, mixed_type=mixed_type
-        )
+        )  # 192, 30
         output = self._eval_func(self._eval_inner, numb_test, natoms)(
             coords,
             cells,
@@ -381,7 +416,7 @@ def eval(
             mixed_type=mixed_type,
         )
 
-        if self.modifier_type is not None:
+        if self.modifier_type is not None:  # this branch is never taken here
             if atomic:
                 raise RuntimeError("modifier does not support atomic modification")
             me, mf, mv = self.dm.eval(coords, cells, atom_types)
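# ---------------------------------------------------------------------------
# [Editor's sketch, not part of the diff] Layout of natoms_vec as far as it
# can be inferred from make_natoms_vec and the natoms[2 : 2 + ntypes] slices
# used elsewhere in this diff: index 0 = nloc, index 1 = nall, and the
# remaining entries are per-type atom counts. Toy single-frame example:
import numpy as np

atom_types = np.array([0, 1, 1, 0, 1])
ntypes = 2
natoms_vec = np.empty(2 + ntypes, dtype=np.int32)
natoms_vec[0] = atom_types.size  # nloc: local atoms
natoms_vec[1] = atom_types.size  # nall: including ghosts (none here)
natoms_vec[2:] = np.bincount(atom_types, minlength=ntypes)
assert natoms_vec[0] == natoms_vec[2:].sum()  # mirrors the assert below
# ---------------------------------------------------------------------------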
@@ -460,46 +495,46 @@ def _prepare_feed_dict(
         )
 
         # sort inputs
-        coords, atom_types, imap = self.sort_input(
-            coords, atom_types, mixed_type=mixed_type
-        )
-        if self.has_efield:
-            efield = np.reshape(efield, [nframes, natoms, 3])
-            efield = efield[:, imap, :]
-            efield = np.reshape(efield, [nframes, natoms * 3])
+        # coords, atom_types, imap = self.sort_input(
+        #     coords, atom_types, mixed_type=mixed_type
+        # )
+        # if self.has_efield:
+        #     efield = np.reshape(efield, [nframes, natoms, 3])
+        #     efield = efield[:, imap, :]
+        #     efield = np.reshape(efield, [nframes, natoms * 3])
 
         # make natoms_vec and default_mesh
         natoms_vec = self.make_natoms_vec(atom_types, mixed_type=mixed_type)
         assert natoms_vec[0] == natoms
 
         # evaluate
-        feed_dict_test = {}
-        feed_dict_test[self.t_natoms] = natoms_vec
-        if mixed_type:
-            feed_dict_test[self.t_type] = atom_types.reshape([-1])
-        else:
-            feed_dict_test[self.t_type] = np.tile(atom_types, [nframes, 1]).reshape(
-                [-1]
-            )
-        feed_dict_test[self.t_coord] = np.reshape(coords, [-1])
-
-        if len(self.t_box.shape) == 1:
-            feed_dict_test[self.t_box] = np.reshape(cells, [-1])
-        elif len(self.t_box.shape) == 2:
-            feed_dict_test[self.t_box] = cells
-        else:
-            raise RuntimeError
-        if self.has_efield:
-            feed_dict_test[self.t_efield] = np.reshape(efield, [-1])
-        if pbc:
-            feed_dict_test[self.t_mesh] = make_default_mesh(cells)
-        else:
-            feed_dict_test[self.t_mesh] = np.array([], dtype=np.int32)
-        if self.has_fparam:
-            feed_dict_test[self.t_fparam] = np.reshape(fparam, [-1])
-        if self.has_aparam:
-            feed_dict_test[self.t_aparam] = np.reshape(aparam, [-1])
-        return feed_dict_test, imap, natoms_vec
+        # feed_dict_test = {}
+        # feed_dict_test[self.t_natoms] = natoms_vec
+        # if mixed_type:
+        #     feed_dict_test[self.t_type] = atom_types.reshape([-1])
+        # else:
+        #     feed_dict_test[self.t_type] = np.tile(atom_types, [nframes, 1]).reshape(
+        #         [-1]
+        #     )
+        # feed_dict_test[self.t_coord] = np.reshape(coords, [-1])
+
+        # if len(self.t_box.shape) == 1:
+        #     feed_dict_test[self.t_box] = np.reshape(cells, [-1])
+        # elif len(self.t_box.shape) == 2:
+        #     feed_dict_test[self.t_box] = cells
+        # else:
+        #     raise RuntimeError
+        # if self.has_efield:
+        #     feed_dict_test[self.t_efield] = np.reshape(efield, [-1])
+        # if pbc:
+        #     feed_dict_test[self.t_mesh] = make_default_mesh(cells)
+        # else:
+        #     feed_dict_test[self.t_mesh] = np.array([], dtype=np.int32)
+        # if self.has_fparam:
+        #     feed_dict_test[self.t_fparam] = np.reshape(fparam, [-1])
+        # if self.has_aparam:
+        #     feed_dict_test[self.t_aparam] = np.reshape(aparam, [-1])
+        return None, None, natoms_vec
 
     def _eval_inner(
         self,
@@ -519,41 +554,130 @@ def _eval_inner(
             coords, cells, atom_types, fparam, aparam, efield, mixed_type=mixed_type
         )
 
-        t_out = [self.t_energy, self.t_force, self.t_virial]
-        if atomic:
-            t_out += [self.t_ae, self.t_av]
-
-        v_out = run_sess(self.sess, t_out, feed_dict=feed_dict_test)
-        energy = v_out[0]
-        force = v_out[1]
-        virial = v_out[2]
-        if atomic:
-            ae = v_out[3]
-            av = v_out[4]
+        # t_out = [self.t_energy, self.t_force, self.t_virial]
+        # if atomic:
+        #     t_out += [self.t_ae, self.t_av]
+
+        # v_out = run_sess(self.sess, t_out, feed_dict=feed_dict_test)
+        # energy = v_out[0]
+        # force = v_out[1]
+        # virial = v_out[2]
+        # if atomic:
+        #     ae = v_out[3]
+        #     av = v_out[4]
+
+        # if self.has_spin:
+        #     ntypes_real = self.ntypes - self.ntypes_spin
+        #     natoms_real = sum(
+        #         [
+        #             np.count_nonzero(np.array(atom_types) == ii)
+        #             for ii in range(ntypes_real)
+        #         ]
+        #     )
+        # else:
+        #     natoms_real = natoms
+
+        # # reverse map of the outputs
+        # force = self.reverse_map(np.reshape(force, [nframes, -1, 3]), imap)
+        # if atomic:
+        #     ae = self.reverse_map(np.reshape(ae, [nframes, -1, 1]), imap[:natoms_real])
+        #     av = self.reverse_map(np.reshape(av, [nframes, -1, 9]), imap)
+
+        # energy = np.reshape(energy, [nframes, 1])
+        # force = np.reshape(force, [nframes, natoms, 3])
+        # virial = np.reshape(virial, [nframes, 9])
+        # if atomic:
+        #     ae = np.reshape(ae, [nframes, natoms_real, 1])
+        #     av = np.reshape(av, [nframes, natoms, 9])
+        #     return energy, force, virial, ae, av
+        # else:
+        #     atom_types = np.array(atom_types, dtype=int).reshape([-1])
+        #     natoms = atom_types.size
+        #     coords = np.reshape(np.array(coords), [-1, natoms * 3])
+        #     nframes = coords.shape[0]
+
+        eval_inputs = {}
+        eval_inputs["coord"] = paddle.to_tensor(
+            np.reshape(coords, [-1]), dtype="float64"
+        )
+        eval_inputs["type"] = paddle.to_tensor(
+            np.tile(atom_types, [nframes, 1]).reshape([-1]), dtype="int32"
+        )
+        eval_inputs["natoms_vec"] = paddle.to_tensor(
+            natoms_vec, dtype="int32", place="cpu"
+        )
+        eval_inputs["box"] = paddle.to_tensor(np.reshape(cells, [-1]), dtype="float64")
+        # print(eval_inputs['coord'].shape)  # [2880]
+        # print(eval_inputs['type'].shape)  # [960]
+        # print(eval_inputs['natoms_vec'].shape)  # [4]
+        # print(eval_inputs['box'].shape)  # [45]
+        # exit()

-        if self.has_spin:
-            ntypes_real = self.ntypes - self.ntypes_spin
-            natoms_real = sum(
-                [
-                    np.count_nonzero(np.array(atom_types) == ii)
-                    for ii in range(ntypes_real)
-                ]
+        if self.has_fparam:
+            eval_inputs["fparam"] = paddle.to_tensor(
+                np.reshape(fparam, [-1]), dtype="float64"
             )
+        if self.has_aparam:
+            eval_inputs["aparam"] = paddle.to_tensor(
+                np.reshape(aparam, [-1]), dtype="float64"
+            )
+        # if se.pbc:
+        eval_inputs["default_mesh"] = paddle.to_tensor(
+            make_default_mesh(cells), dtype="int32"
+        )
+        # else:
+        #     eval_inputs['default_mesh'] = paddle.to_tensor(np.array([], dtype=np.int32))
+
+        if hasattr(self, "st_model"):
+            eval_outputs = self.st_model(
eval_inputs["coord"], # [2880] paddle.float64 + eval_inputs["type"], # [960] paddle.int32 + eval_inputs["natoms_vec"], # [4] paddle.int32 + eval_inputs["box"], # [45] paddle.float64 + eval_inputs["default_mesh"], # [6] paddle.int32 + ) + eval_outputs = { + "atom_ener": eval_outputs[0], + "atom_virial": eval_outputs[1], + "atype": eval_outputs[2], + "coord": eval_outputs[3], + "energy": eval_outputs[4], + "force": eval_outputs[5], + "virial": eval_outputs[6], + # "z00_hidden1": eval_outputs[7], + # "z00_hidden2": eval_outputs[8], + # "z00_hidden3": eval_outputs[9], + # "z00_xx1": eval_outputs[7], + # "z00_xx2": eval_outputs[8], + # "z00_xx3": eval_outputs[9], + # "z00_xx4": eval_outputs[10], + # "weight_0": eval_outputs[7], + # "bias_0": eval_outputs[8], + # "xx1": eval_outputs[9], + # "hidden1": eval_outputs[10], + } else: - natoms_real = natoms - - # reverse map of the outputs - force = self.reverse_map(np.reshape(force, [nframes, -1, 3]), imap) - if atomic: - ae = self.reverse_map(np.reshape(ae, [nframes, -1, 1]), imap[:natoms_real]) - av = self.reverse_map(np.reshape(av, [nframes, -1, 9]), imap) - - energy = np.reshape(energy, [nframes, 1]) - force = np.reshape(force, [nframes, natoms, 3]) - virial = np.reshape(virial, [nframes, 9]) + eval_outputs = self.model( + eval_inputs["coord"], # [2880] paddle.float64 + eval_inputs["type"], # [960] paddle.int32 + eval_inputs["natoms_vec"], # [4] paddle.int32 + eval_inputs["box"], # [45] paddle.float64 + eval_inputs["default_mesh"], # [6] paddle.int32 + eval_inputs, + # eval_inputs.coord: [2880] paddle.float64 + # eval_inputs.type: [960] paddle.int32 + # eval_inputs.natoms_vec: [4] paddle.int32 + # eval_inputs.box: [45] paddle.float64 + # eval_inputs.default_mesh: [6] paddle.int32 + suffix="", + reuse=False, + ) + energy = eval_outputs["energy"].numpy() + force = eval_outputs["force"].numpy() + virial = eval_outputs["virial"].numpy() if atomic: - ae = np.reshape(ae, [nframes, natoms_real, 1]) - av = np.reshape(av, [nframes, natoms, 9]) + ae = eval_outputs["atom_ener"].numpy() + av = eval_outputs["atom_virial"].numpy() return energy, force, virial, ae, av else: return energy, force, virial diff --git a/deepmd/loss/ener.py b/deepmd/loss/ener.py index 07c97b09bc..e1e7d89626 100644 --- a/deepmd/loss/ener.py +++ b/deepmd/loss/ener.py @@ -1,24 +1,16 @@ -from typing import ( - Optional, -) +from typing import Optional import numpy as np -from deepmd.common import ( - add_data_requirement, -) -from deepmd.env import ( - global_cvt_2_ener_float, - global_cvt_2_tf_float, - tf, -) -from deepmd.utils.sess import ( - run_sess, -) +from deepmd.common import add_data_requirement +from deepmd.env import global_cvt_2_ener_float +from deepmd.env import global_cvt_2_pd_float +from deepmd.env import global_cvt_2_tf_float +from deepmd.env import paddle +from deepmd.env import tf +from deepmd.utils.sess import run_sess -from .loss import ( - Loss, -) +from .loss import Loss class EnerStdLoss(Loss): @@ -82,11 +74,12 @@ def __init__( default=1.0, ) - def build(self, learning_rate, natoms, model_dict, label_dict, suffix): + def compute_loss(self, learning_rate, natoms, model_dict, label_dict, suffix): energy = model_dict["energy"] force = model_dict["force"] virial = model_dict["virial"] atom_ener = model_dict["atom_ener"] + energy_hat = label_dict["energy"] force_hat = label_dict["force"] virial_hat = label_dict["virial"] @@ -108,152 +101,187 @@ def build(self, learning_rate, natoms, model_dict, label_dict, suffix): # E = - E(A) - E(B) + E(C) + E(D) # A, B, C, D could be 
put far away from each other atom_ener_coeff = label_dict["atom_ener_coeff"] - atom_ener_coeff = tf.reshape(atom_ener_coeff, tf.shape(atom_ener)) - energy = tf.reduce_sum(atom_ener_coeff * atom_ener, 1) + atom_ener_coeff = paddle.reshape(atom_ener_coeff, paddle.shape(atom_ener)) + energy = paddle.sum(atom_ener_coeff * atom_ener, axis=1) if self.has_e: - l2_ener_loss = tf.reduce_mean( - tf.square(energy - energy_hat), name="l2_" + suffix + l2_ener_loss = paddle.mean( + paddle.square(energy - energy_hat), name="l2_" + suffix ) if self.has_f or self.has_pf or self.relative_f: - force_reshape = tf.reshape(force, [-1]) - force_hat_reshape = tf.reshape(force_hat, [-1]) + force_reshape = paddle.reshape(force, [-1]) + force_hat_reshape = paddle.reshape(force_hat, [-1]) diff_f = force_hat_reshape - force_reshape if self.relative_f is not None: - force_hat_3 = tf.reshape(force_hat, [-1, 3]) - norm_f = tf.reshape(tf.norm(force_hat_3, axis=1), [-1, 1]) + self.relative_f - diff_f_3 = tf.reshape(diff_f, [-1, 3]) + force_hat_3 = paddle.reshape(force_hat, [-1, 3]) + norm_f = ( + paddle.reshape(paddle.linalg.norm(force_hat_3, axis=1), [-1, 1]) + + self.relative_f + ) + diff_f_3 = paddle.reshape(diff_f, [-1, 3]) diff_f_3 = diff_f_3 / norm_f - diff_f = tf.reshape(diff_f_3, [-1]) + diff_f = paddle.reshape(diff_f_3, [-1]) if self.has_f: - l2_force_loss = tf.reduce_mean(tf.square(diff_f), name="l2_force_" + suffix) + l2_force_loss = paddle.mean( + paddle.square(diff_f), name="l2_force_" + suffix + ) if self.has_pf: - atom_pref_reshape = tf.reshape(atom_pref, [-1]) - l2_pref_force_loss = tf.reduce_mean( - tf.multiply(tf.square(diff_f), atom_pref_reshape), + atom_pref_reshape = paddle.reshape(atom_pref, [-1]) + l2_pref_force_loss = paddle.mean( + paddle.multiply(paddle.square(diff_f), atom_pref_reshape), name="l2_pref_force_" + suffix, ) if self.has_v: - virial_reshape = tf.reshape(virial, [-1]) - virial_hat_reshape = tf.reshape(virial_hat, [-1]) - l2_virial_loss = tf.reduce_mean( - tf.square(virial_hat_reshape - virial_reshape), + virial_reshape = paddle.reshape(virial, [-1]) + virial_hat_reshape = paddle.reshape(virial_hat, [-1]) + l2_virial_loss = paddle.mean( + paddle.square(virial_hat_reshape - virial_reshape), name="l2_virial_" + suffix, ) if self.has_ae: - atom_ener_reshape = tf.reshape(atom_ener, [-1]) - atom_ener_hat_reshape = tf.reshape(atom_ener_hat, [-1]) - l2_atom_ener_loss = tf.reduce_mean( - tf.square(atom_ener_hat_reshape - atom_ener_reshape), + atom_ener_reshape = paddle.reshape(atom_ener, [-1]) + atom_ener_hat_reshape = paddle.reshape(atom_ener_hat, [-1]) + l2_atom_ener_loss = paddle.mean( + paddle.square(atom_ener_hat_reshape - atom_ener_reshape), name="l2_atom_ener_" + suffix, ) - atom_norm = 1.0 / global_cvt_2_tf_float(natoms[0]) - atom_norm_ener = 1.0 / global_cvt_2_ener_float(natoms[0]) - pref_e = global_cvt_2_ener_float( - find_energy - * ( - self.limit_pref_e - + (self.start_pref_e - self.limit_pref_e) - * learning_rate - / self.starter_learning_rate - ) - ) - pref_f = global_cvt_2_tf_float( - find_force - * ( - self.limit_pref_f - + (self.start_pref_f - self.limit_pref_f) - * learning_rate - / self.starter_learning_rate - ) - ) - pref_v = global_cvt_2_tf_float( - find_virial - * ( - self.limit_pref_v - + (self.start_pref_v - self.limit_pref_v) - * learning_rate - / self.starter_learning_rate - ) - ) - pref_ae = global_cvt_2_tf_float( - find_atom_ener - * ( - self.limit_pref_ae - + (self.start_pref_ae - self.limit_pref_ae) - * learning_rate - / self.starter_learning_rate - ) - ) - pref_pf 
= global_cvt_2_tf_float( - find_atom_pref - * ( - self.limit_pref_pf - + (self.start_pref_pf - self.limit_pref_pf) - * learning_rate - / self.starter_learning_rate - ) + atom_norm = 1.0 / (natoms[0]) + atom_norm_ener = 1.0 / (natoms[0]) + pref_e = find_energy * ( + self.limit_pref_e + + (self.start_pref_e - self.limit_pref_e) + * learning_rate + / self.starter_learning_rate + ) + pref_f = find_force * ( + self.limit_pref_f + + (self.start_pref_f - self.limit_pref_f) + * learning_rate + / self.starter_learning_rate + ) + pref_v = find_virial * ( + self.limit_pref_v + + (self.start_pref_v - self.limit_pref_v) + * learning_rate + / self.starter_learning_rate + ) + pref_ae = find_atom_ener * ( + self.limit_pref_ae + + (self.start_pref_ae - self.limit_pref_ae) + * learning_rate + / self.starter_learning_rate + ) + pref_pf = find_atom_pref * ( + self.limit_pref_pf + + (self.start_pref_pf - self.limit_pref_pf) + * learning_rate + / self.starter_learning_rate ) l2_loss = 0 more_loss = {} - if self.has_e: + # print(self.has_e) + # print(self.has_f) + # print(self.has_v) + # print(self.has_ae) + # print(self.has_pf) + if self.has_e: # true l2_loss += atom_norm_ener * (pref_e * l2_ener_loss) more_loss["l2_ener_loss"] = l2_ener_loss - if self.has_f: - l2_loss += global_cvt_2_ener_float(pref_f * l2_force_loss) + if self.has_f: # true + l2_loss += pref_f * l2_force_loss more_loss["l2_force_loss"] = l2_force_loss - if self.has_v: - l2_loss += global_cvt_2_ener_float(atom_norm * (pref_v * l2_virial_loss)) + if self.has_v: # false + l2_loss += atom_norm * (pref_v * l2_virial_loss) more_loss["l2_virial_loss"] = l2_virial_loss - if self.has_ae: - l2_loss += global_cvt_2_ener_float(pref_ae * l2_atom_ener_loss) + if self.has_ae: # false + l2_loss += pref_ae * l2_atom_ener_loss more_loss["l2_atom_ener_loss"] = l2_atom_ener_loss - if self.has_pf: - l2_loss += global_cvt_2_ener_float(pref_pf * l2_pref_force_loss) + if self.has_pf: # false + l2_loss += pref_pf * l2_pref_force_loss more_loss["l2_pref_force_loss"] = l2_pref_force_loss # only used when tensorboard was set as true - self.l2_loss_summary = tf.summary.scalar("l2_loss_" + suffix, tf.sqrt(l2_loss)) - if self.has_e: - self.l2_loss_ener_summary = tf.summary.scalar( - "l2_ener_loss_" + suffix, - global_cvt_2_tf_float(tf.sqrt(l2_ener_loss)) - / global_cvt_2_tf_float(natoms[0]), - ) - if self.has_f: - self.l2_loss_force_summary = tf.summary.scalar( - "l2_force_loss_" + suffix, tf.sqrt(l2_force_loss) - ) - if self.has_v: - self.l2_loss_virial_summary = tf.summary.scalar( - "l2_virial_loss_" + suffix, - tf.sqrt(l2_virial_loss) / global_cvt_2_tf_float(natoms[0]), - ) + # self.l2_loss_summary = paddle.summary.scalar("l2_loss_" + suffix, paddle.sqrt(l2_loss)) + # if self.has_e: + # self.l2_loss_ener_summary = paddle.summary.scalar( + # "l2_ener_loss_" + suffix, + # global_cvt_2_tf_float(paddle.sqrt(l2_ener_loss)) + # / global_cvt_2_tf_float(natoms[0]), + # ) + # if self.has_f: + # self.l2_loss_force_summary = paddle.summary.scalar( + # "l2_force_loss_" + suffix, paddle.sqrt(l2_force_loss) + # ) + # if self.has_v: + # self.l2_loss_virial_summary = paddle.summary.scalar( + # "l2_virial_loss_" + suffix, + # paddle.sqrt(l2_virial_loss) / global_cvt_2_tf_float(natoms[0]), + # ) self.l2_l = l2_loss self.l2_more = more_loss return l2_loss, more_loss - def eval(self, sess, feed_dict, natoms): - placeholder = self.l2_l + def eval(self, model, batch_data, natoms): + # placeholder = self.l2_l + + model_inputs = {} + for kk in batch_data.keys(): + if kk == "find_type" or kk == 
"type": + continue + prec = "float64" + if "find_" in kk: + model_inputs[kk] = paddle.to_tensor(batch_data[kk], dtype="float64") + else: + model_inputs[kk] = paddle.to_tensor( + np.reshape(batch_data[kk], [-1]), dtype=prec + ) + + for ii in ["type"]: + model_inputs[ii] = paddle.to_tensor( + np.reshape(batch_data[ii], [-1]), dtype="int32" + ) + for ii in ["natoms_vec", "default_mesh"]: + model_inputs[ii] = paddle.to_tensor(batch_data[ii], dtype="int32") + model_inputs["is_training"] = paddle.to_tensor(False) + model_inputs["natoms_vec"] = paddle.to_tensor( + model_inputs["natoms_vec"], place="cpu" + ) + + model_pred = model( + model_inputs["coord"], + model_inputs["type"], + model_inputs["natoms_vec"], + model_inputs["box"], + model_inputs["default_mesh"], + model_inputs, + suffix="", + reuse=False, + ) + l2_l, l2_more = self.compute_loss( + # 0.0, natoms, model_dict, batch_data + 0.0, + model_inputs["natoms_vec"], + model_pred, + model_inputs, + suffix="test", + ) run_data = [ - self.l2_l, - self.l2_more["l2_ener_loss"] if self.has_e else placeholder, - self.l2_more["l2_force_loss"] if self.has_f else placeholder, - self.l2_more["l2_virial_loss"] if self.has_v else placeholder, - self.l2_more["l2_atom_ener_loss"] if self.has_ae else placeholder, - self.l2_more["l2_pref_force_loss"] if self.has_pf else placeholder, + (float(l2_l)), + (float(l2_more["l2_ener_loss"]) if self.has_e else 0.0), + (float(l2_more["l2_force_loss"]) if self.has_f else 0.0), + (float(l2_more["l2_virial_loss"]) if self.has_v else 0.0), + (float(l2_more["l2_atom_ener_loss"]) if self.has_ae else 0.0), + (float(l2_more["l2_pref_force_loss"]) if self.has_pf else 0.0), ] - error, error_e, error_f, error_v, error_ae, error_pf = run_sess( - sess, run_data, feed_dict=feed_dict - ) + error, error_e, error_f, error_v, error_ae, error_pf = run_data results = {"natoms": natoms[0], "rmse": np.sqrt(error)} if self.has_e: results["rmse_e"] = np.sqrt(error_e) / natoms[0] diff --git a/deepmd/loss/loss.py b/deepmd/loss/loss.py index f666445e6e..7a9d55e106 100644 --- a/deepmd/loss/loss.py +++ b/deepmd/loss/loss.py @@ -1,21 +1,15 @@ -from abc import ( - ABCMeta, - abstractmethod, -) -from typing import ( - Dict, - Tuple, -) +from abc import ABCMeta +from abc import abstractmethod +from typing import Dict +from typing import Tuple -from deepmd.env import ( - tf, -) +from deepmd.env import tf class Loss(metaclass=ABCMeta): """The abstract class for the loss function.""" - @abstractmethod + # @abstractmethod def build( self, learning_rate: tf.Tensor, @@ -46,6 +40,7 @@ def build( dict[str, tf.Tensor] A dictionary that maps loss keys to more loss tensors """ + pass @abstractmethod def eval( diff --git a/deepmd/model/ener.py b/deepmd/model/ener.py index f9387c67fc..0c4e890e01 100644 --- a/deepmd/model/ener.py +++ b/deepmd/model/ener.py @@ -1,33 +1,22 @@ -from typing import ( - List, - Optional, -) +from typing import List +from typing import Optional import numpy as np -from deepmd.env import ( - MODEL_VERSION, - global_cvt_2_ener_float, - op_module, - tf, -) -from deepmd.utils.pair_tab import ( - PairTab, -) -from deepmd.utils.spin import ( - Spin, -) - -from .model import ( - Model, -) -from .model_stat import ( - make_stat_input, - merge_sys_stat, -) - - -class EnerModel(Model): +from deepmd.env import MODEL_VERSION +from deepmd.env import global_cvt_2_ener_float +from deepmd.env import op_module +from deepmd.env import paddle +from deepmd.env import tf +from deepmd.utils.pair_tab import PairTab +from deepmd.utils.spin import Spin + +from 
.model import Model
+from .model_stat import make_stat_input
+from .model_stat import merge_sys_stat
+
+
+class EnerModel(Model, paddle.nn.Layer):
     """Energy model.
 
     Parameters
@@ -69,6 +58,8 @@ def __init__(
         sw_rmax: Optional[float] = None,
         spin: Optional[Spin] = None,
     ) -> None:
+        super().__init__()
+        # super(EnerModel, self).__init__(name_scope="EnerModel")
         """Constructor."""
         # descriptor
         self.descrpt = descrpt
@@ -97,6 +88,11 @@ def __init__(
         else:
             self.srtab = None
 
+        # self.type_map = " ".join(self.type_map)
+        self.t_tmap = " ".join(self.type_map)
+        self.t_mt = self.model_type
+        self.t_ver = MODEL_VERSION
+
     def get_rcut(self):
         return self.rcut
 
@@ -144,7 +140,7 @@ def _compute_output_stat(self, all_stat, mixed_type=False):
         else:
             self.fitting.compute_output_stats(all_stat)
 
-    def build(
+    def forward(
         self,
         coord_,
         atype_,
@@ -157,172 +153,199 @@ def build(
         suffix="",
         reuse=None,
     ):
+        # print(__file__, coord_.shape)
+        # print(__file__, atype_.shape)
+        # print(__file__, natoms.shape)
+        # print(__file__, box.shape)
+        # print(__file__, mesh.shape)
+        # for k, v in input_dict.items():
+        #     print(f"{__file__} {k} {v.shape}")
+
         if input_dict is None:
             input_dict = {}
-        with tf.variable_scope("model_attr" + suffix, reuse=reuse):
-            t_tmap = tf.constant(" ".join(self.type_map), name="tmap", dtype=tf.string)
-            t_mt = tf.constant(self.model_type, name="model_type", dtype=tf.string)
-            t_ver = tf.constant(MODEL_VERSION, name="model_version", dtype=tf.string)
-
-        if self.srtab is not None:
-            tab_info, tab_data = self.srtab.get()
-            self.tab_info = tf.get_variable(
-                "t_tab_info",
-                tab_info.shape,
-                dtype=tf.float64,
-                trainable=False,
-                initializer=tf.constant_initializer(tab_info, dtype=tf.float64),
-            )
-            self.tab_data = tf.get_variable(
-                "t_tab_data",
-                tab_data.shape,
-                dtype=tf.float64,
-                trainable=False,
-                initializer=tf.constant_initializer(tab_data, dtype=tf.float64),
-            )
-
-        coord = tf.reshape(coord_, [-1, natoms[1] * 3])
-        atype = tf.reshape(atype_, [-1, natoms[1]])
-        input_dict["nframes"] = tf.shape(coord)[0]
+        # if self.srtab is not None:
+        #     tab_info, tab_data = self.srtab.get()
+        #     self.tab_info = tf.get_variable(
+        #         "t_tab_info",
+        #         tab_info.shape,
+        #         dtype=tf.float64,
+        #         trainable=False,
+        #         initializer=tf.constant_initializer(tab_info, dtype=tf.float64),
+        #     )
+        #     self.tab_data = tf.get_variable(
+        #         "t_tab_data",
+        #         tab_data.shape,
+        #         dtype=tf.float64,
+        #         trainable=False,
+        #         initializer=tf.constant_initializer(tab_data, dtype=tf.float64),
+        #     )
+
+        coord = paddle.reshape(coord_, [-1, natoms[1] * 3])
+        atype = paddle.reshape(atype_, [-1, natoms[1]])
+        # input_dict["nframes"] = paddle.shape(coord)[0]  # commented out when exporting the inference model, otherwise it raises an error
 
         # type embedding if any
-        if self.typeebd is not None:
-            type_embedding = self.typeebd.build(
-                self.ntypes,
-                reuse=reuse,
-                suffix=suffix,
-            )
-            input_dict["type_embedding"] = type_embedding
+        # if self.typeebd is not None:
+        #     type_embedding = self.typeebd.build(
+        #         self.ntypes,
+        #         reuse=reuse,
+        #         suffix=suffix,
+        #     )
+        #     input_dict["type_embedding"] = type_embedding
         # spin if any
-        if self.spin is not None:
-            type_spin = self.spin.build(
-                reuse=reuse,
-                suffix=suffix,
-            )
+        # if self.spin is not None:
+        #     type_spin = self.spin.build(
+        #         reuse=reuse,
+        #         suffix=suffix,
+        #     )
         input_dict["atype"] = atype_
 
-        dout = self.build_descrpt(
+        dout = self.descrpt(
             coord,
             atype,
             natoms,
             box,
             mesh,
            input_dict,
-            frz_model=frz_model,
-            ckpt_meta=ckpt_meta,
+            # frz_model=frz_model,
+            # ckpt_meta=ckpt_meta,
             suffix=suffix,
            reuse=reuse,
         )
+        # self.dout = dout
 
-        if
self.srtab is not None: - nlist, rij, sel_a, sel_r = self.descrpt.get_nlist() - nnei_a = np.cumsum(sel_a)[-1] - nnei_r = np.cumsum(sel_r)[-1] + # if self.srtab is not None: + # nlist, rij, sel_a, sel_r = self.descrpt.get_nlist() + # nnei_a = np.cumsum(sel_a)[-1] + # nnei_r = np.cumsum(sel_r)[-1] - atom_ener = self.fitting.build( - dout, natoms, input_dict, reuse=reuse, suffix=suffix - ) + atom_ener = self.fitting(dout, natoms, input_dict, reuse=reuse, suffix=suffix) self.atom_ener = atom_ener - if self.srtab is not None: - sw_lambda, sw_deriv = op_module.soft_min_switch( - atype, - rij, - nlist, - natoms, - sel_a=sel_a, - sel_r=sel_r, - alpha=self.smin_alpha, - rmin=self.sw_rmin, - rmax=self.sw_rmax, - ) - inv_sw_lambda = 1.0 - sw_lambda - # NOTICE: - # atom energy is not scaled, - # force and virial are scaled - tab_atom_ener, tab_force, tab_atom_virial = op_module.pair_tab( - self.tab_info, - self.tab_data, - atype, - rij, - nlist, - natoms, - sw_lambda, - sel_a=sel_a, - sel_r=sel_r, - ) - energy_diff = tab_atom_ener - tf.reshape(atom_ener, [-1, natoms[0]]) - tab_atom_ener = tf.reshape(sw_lambda, [-1]) * tf.reshape( - tab_atom_ener, [-1] - ) - atom_ener = tf.reshape(inv_sw_lambda, [-1]) * atom_ener - energy_raw = tab_atom_ener + atom_ener - else: - energy_raw = atom_ener + # if self.srtab is not None: + # sw_lambda, sw_deriv = op_module.soft_min_switch( + # atype, + # rij, + # nlist, + # natoms, + # sel_a=sel_a, + # sel_r=sel_r, + # alpha=self.smin_alpha, + # rmin=self.sw_rmin, + # rmax=self.sw_rmax, + # ) + # inv_sw_lambda = 1.0 - sw_lambda + # # NOTICE: + # # atom energy is not scaled, + # # force and virial are scaled + # tab_atom_ener, tab_force, tab_atom_virial = op_module.pair_tab( + # self.tab_info, + # self.tab_data, + # atype, + # rij, + # nlist, + # natoms, + # sw_lambda, + # sel_a=sel_a, + # sel_r=sel_r, + # ) + # energy_diff = tab_atom_ener - tf.reshape(atom_ener, [-1, natoms[0]]) + # tab_atom_ener = tf.reshape(sw_lambda, [-1]) * tf.reshape( + # tab_atom_ener, [-1] + # ) + # atom_ener = tf.reshape(inv_sw_lambda, [-1]) * atom_ener + # energy_raw = tab_atom_ener + atom_ener + # else: + energy_raw = atom_ener nloc_atom = ( natoms[0] if self.spin is None - else tf.reduce_sum(natoms[2 : 2 + len(self.spin.use_spin)]) + else paddle.sum(natoms[2 : 2 + len(self.spin.use_spin)]).item() ) - energy_raw = tf.reshape( + energy_raw = paddle.reshape( energy_raw, [-1, nloc_atom], name="o_atom_energy" + suffix ) - energy = tf.reduce_sum( - global_cvt_2_ener_float(energy_raw), axis=1, name="o_energy" + suffix - ) + energy = paddle.sum(energy_raw, axis=1, name="o_energy" + suffix) force, virial, atom_virial = self.descrpt.prod_force_virial(atom_ener, natoms) - if self.srtab is not None: - sw_force = op_module.soft_min_force( - energy_diff, sw_deriv, nlist, natoms, n_a_sel=nnei_a, n_r_sel=nnei_r - ) - force = force + sw_force + tab_force + # if self.srtab is not None: + # sw_force = op_module.soft_min_force( + # energy_diff, sw_deriv, nlist, natoms, n_a_sel=nnei_a, n_r_sel=nnei_r + # ) + # force = force + sw_force + tab_force - force = tf.reshape(force, [-1, 3 * natoms[1]]) + force = paddle.reshape(force, [-1, 3 * natoms[1]]) if self.spin is not None: # split and concatenate force to compute local atom force and magnetic force - judge = tf.equal(natoms[0], natoms[1]) - force = tf.cond( + judge = paddle.equal(natoms[0], natoms[1]) + force = paddle.where( judge, - lambda: self.natoms_match(force, natoms), - lambda: self.natoms_not_match(force, natoms, atype), + self.natoms_match(force, natoms), + 
self.natoms_not_match(force, natoms, atype), ) - force = tf.reshape(force, [-1, 3 * natoms[1]], name="o_force" + suffix) - - if self.srtab is not None: - sw_virial, sw_atom_virial = op_module.soft_min_virial( - energy_diff, - sw_deriv, - rij, - nlist, - natoms, - n_a_sel=nnei_a, - n_r_sel=nnei_r, - ) - atom_virial = atom_virial + sw_atom_virial + tab_atom_virial - virial = ( - virial - + sw_virial - + tf.reduce_sum(tf.reshape(tab_atom_virial, [-1, natoms[1], 9]), axis=1) - ) - - virial = tf.reshape(virial, [-1, 9], name="o_virial" + suffix) - atom_virial = tf.reshape( + force = paddle.reshape(force, [-1, 3 * natoms[1]], name="o_force" + suffix) + + # if self.srtab is not None: + # sw_virial, sw_atom_virial = op_module.soft_min_virial( + # energy_diff, + # sw_deriv, + # rij, + # nlist, + # natoms, + # n_a_sel=nnei_a, + # n_r_sel=nnei_r, + # ) + # atom_virial = atom_virial + sw_atom_virial + tab_atom_virial + # virial = ( + # virial + # + sw_virial + # + tf.sum(tf.reshape(tab_atom_virial, [-1, natoms[1], 9]), axis=1) + # ) + + virial = paddle.reshape(virial, [-1, 9], name="o_virial" + suffix) + atom_virial = paddle.reshape( atom_virial, [-1, 9 * natoms[1]], name="o_atom_virial" + suffix ) model_dict = {} - model_dict["energy"] = energy - model_dict["force"] = force - model_dict["virial"] = virial - model_dict["atom_ener"] = energy_raw - model_dict["atom_virial"] = atom_virial - model_dict["coord"] = coord - model_dict["atype"] = atype - + model_dict["energy"] = energy # [5] + model_dict["force"] = force # [5, 576] + model_dict["virial"] = virial # [5, 9] + model_dict["atom_ener"] = energy_raw # [5, 192] + model_dict["atom_virial"] = atom_virial # [5, 1728] + model_dict["coord"] = coord # [5, 576] + model_dict["atype"] = atype # [5, 192] + + # model_dict["zdebug1"] = self.descrpt.descrpt + # model_dict["zdebug2"] = self.descrpt.descrpt_deriv + # model_dict["zdebug3"] = self.descrpt.rij + # model_dict["zdebug4"] = self.descrpt.nlist + # model_dict["zdebug5"] = self.descrpt.dout + # model_dict["zdebug6"] = self.descrpt.qmat + # model_dict["zdebug7"] = self.descrpt.xyz_scatter_input + # model_dict["zdebug8"] = self.descrpt.xyz_scatter_output + + # model_dict["zdebug9"] = self.descrpt.debug_inputs + # model_dict["zdebug99"] = self.descrpt.debug_inputs_i + # model_dict["zdebug999"] = self.descrpt.debug_inputs_reshape + # model_dict["zdebug9999"] = self.descrpt.debug_xyz_scatter + # model_dict["zdebug99999"] = self.descrpt.debug_xyz_scatter_input + # model_dict["zdebug999999"] = self.descrpt.debug_xyz_scatter_output + + # model_dict["z00_hidden1"] = self.descrpt.embedding_nets[0][0].hidden1 + # model_dict["z00_hidden2"] = self.descrpt.embedding_nets[0][0].hidden2 + # model_dict["z00_hidden3"] = self.descrpt.embedding_nets[0][0].hidden3 + # model_dict["z00_xx1"] = self.descrpt.embedding_nets[0][0].xx1 + # model_dict["z00_xx2"] = self.descrpt.embedding_nets[0][0].xx2 + # model_dict["z00_xx3"] = self.descrpt.embedding_nets[0][0].xx3 + # model_dict["z00_xx4"] = self.descrpt.embedding_nets[0][0].xx4 + # model_dict["z00_0"] = self.descrpt.embedding_nets[0][0].weight[0] + # model_dict["z00_1"] = self.descrpt.embedding_nets[0][0].bias[0] + # model_dict["z00_2"] = self.descrpt.embedding_nets[0][0].xx1 + # model_dict["z00_3"] = self.descrpt.embedding_nets[0][0].hidden1 return model_dict def init_variables( diff --git a/deepmd/model/model.py b/deepmd/model/model.py index 8e6ffad910..660e30dbce 100644 --- a/deepmd/model/model.py +++ b/deepmd/model/model.py @@ -1,27 +1,17 @@ -from abc import ( - ABC, - 
abstractmethod, -) -from enum import ( - Enum, -) -from typing import ( - List, - Optional, - Union, -) +from abc import ABC +from abc import abstractmethod +from enum import Enum +from typing import List +from typing import Optional +from typing import Union -from deepmd.env import ( - GLOBAL_TF_FLOAT_PRECISION, - tf, -) -from deepmd.utils.graph import ( - load_graph_def, -) +from deepmd.env import GLOBAL_TF_FLOAT_PRECISION +from deepmd.env import tf +from deepmd.utils.graph import load_graph_def class Model(ABC): - @abstractmethod + # @abstractmethod def build( self, coord_: tf.Tensor, diff --git a/deepmd/paddle_ops.egg-info/PKG-INFO b/deepmd/paddle_ops.egg-info/PKG-INFO new file mode 100644 index 0000000000..08ad719487 --- /dev/null +++ b/deepmd/paddle_ops.egg-info/PKG-INFO @@ -0,0 +1,3 @@ +Metadata-Version: 2.1 +Name: paddle-ops +Version: 0.0.0 diff --git a/deepmd/paddle_ops.egg-info/SOURCES.txt b/deepmd/paddle_ops.egg-info/SOURCES.txt new file mode 100644 index 0000000000..8933b93cb1 --- /dev/null +++ b/deepmd/paddle_ops.egg-info/SOURCES.txt @@ -0,0 +1,12 @@ +load_paddle_op.py +../source/op/paddle_ops/srcs/pd_prod_env_mat_multi_devices_cpu.cc +../source/op/paddle_ops/srcs/pd_prod_env_mat_multi_devices_cuda.cc +../source/op/paddle_ops/srcs/pd_prod_force_se_a_multi_devices_cpu.cc +../source/op/paddle_ops/srcs/pd_prod_force_se_a_multi_devices_cuda.cc +../source/op/paddle_ops/srcs/pd_prod_virial_se_a_multi_devices_cpu.cc +../source/op/paddle_ops/srcs/pd_prod_virial_se_a_multi_devices_cuda.cc +paddle_ops.egg-info/PKG-INFO +paddle_ops.egg-info/SOURCES.txt +paddle_ops.egg-info/dependency_links.txt +paddle_ops.egg-info/not-zip-safe +paddle_ops.egg-info/top_level.txt \ No newline at end of file diff --git a/deepmd/paddle_ops.egg-info/dependency_links.txt b/deepmd/paddle_ops.egg-info/dependency_links.txt new file mode 100644 index 0000000000..8b13789179 --- /dev/null +++ b/deepmd/paddle_ops.egg-info/dependency_links.txt @@ -0,0 +1 @@ + diff --git a/deepmd/paddle_ops.egg-info/not-zip-safe b/deepmd/paddle_ops.egg-info/not-zip-safe new file mode 100644 index 0000000000..8b13789179 --- /dev/null +++ b/deepmd/paddle_ops.egg-info/not-zip-safe @@ -0,0 +1 @@ + diff --git a/deepmd/paddle_ops.egg-info/top_level.txt b/deepmd/paddle_ops.egg-info/top_level.txt new file mode 100644 index 0000000000..7a1d7479ce --- /dev/null +++ b/deepmd/paddle_ops.egg-info/top_level.txt @@ -0,0 +1 @@ +paddle_ops diff --git a/deepmd/train/trainer.py b/deepmd/train/trainer.py index 580d434533..04e159d55e 100644 --- a/deepmd/train/trainer.py +++ b/deepmd/train/trainer.py @@ -5,89 +5,56 @@ import platform import shutil import time -from typing import ( - Dict, - List, -) +from typing import Dict +from typing import List import google.protobuf.message import numpy as np -from packaging.version import ( - Version, -) -from tensorflow.python.client import ( - timeline, -) +from packaging.version import Version +from tensorflow.python.client import timeline # load grad of force module import deepmd.op # noqa: F401 -from deepmd.common import ( - data_requirement, - get_precision, - j_must_have, -) -from deepmd.descriptor.descriptor import ( - Descriptor, -) -from deepmd.env import ( - GLOBAL_ENER_FLOAT_PRECISION, - GLOBAL_TF_FLOAT_PRECISION, - TF_VERSION, - get_tf_session_config, - tf, - tfv2, -) -from deepmd.fit import ( - Fitting, -) -from deepmd.loss import ( - DOSLoss, - EnerDipoleLoss, - EnerSpinLoss, - EnerStdLoss, - TensorLoss, -) -from deepmd.model import ( - DipoleModel, - DOSModel, - EnerModel, - MultiModel, - PolarModel, 
-) +from deepmd.common import data_requirement +from deepmd.common import get_precision +from deepmd.common import j_must_have +from deepmd.descriptor.descriptor import Descriptor +from deepmd.env import GLOBAL_ENER_FLOAT_PRECISION +from deepmd.env import GLOBAL_PD_FLOAT_PRECISION +from deepmd.env import GLOBAL_TF_FLOAT_PRECISION +from deepmd.env import TF_VERSION +from deepmd.env import get_tf_session_config +from deepmd.env import paddle +from deepmd.env import tf +from deepmd.env import tfv2 +from deepmd.fit import Fitting +from deepmd.fit import ener +from deepmd.loss import DOSLoss +from deepmd.loss import EnerDipoleLoss +from deepmd.loss import EnerSpinLoss +from deepmd.loss import EnerStdLoss +from deepmd.loss import TensorLoss +from deepmd.model import DipoleModel +from deepmd.model import DOSModel +from deepmd.model import EnerModel +from deepmd.model import MultiModel +from deepmd.model import PolarModel from deepmd.utils import random as dp_random -from deepmd.utils.argcheck import ( - type_embedding_args, -) -from deepmd.utils.data_system import ( - DeepmdDataSystem, -) -from deepmd.utils.errors import ( - GraphTooLargeError, - GraphWithoutTensorError, -) -from deepmd.utils.graph import ( - get_tensor_by_name_from_graph, - load_graph_def, -) -from deepmd.utils.learning_rate import ( - LearningRateExp, -) -from deepmd.utils.sess import ( - run_sess, -) -from deepmd.utils.spin import ( - Spin, -) -from deepmd.utils.type_embed import ( - TypeEmbedNet, -) +from deepmd.utils.argcheck import type_embedding_args +from deepmd.utils.data_system import DeepmdDataSystem +from deepmd.utils.errors import GraphTooLargeError +from deepmd.utils.errors import GraphWithoutTensorError +from deepmd.utils.graph import get_tensor_by_name_from_graph +from deepmd.utils.graph import load_graph_def +from deepmd.utils.learning_rate import LearningRateExp +from deepmd.utils.sess import run_sess +from deepmd.utils.spin import Spin +from deepmd.utils.type_embed import TypeEmbedNet log = logging.getLogger(__name__) # nvnmd -from deepmd.nvnmd.utils.config import ( - nvnmd_cfg, -) +from deepmd.nvnmd.utils.config import nvnmd_cfg def _is_subdir(path, directory): @@ -158,7 +125,8 @@ def _init_param(self, jdata): descrpt_param["multi_task"] = True if descrpt_param["type"] in ["se_e2_a", "se_a", "se_e2_r", "se_r", "hybrid"]: descrpt_param["spin"] = self.spin - self.descrpt = Descriptor(**descrpt_param) + descrpt_param.pop("type") + self.descrpt = deepmd.descriptor.se_a.DescrptSeA(**descrpt_param) # fitting net if not self.multi_task_mode: @@ -167,7 +135,8 @@ def _init_param(self, jdata): fitting_param["descrpt"] = self.descrpt if fitting_type == "ener": fitting_param["spin"] = self.spin - self.fitting = Fitting(**fitting_param) + fitting_param.pop("type") + self.fitting = ener.EnerFitting(**fitting_param) else: self.fitting_dict = {} self.fitting_type_dict = {} @@ -316,7 +285,7 @@ def get_lr_and_coef(lr_param): # loss # infer loss type by fitting_type - def loss_init(_loss_param, _fitting_type, _fitting, _lr): + def loss_init(_loss_param, _fitting_type, _fitting, _lr) -> EnerStdLoss: _loss_type = _loss_param.get("type", "ener") if _fitting_type == "ener": _loss_param.pop("type", None) @@ -576,10 +545,10 @@ def build(self, data=None, stop_batch=0, origin_type_map=None, suffix=""): # for fparam or aparam settings in 'ener' type fitting net self.fitting.init_variables(graph, graph_def) - if self.is_compress or self.model_type == "compressed_model": - tf.constant("compressed_model", name="model_type", dtype=tf.string) 
- else: - tf.constant("original_model", name="model_type", dtype=tf.string) + # if self.is_compress or self.model_type == "compressed_model": + # tf.constant("compressed_model", name="model_type", dtype=tf.string) + # else: + # tf.constant("original_model", name="model_type", dtype=tf.string) if self.mixed_prec is not None: self.descrpt.enable_mixed_precision(self.mixed_prec) @@ -593,17 +562,17 @@ def build(self, data=None, stop_batch=0, origin_type_map=None, suffix=""): self._build_lr() self._build_network(data, suffix) - self._build_training() + # self._build_training() def _build_lr(self): - self._extra_train_ops = [] - self.global_step = tf.train.get_or_create_global_step() + # self._extra_train_ops = [] + self.global_step = 0 if not self.multi_task_mode: self.learning_rate = self.lr.build(self.global_step, self.stop_batch) else: self.learning_rate_dict = {} for fitting_key in self.fitting_type_dict: - self.learning_rate_dict[fitting_key] = self.lr_dict[fitting_key].build( + self.lr_scheduler[fitting_key] = self.lr.build( self.global_step, self.stop_batch ) @@ -678,7 +647,7 @@ def _build_network(self, data, suffix=""): reuse=False, ) - self.l2_l, self.l2_more = self._build_loss() + # self.l2_l, self.l2_more = self._build_loss() log.info("built network") @@ -813,12 +782,12 @@ def _init_session(self): log.info("receive global variables from task#0") run_sess(self.sess, bcast_op) - def train(self, train_data=None, valid_data=None): + def train(self, train_data=None, valid_data=None, stop_batch: int = 10): # if valid_data is None: # no validation set specified. # valid_data = train_data # using training set as validation set. - stop_batch = self.stop_batch - self._init_session() + # stop_batch = self.stop_batch + # self._init_session() # Before data shard is enabled, only cheif do evaluation and record it # self.print_head() @@ -826,15 +795,18 @@ def train(self, train_data=None, valid_data=None): if self.run_opt.is_chief: fp = open(self.disp_file, "a") - cur_batch = run_sess(self.sess, self.global_step) + cur_batch = self.global_step is_first_step = True self.cur_batch = cur_batch + self.optimizer = paddle.optimizer.Adam( + learning_rate=self.learning_rate, parameters=self.model.parameters() + ) if not self.multi_task_mode: log.info( "start training at lr %.2e (== %.2e), decay_step %d, decay_rate %f, final lr will be %.2e" % ( - run_sess(self.sess, self.learning_rate), - self.lr.value(cur_batch), + self.learning_rate.get_lr(), + self.learning_rate.get_lr(), self.lr.decay_steps_, self.lr.decay_rate_, self.lr.value(stop_batch), @@ -846,56 +818,51 @@ def train(self, train_data=None, valid_data=None): "%s: start training at lr %.2e (== %.2e), decay_step %d, decay_rate %f, final lr will be %.2e" % ( fitting_key, - run_sess(self.sess, self.learning_rate_dict[fitting_key]), - self.lr_dict[fitting_key].value(cur_batch), + self.learning_rate[fitting_key].base_lr, + self.lr_dict[fitting_key].get_lr(), self.lr_dict[fitting_key].decay_steps_, self.lr_dict[fitting_key].decay_rate_, self.lr_dict[fitting_key].value(stop_batch), ) ) - prf_options = None - prf_run_metadata = None - if self.profiling: - prf_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE) - prf_run_metadata = tf.RunMetadata() + # prf_options = None + # prf_run_metadata = None + # if self.profiling: + # prf_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE) + # prf_run_metadata = tf.RunMetadata() # set tensorboard execution environment - if self.tensorboard: - summary_merged_op = tf.summary.merge_all() - # Remove TB 
old logging directory from previous run - try: - shutil.rmtree(self.tensorboard_log_dir) - except FileNotFoundError: - pass # directory does not exist, this is OK - except Exception as e: - # general error when removing directory, warn user - log.exception( - f"Could not remove old tensorboard logging directory: " - f"{self.tensorboard_log_dir}. Error: {e}" - ) - else: - log.debug("Removing old tensorboard log directory.") - tb_train_writer = tf.summary.FileWriter( - self.tensorboard_log_dir + "/train", self.sess.graph - ) - tb_valid_writer = tf.summary.FileWriter(self.tensorboard_log_dir + "/test") - else: - tb_train_writer = None - tb_valid_writer = None - if self.enable_profiler: - # https://www.tensorflow.org/guide/profiler - tfv2.profiler.experimental.start(self.tensorboard_log_dir) + # if self.tensorboard: + # summary_merged_op = tf.summary.merge_all() + # # Remove TB old logging directory from previous run + # try: + # shutil.rmtree(self.tensorboard_log_dir) + # except FileNotFoundError: + # pass # directory does not exist, this is OK + # except Exception as e: + # # general error when removing directory, warn user + # log.exception( + # f"Could not remove old tensorboard logging directory: " + # f"{self.tensorboard_log_dir}. Error: {e}" + # ) + # else: + # log.debug("Removing old tensorboard log directory.") + # tb_train_writer = tf.summary.FileWriter( + # self.tensorboard_log_dir + "/train", self.sess.graph + # ) + # tb_valid_writer = tf.summary.FileWriter(self.tensorboard_log_dir + "/test") + # else: + # tb_train_writer = None + # tb_valid_writer = None + # if self.enable_profiler: + # # https://www.tensorflow.org/guide/profiler + # tfv2.profiler.experimental.start(self.tensorboard_log_dir) train_time = 0 total_train_time = 0.0 wall_time_tic = time.time() - next_batch_train_op = None - next_fitting_key = None - next_train_batch_list = None - next_datasetloader = None - # dataset loader op if not self.multi_task_mode: datasetloader = DatasetLoader(train_data) @@ -908,35 +875,36 @@ def train(self, train_data=None, valid_data=None): data_op[fitting_key] = datasetloader[fitting_key].build() while cur_batch < stop_batch: + train_batch = datasetloader.get_data_dict() # first round validation: - if is_first_step: - if not self.multi_task_mode: - train_batch = train_data.get_batch() - batch_train_op = self.train_op - else: - fitting_idx = dp_random.choice( - np.arange(self.nfitting), p=np.array(self.fitting_prob) - ) - fitting_key = self.fitting_key_list[fitting_idx] - train_batch = train_data[fitting_key].get_batch() - batch_train_op = self.train_op[fitting_key] - else: - train_batch = next_datasetloader.get_data_dict(next_train_batch_list) - batch_train_op = next_batch_train_op - fitting_key = next_fitting_key + # if is_first_step: + # if not self.multi_task_mode: + # train_batch = train_data.get_batch() + # # batch_train_op = self.train_op + # else: + # fitting_idx = dp_random.choice( + # np.arange(self.nfitting), p=np.array(self.fitting_prob) + # ) + # fitting_key = self.fitting_key_list[fitting_idx] + # train_batch = train_data[fitting_key].get_batch() + # # batch_train_op = self.train_op[fitting_key] + # else: + # train_batch = next_datasetloader.get_data_dict(next_train_batch_list) + # # batch_train_op = next_batch_train_op + # fitting_key = next_fitting_key # for next round - if not self.multi_task_mode: - next_datasetloader = datasetloader - next_batch_train_op = self.train_op - next_train_batch_op = data_op - else: - fitting_idx = dp_random.choice( - np.arange(self.nfitting), 
p=np.array(self.fitting_prob) - ) - next_fitting_key = self.fitting_key_list[fitting_idx] - next_datasetloader = datasetloader[next_fitting_key] - next_batch_train_op = self.train_op[fitting_key] - next_train_batch_op = data_op[fitting_key] + # if not self.multi_task_mode: + # next_datasetloader = datasetloader + # next_batch_train_op = self.train_op + # next_train_batch_op = data_op + # else: + # fitting_idx = dp_random.choice( + # np.arange(self.nfitting), p=np.array(self.fitting_prob) + # ) + # next_fitting_key = self.fitting_key_list[fitting_idx] + # next_datasetloader = datasetloader[next_fitting_key] + # next_batch_train_op = self.train_op[fitting_key] + # next_train_batch_op = data_op[fitting_key] if self.display_in_training and is_first_step: if self.run_opt.is_chief: @@ -982,32 +950,180 @@ def train(self, train_data=None, valid_data=None): if self.timing_in_training: tic = time.time() - train_feed_dict = self.get_feed_dict(train_batch, is_training=True) + # train_feed_dict = self.get_feed_dict(train_batch, is_training=True) # use tensorboard to visualize the training of deepmd-kit # it will takes some extra execution time to generate the tensorboard data if self.tensorboard and (cur_batch % self.tensorboard_freq == 0): - summary, _, next_train_batch_list = run_sess( - self.sess, - [summary_merged_op, batch_train_op, next_train_batch_op], - feed_dict=train_feed_dict, - options=prf_options, - run_metadata=prf_run_metadata, + # summary, _, next_train_batch_list = run_sess( + # self.sess, + # [summary_merged_op, batch_train_op, next_train_batch_op], + # feed_dict=train_feed_dict, + # options=prf_options, + # run_metadata=prf_run_metadata, + # ) + # tb_train_writer.add_summary(summary, cur_batch) + model_pred = self.model( + paddle.to_tensor(train_batch["coord"], "float32"), + paddle.to_tensor(train_batch["type"], "int32"), + paddle.to_tensor(train_batch["natoms_vec"], "int32", "cpu"), + paddle.to_tensor(train_batch["box"], "float32"), + paddle.to_tensor(train_batch["default_mesh"], "int32"), + train_batch, + suffix="", + reuse=False, ) - tb_train_writer.add_summary(summary, cur_batch) else: - _, next_train_batch_list = run_sess( - self.sess, - [batch_train_op, next_train_batch_op], - feed_dict=train_feed_dict, - options=prf_options, - run_metadata=prf_run_metadata, + # for k, v in train_feed_dict.items(): + # print(f"{k} {v.shape if hasattr(v, 'shape') else v}") + """ + find_box:0", dtype=float32) () + find_coord:0", dtype=float32) () + find_numb_copy:0", dtype=float32) () + find_energy:0", dtype=float32) () + find_force:0", dtype=float32) () + find_virial:0", dtype=float32) () + find_atom_ener:0", dtype=float32) () + find_atom_pref:0", dtype=float32) () + box:0", shape=(?,), dtype=float64) (9,) + coord:0", shape=(?,), dtype=float64) (576,) + numb_copy:0", shape=(?,), dtype=float64) (1,) + energy:0", shape=(?,), dtype=float64) (1,) + force:0", shape=(?,), dtype=float64) (576,) + virial:0", shape=(?,), dtype=float64) (9,) + atom_ener:0", shape=(?,), dtype=float64) (192,) + atom_pref:0", shape=(?,), dtype=float64) (576,) + natoms:0", shape=(4,), dtype=int32) (4,) + mesh:0", shape=(?,), dtype=int32) (6,) + type:0", shape=(?,), dtype=int32) (192,) + aceholder:0", dtype=bool) True + """ + model_inputs = {} + for kk in train_batch.keys(): + if kk == "find_type" or kk == "type": + continue + prec = "float64" + if "find_" in kk: + model_inputs[kk] = paddle.to_tensor( + train_batch[kk], dtype="float64" + ) + else: + model_inputs[kk] = paddle.to_tensor( + np.reshape(train_batch[kk], [-1]), 
dtype=prec + ) + + for ii in ["type"]: + model_inputs[ii] = paddle.to_tensor( + np.reshape(train_batch[ii], [-1]), dtype="int32" + ) + for ii in ["natoms_vec", "default_mesh"]: + model_inputs[ii] = paddle.to_tensor(train_batch[ii], dtype="int32") + model_inputs["is_training"] = paddle.to_tensor(True) + model_inputs["natoms_vec"] = paddle.to_tensor( + model_inputs["natoms_vec"], place="cpu" + ) + # for k, v in model_inputs.items(): + # np.save( + # "/workspace/hesensen/deepmd_backend/" + # f"deepmd-kit/examples/water/se_e2_a/align_input/{k}", + # v, + # ) + # exit() + # { + # find_box: [] + # box: [9] + # find_coord: [] + # coord: [576] + # find_numb_copy: [] + # numb_copy: [1] + # find_energy: [] + # energy: [1] + # find_force: [] + # force: [576] + # find_virial: [] + # virial: [9] + # find_atom_ener: [] + # atom_ener: [192] + # find_atom_pref: [] + # atom_pref: [576] + # natoms_vec: [4] + # default_mesh: [6] + # type: [192] + # is_training: [] + # } + model_pred = self.model( + model_inputs["coord"], + model_inputs["type"], + model_inputs["natoms_vec"], + model_inputs["box"], + model_inputs["default_mesh"], + model_inputs, + suffix="", + reuse=False, + ) + # for k, v in model_pred.items(): + # np.save( + # "/workspace/hesensen/deepmd_backend/" + # f"deepmd-kit/examples/water/se_e2_a/align_input/pred_{k}", + # v, + # ) + # exit() + + # loss = ( + # model_pred["force"].sum() + # + model_pred["virial"].sum() + # + model_pred["energy"].sum() + # + model_pred["atom_ener"].sum() + # ) + # print(f"{self.cur_batch} {self.learning_rate.get_lr():.10f}") + l2_l, l2_more = self.loss.compute_loss( + self.learning_rate.get_lr(), + model_inputs["natoms_vec"], + model_pred, + model_inputs, + suffix="train", ) + + self.optimizer.clear_grad() + l2_l.backward() + self.optimizer.step() + self.global_step += 1 + + # _, next_train_batch_list = run_sess( + # self.sess, + # [batch_train_op, next_train_batch_op], + # feed_dict=train_feed_dict, + # options=prf_options, + # run_metadata=prf_run_metadata, + # ) + """next_train_batch_list + find_box (): none + box (1, 9): (1, 9) + find_coord (): none + coord (1, 576): (1, 576) + find_numb_copy (): none + numb_copy (1, 1): (1, 1) + find_energy (): none + energy (1, 1): (1, 1) + find_force (): none + force (1, 576): (1, 576) + find_virial (): none + virial (1, 9): (1, 9) + find_atom_ener (): none + atom_ener (1, 192): (1, 192) + find_atom_pref (): none + atom_pref (1, 576): (1, 576) + type (1, 192): (1, 192) + natoms_vec (4,): (4,) + default_mesh (6,): (6,) + """ if self.timing_in_training: toc = time.time() if self.timing_in_training: train_time += toc - tic - cur_batch = run_sess(self.sess, self.global_step) + cur_batch = self.global_step self.cur_batch = cur_batch + if (cur_batch % self.lr.decay_steps_) == 0: + self.learning_rate.step() # on-the-fly validation if self.display_in_training and (cur_batch % self.disp_freq == 0): @@ -1060,12 +1176,10 @@ def train(self, train_data=None, valid_data=None): if ( self.save_freq > 0 and cur_batch % self.save_freq == 0 - and self.saver is not None + # and self.saver is not None ): self.save_checkpoint(cur_batch) - if ( - self.save_freq == 0 or cur_batch == 0 or cur_batch % self.save_freq != 0 - ) and self.saver is not None: + if self.save_freq == 0 or cur_batch == 0 or cur_batch % self.save_freq != 0: self.save_checkpoint(cur_batch) if self.run_opt.is_chief: fp.close() @@ -1083,42 +1197,44 @@ def train(self, train_data=None, valid_data=None): total_train_time / (stop_batch // self.disp_freq * self.disp_freq), ) - if 
self.profiling and self.run_opt.is_chief: - fetched_timeline = timeline.Timeline(prf_run_metadata.step_stats) - chrome_trace = fetched_timeline.generate_chrome_trace_format() - with open(self.profiling_file, "w") as f: - f.write(chrome_trace) - if self.enable_profiler and self.run_opt.is_chief: - tfv2.profiler.experimental.stop() + # if self.profiling and self.run_opt.is_chief: + # fetched_timeline = timeline.Timeline(prf_run_metadata.step_stats) + # chrome_trace = fetched_timeline.generate_chrome_trace_format() + # with open(self.profiling_file, "w") as f: + # f.write(chrome_trace) + # if self.enable_profiler and self.run_opt.is_chief: + # tfv2.profiler.experimental.stop() def save_checkpoint(self, cur_batch: int): - try: - ckpt_prefix = self.saver.save( - self.sess, - os.path.join(os.getcwd(), self.save_ckpt), - global_step=cur_batch, - ) - except google.protobuf.message.DecodeError as e: - raise GraphTooLargeError( - "The graph size exceeds 2 GB, the hard limitation of protobuf." - " Then a DecodeError was raised by protobuf. You should " - "reduce the size of your model." - ) from e - # make symlinks from prefix with step to that without step to break nothing - # get all checkpoint files - original_files = glob.glob(ckpt_prefix + ".*") - for ori_ff in original_files: - new_ff = self.save_ckpt + ori_ff[len(ckpt_prefix) :] - try: - # remove old one - os.remove(new_ff) - except OSError: - pass - if platform.system() != "Windows": - # by default one does not have access to create symlink on Windows - os.symlink(ori_ff, new_ff) - else: - shutil.copyfile(ori_ff, new_ff) + # try: + # ckpt_prefix = self.saver.save( + # self.sess, + # os.path.join(os.getcwd(), self.save_ckpt), + # global_step=cur_batch, + # ) + # except google.protobuf.message.DecodeError as e: + # raise GraphTooLargeError( + # "The graph size exceeds 2 GB, the hard limitation of protobuf." + # " Then a DecodeError was raised by protobuf. You should " + # "reduce the size of your model." 
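Editor's note: `save_checkpoint` below drops the TF `Saver` (and with it the protobuf 2 GB graph limit) in favor of `paddle.save` on plain state dicts. A hedged sketch of the full round trip; the file names are illustrative, not the ones the patch writes, and the symlink bookkeeping of the old code has no equivalent here:

```python
import paddle

def save_ckpt(model: paddle.nn.Layer, opt: paddle.optimizer.Optimizer, step: int):
    # one file per state_dict, mirroring the two paddle.save calls below
    paddle.save(model.state_dict(), f"model_{step}.pdparams")
    paddle.save(opt.state_dict(), f"opt_{step}.pdopt")

def load_ckpt(model: paddle.nn.Layer, opt: paddle.optimizer.Optimizer, step: int):
    # set_state_dict restores parameters / optimizer moments in place
    model.set_state_dict(paddle.load(f"model_{step}.pdparams"))
    opt.set_state_dict(paddle.load(f"opt_{step}.pdopt"))
```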
+        #     ) from e
+        # # make symlinks from prefix with step to that without step to break nothing
+        # # get all checkpoint files
+        # original_files = glob.glob(ckpt_prefix + ".*")
+        # for ori_ff in original_files:
+        #     new_ff = self.save_ckpt + ori_ff[len(ckpt_prefix) :]
+        #     try:
+        #         # remove old one
+        #         os.remove(new_ff)
+        #     except OSError:
+        #         pass
+        #     if platform.system() != "Windows":
+        #         # by default one does not have access to create symlink on Windows
+        #         os.symlink(ori_ff, new_ff)
+        #     else:
+        #         shutil.copyfile(ori_ff, new_ff)
+        paddle.save(self.model.state_dict(), f"Model_{cur_batch}.pdparams")
+        paddle.save(self.optimizer.state_dict(), f"Optimizer_{cur_batch}.pdopt")
         log.info("saved checkpoint %s" % self.save_ckpt)

     def get_feed_dict(self, batch, is_training):
@@ -1127,18 +1243,18 @@ def get_feed_dict(self, batch, is_training):
             if kk == "find_type" or kk == "type" or kk == "real_natoms_vec":
                 continue
             if "find_" in kk:
-                feed_dict[self.place_holders[kk]] = batch[kk]
+                feed_dict[kk] = batch[kk]
             else:
-                feed_dict[self.place_holders[kk]] = np.reshape(batch[kk], [-1])
+                feed_dict[kk] = np.reshape(batch[kk], [-1])
         for ii in ["type"]:
-            feed_dict[self.place_holders[ii]] = np.reshape(batch[ii], [-1])
+            feed_dict[ii] = np.reshape(batch[ii], [-1])
         for ii in ["natoms_vec", "default_mesh"]:
-            feed_dict[self.place_holders[ii]] = batch[ii]
-        feed_dict[self.place_holders["is_training"]] = is_training
+            feed_dict[ii] = batch[ii]
+        feed_dict["is_training"] = is_training
         return feed_dict

     def get_global_step(self):
-        return run_sess(self.sess, self.global_step)
+        return self.global_step

     # def print_head (self) :  # deprecated
     #     if self.run_opt.is_chief:
@@ -1157,7 +1273,7 @@ def valid_on_the_fly(
         cur_batch = self.cur_batch
         if not self.multi_task_mode:
-            current_lr = run_sess(self.sess, self.learning_rate)
+            current_lr = self.learning_rate.get_lr()
         else:
             assert (
                 fitting_key is not None
@@ -1263,8 +1379,8 @@ def print_on_training(
             fp.write(print_str)
             fp.flush()

-    @staticmethod
-    def eval_single_list(single_batch_list, loss, sess, get_feed_dict_func, prefix=""):
+    # @staticmethod
+    def eval_single_list(self, single_batch_list, loss, prefix=""):
         if single_batch_list is None:
             return None
         numb_batch = len(single_batch_list)
@@ -1273,8 +1389,8 @@ def eval_single_list(single_batch_list, loss, sess, get_feed_dict_func, prefix="
         for i in range(numb_batch):
             batch = single_batch_list[i]
             natoms = batch["natoms_vec"]
-            feed_dict = get_feed_dict_func(batch, is_training=False)
-            results = loss.eval(sess, feed_dict, natoms)
+            # feed_dict = get_feed_dict_func(batch, is_training=False)
+            results = loss.eval(self.model, batch, natoms)

             for k, v in results.items():
                 if k == "natoms":
@@ -1290,9 +1406,7 @@ def eval_single_list(single_batch_list, loss, sess, get_feed_dict_func, prefix="
     def get_evaluation_results(self, batch_list):
         if not self.multi_task_mode:
-            avg_results = self.eval_single_list(
-                batch_list, self.loss, self.sess, self.get_feed_dict
-            )
+            avg_results = self.eval_single_list(batch_list, self.loss)
         else:
             avg_results = {}
             for fitting_key in batch_list:
@@ -1474,9 +1588,11 @@ def get_train_batch() -> List[np.ndarray]:
                 batch_data = tuple([batch_data[kk] for kk in self.data_keys])
                 return batch_data

-        return tf.py_func(get_train_batch, [], self.data_types, name="train_data")
+        return get_train_batch

-    def get_data_dict(self, batch_list: List[np.ndarray]) -> Dict[str, np.ndarray]:
+    def get_data_dict(
+        self, batch_list: List[np.ndarray] = None
+    ) -> Dict[str, np.ndarray]:
         """Generate a dict of the loaded data.
Parameters @@ -1489,4 +1605,8 @@ def get_data_dict(self, batch_list: List[np.ndarray]) -> Dict[str, np.ndarray]: Dict[str, np.ndarray] The dict of the loaded data. """ - return {kk: vv for kk, vv in zip(self.data_keys, batch_list)} + batch_data = self.train_data.get_batch() + # convert dict to list of arryas + batch_data = tuple([batch_data[kk] for kk in self.data_keys]) + return {kk: vv for kk, vv in zip(self.data_keys, batch_data)} + # return {kk: vv for kk, vv in zip(self.data_keys, batch_list)} diff --git a/deepmd/utils/batch_size.py b/deepmd/utils/batch_size.py index f393618cb1..5b55261d06 100644 --- a/deepmd/utils/batch_size.py +++ b/deepmd/utils/batch_size.py @@ -1,18 +1,12 @@ import logging import os -from typing import ( - Callable, - Tuple, -) +from typing import Callable +from typing import Tuple import numpy as np -from deepmd.env import ( - tf, -) -from deepmd.utils.errors import ( - OutOfMemoryError, -) +from deepmd.env import tf +from deepmd.utils.errors import OutOfMemoryError log = logging.getLogger(__name__) @@ -100,9 +94,11 @@ def execute( OOM when batch size is 1 """ try: + # print(__file__, self.current_batch_size, natoms) n_batch, result = callable( max(self.current_batch_size // natoms, 1), start_index ) + # print(__file__, n_batch) except OutOfMemoryError as e: # TODO: it's very slow to catch OOM error; I don't know what TF is doing here # but luckily we only need to catch once @@ -196,6 +192,7 @@ def execute_with_batch_size( for rr in result: rr.reshape((n_batch, -1)) results.append(result) + # print(__file__, "here") r = tuple([np.concatenate(r, axis=0) for r in zip(*results)]) if len(r) == 1: diff --git a/deepmd/utils/data.py b/deepmd/utils/data.py index 16fcbfc7c5..8ffb7bc58a 100644 --- a/deepmd/utils/data.py +++ b/deepmd/utils/data.py @@ -1,21 +1,15 @@ #!/usr/bin/env python3 import logging -from typing import ( - List, - Optional, -) +from typing import List +from typing import Optional import numpy as np -from deepmd.env import ( - GLOBAL_ENER_FLOAT_PRECISION, - GLOBAL_NP_FLOAT_PRECISION, -) +from deepmd.env import GLOBAL_ENER_FLOAT_PRECISION +from deepmd.env import GLOBAL_NP_FLOAT_PRECISION from deepmd.utils import random as dp_random -from deepmd.utils.path import ( - DPPath, -) +from deepmd.utils.path import DPPath log = logging.getLogger(__name__) @@ -458,7 +452,7 @@ def _load_set(self, set_name: DPPath): self.data_dict[kk]["ndof"], atomic=self.data_dict[kk]["atomic"], high_prec=self.data_dict[kk]["high_prec"], - must=self.data_dict[kk]["must"], + must=False, type_sel=self.data_dict[kk]["type_sel"], repeat=self.data_dict[kk]["repeat"], default=self.data_dict[kk]["default"], diff --git a/deepmd/utils/learning_rate.py b/deepmd/utils/learning_rate.py index 324f4f7fff..315138010e 100644 --- a/deepmd/utils/learning_rate.py +++ b/deepmd/utils/learning_rate.py @@ -1,12 +1,10 @@ -from typing import ( - Optional, -) +from typing import Optional import numpy as np +from paddle.optimizer import lr -from deepmd.env import ( - tf, -) +from deepmd.env import paddle +from deepmd.env import tf class LearningRateExp: @@ -89,13 +87,16 @@ def build( np.log(self.stop_lr_ / self.start_lr_) / (stop_step / self.decay_steps_) ) - return tf.train.exponential_decay( + # print("decay_steps_ = ", self.decay_steps_) + return lr.ExponentialDecay( self.start_lr_, - global_step, - self.decay_steps_, - self.decay_rate_, - staircase=True, + gamma=self.decay_rate_, ) + # return paddle.optimizer.lr.ExponentialDecay( + # learning_rate=self.start_lr_, + # gamma=self.decay_rate_ ** (1 / 
self.decay_steps_),
+        #     # verbose=True,
+        # )

     def start_lr(self) -> float:
         """Get the start lr."""
diff --git a/deepmd/utils/neighbor_stat.py b/deepmd/utils/neighbor_stat.py
index 9b23bc9d76..12ef281ef7 100644
--- a/deepmd/utils/neighbor_stat.py
+++ b/deepmd/utils/neighbor_stat.py
@@ -1,24 +1,25 @@
 import logging
 import math
-from typing import (
-    List,
-    Tuple,
-)
+from typing import List
+from typing import Tuple

 import numpy as np

-from deepmd.env import (
-    GLOBAL_NP_FLOAT_PRECISION,
-    default_tf_session_config,
-    op_module,
-    tf,
-)
-from deepmd.utils.data_system import (
-    DeepmdDataSystem,
-)
-from deepmd.utils.parallel_op import (
-    ParallelOp,
-)
+from deepmd.env import GLOBAL_NP_FLOAT_PRECISION
+from deepmd.env import default_tf_session_config
+from deepmd.env import op_module
+from deepmd.env import tf
+
+# from paddle.utils import cpp_extension
+# op_module = cpp_extension.load(
+#     name="custom_op_paddle2",
+#     sources=["/workspace/hesensen/deepmd-kit/source/op/paddle/neighbor_stat.cc"],
+#     extra_include_paths=["/workspace/hesensen/deepmd-kit/source/lib/include/","/usr/local/cuda/targets/x86_64-linux/include/", "/workspace/hesensen/deepmd-kit/source/op"],
+#     # extra_library_paths=["../build/lib/", "/usr/local/cuda/lib64"],
+#     verbose=True,
+# )
+from deepmd.utils.data_system import DeepmdDataSystem
+from deepmd.utils.parallel_op import ParallelOp

 log = logging.getLogger(__name__)
@@ -48,45 +49,44 @@ def __init__(
         self.rcut = rcut
         self.ntypes = ntypes
         self.one_type = one_type
-        sub_graph = tf.Graph()
-
-        def builder():
-            place_holders = {}
-            for ii in ["coord", "box"]:
-                place_holders[ii] = tf.placeholder(
-                    GLOBAL_NP_FLOAT_PRECISION, [None, None], name="t_" + ii
-                )
-            place_holders["type"] = tf.placeholder(
-                tf.int32, [None, None], name="t_type"
-            )
-            place_holders["natoms_vec"] = tf.placeholder(
-                tf.int32, [self.ntypes + 2], name="t_natoms"
-            )
-            place_holders["default_mesh"] = tf.placeholder(
-                tf.int32, [None], name="t_mesh"
-            )
-            t_type = place_holders["type"]
-            t_natoms = place_holders["natoms_vec"]
-            if self.one_type:
-                # all types = 0, natoms_vec = [natoms, natoms, natoms]
-                t_type = tf.clip_by_value(t_type, -1, 0)
-                t_natoms = tf.tile(t_natoms[0:1], [3])
-
-            _max_nbor_size, _min_nbor_dist = op_module.neighbor_stat(
-                place_holders["coord"],
-                t_type,
-                t_natoms,
-                place_holders["box"],
-                place_holders["default_mesh"],
-                rcut=self.rcut,
-            )
-            place_holders["dir"] = tf.placeholder(tf.string)
-            return place_holders, (_max_nbor_size, _min_nbor_dist, place_holders["dir"])
-
-        with sub_graph.as_default():
-            self.p = ParallelOp(builder, config=default_tf_session_config)
-
-        self.sub_sess = tf.Session(graph=sub_graph, config=default_tf_session_config)
+        # sub_graph = tf.Graph()
+
+        # def builder():
+        #     place_holders = {}
+        #     for ii in ["coord", "box"]:
+        #         place_holders[ii] = tf.placeholder(
+        #             GLOBAL_NP_FLOAT_PRECISION, [None, None], name="t_" + ii
+        #         )
+        #     place_holders["type"] = tf.placeholder(
+        #         tf.int32, [None, None], name="t_type"
+        #     )
+        #     place_holders["natoms_vec"] = tf.placeholder(
+        #         tf.int32, [self.ntypes + 2], name="t_natoms"
+        #     )
+        #     place_holders["default_mesh"] = tf.placeholder(
+        #         tf.int32, [None], name="t_mesh"
+        #     )
+        #     t_type = place_holders["type"]
+        #     t_natoms = place_holders["natoms_vec"]
+        #     if self.one_type:
+        #         # all types = 0, natoms_vec = [natoms, natoms, natoms]
+        #         t_type = tf.clip_by_value(t_type, -1, 0)
+        #         t_natoms = tf.tile(t_natoms[0:1], [3])
+        #     _max_nbor_size, _min_nbor_dist = op_module.neighbor_stat(  # computed only once here
+        #
place_holders["coord"], + # t_type, + # t_natoms, + # place_holders["box"], + # place_holders["default_mesh"], + # rcut=self.rcut, + # ) + # place_holders["dir"] = tf.placeholder(tf.string) + # return place_holders, (_max_nbor_size, _min_nbor_dist, place_holders["dir"]) + + # with sub_graph.as_default(): + # self.p = ParallelOp(builder, config=default_tf_session_config) + + # self.sub_sess = tf.Session(graph=sub_graph, config=default_tf_session_config) def get_stat(self, data: DeepmdDataSystem) -> Tuple[float, List[int]]: """Get the data statistics of the training data, including nearest nbor distance between atoms, max nbor size of atoms. @@ -108,44 +108,120 @@ def get_stat(self, data: DeepmdDataSystem) -> Tuple[float, List[int]]: if not self.one_type: self.max_nbor_size *= self.ntypes - def feed(): - for ii in range(len(data.system_dirs)): - for jj in data.data_systems[ii].dirs: - data_set = data.data_systems[ii]._load_set(jj) - for kk in range(np.array(data_set["type"]).shape[0]): - yield { - "coord": np.array(data_set["coord"])[kk].reshape( - [-1, data.natoms[ii] * 3] - ), - "type": np.array(data_set["type"])[kk].reshape( - [-1, data.natoms[ii]] - ), - "natoms_vec": np.array(data.natoms_vec[ii]), - "box": np.array(data_set["box"])[kk].reshape([-1, 9]), - "default_mesh": np.array(data.default_mesh[ii]), - "dir": str(jj), - } - - for mn, dt, jj in self.p.generate(self.sub_sess, feed()): - if dt.size != 0: - dt = np.min(dt) - else: - dt = self.rcut - log.warning( - "Atoms with no neighbors found in %s. Please make sure it's what you expected." - % jj - ) - if dt < self.min_nbor_dist: - if math.isclose(dt, 0.0, rel_tol=1e-6): - # it's unexpected that the distance between two atoms is zero - # zero distance will cause nan (#874) - raise RuntimeError( - "Some atoms are overlapping in %s. Please check your" - " training data to remove duplicated atoms." 
% jj + # def feed(): + # for ii in range(len(data.system_dirs)): + # for jj in data.data_systems[ii].dirs: + # data_set = data.data_systems[ii]._load_set(jj) + # for kk in range(np.array(data_set["type"]).shape[0]): + # ret = { + # "coord": np.array(data_set["coord"])[kk].reshape( + # [-1, data.natoms[ii] * 3] + # ), # (1, 576) + # "type": np.array(data_set["type"])[kk].reshape( + # [-1, data.natoms[ii]] + # ), # (1, 192) + # "natoms_vec": np.array(data.natoms_vec[ii]), # (4,) + # "box": np.array(data_set["box"])[kk].reshape([-1, 9]), # (1, 9) + # "default_mesh": np.array(data.default_mesh[ii]), # (6,) + # "dir": str(jj), # ../data/data_0/set.xxx + # } + # print(str(jj)) + # print("coord", ret["coord"].shape, ret["coord"].dtype) + # print("type", ret["type"].shape, ret["type"].dtype) + # print("natoms_vec", ret["natoms_vec"].shape, ret["natoms_vec"].dtype) + # print("box", ret["box"].shape, ret["box"].dtype) + # print("default_mesh", ret["default_mesh"].shape, ret["default_mesh"].dtype) + # # np.save("/workspace/hesensen/deepmd-kit/cuda_ext/coord.npy", ret["coord"]) + # # np.save("/workspace/hesensen/deepmd-kit/cuda_ext/type.npy", ret["type"]) + # # np.save("/workspace/hesensen/deepmd-kit/cuda_ext/natoms_vec.npy", ret["natoms_vec"]) + # # np.save("/workspace/hesensen/deepmd-kit/cuda_ext/box.npy", ret["box"]) + # # np.save("/workspace/hesensen/deepmd-kit/cuda_ext/default_mesh.npy", ret["default_mesh"]) + # yield ret + import paddle + + for ii in range(len(data.system_dirs)): + for jj in data.data_systems[ii].dirs: + data_set = data.data_systems[ii]._load_set(jj) + for kk in range(np.array(data_set["type"]).shape[0]): + coord = np.array(data_set["coord"])[kk].reshape( + [-1, data.natoms[ii] * 3] ) - self.min_nbor_dist = dt - var = np.max(mn, axis=0) - self.max_nbor_size = np.maximum(var, self.max_nbor_size) + coord = paddle.to_tensor( + coord, dtype="float32", place="cpu" + ) # [1, 576] + + _type = np.array(data_set["type"])[kk].reshape( + [-1, data.natoms[ii]] + ) + _type = paddle.to_tensor( + _type, dtype="int32", place="cpu" + ) # [1, 192] + + natoms_vec = np.array(data.natoms_vec[ii]) + natoms_vec = paddle.to_tensor( + natoms_vec, dtype="int64", place="cpu" + ) # [4] + + box = np.array(data_set["box"])[kk].reshape([-1, 9]) + box = paddle.to_tensor(box, dtype="float32", place="cpu") # [1, 9] + + default_mesh = np.array(data.default_mesh[ii]) + default_mesh = paddle.to_tensor( + default_mesh, dtype="int32", place="cpu" + ) # [6] + + rcut = self.rcut + mn, dt = op_module.neighbor_stat( + coord, + _type, + natoms_vec, + box, + default_mesh, + rcut, + ) + if dt.size != 0: + dt = paddle.min(dt).item() + else: + dt = self.rcut + log.warning( + "Atoms with no neighbors found in %s. Please make sure it's what you expected." + % jj + ) + if dt < self.min_nbor_dist: + if math.isclose(dt, 0.0, rel_tol=1e-6): + # it's unexpected that the distance between two atoms is zero + # zero distance will cause nan (#874) + raise RuntimeError( + "Some atoms are overlapping in %s. Please check your" + " training data to remove duplicated atoms." 
% jj + ) + self.min_nbor_dist = dt + var = paddle.max(mn, axis=0).numpy() + self.max_nbor_size = np.maximum(var, self.max_nbor_size) + + # for mn, dt, jj in self.p.generate(self.sub_sess, feed()): # _max_nbor_size, _min_nbor_dist, dir + # # print(mn.shape, dt.shape, jj) + # # np.save("/workspace/hesensen/deepmd-kit/cuda_ext/max_nbor_size.npy", mn) + # # np.save("/workspace/hesensen/deepmd-kit/cuda_ext/min_nbor_dist.npy", dt) + # if dt.size != 0: + # dt = np.min(dt) + # else: + # dt = self.rcut + # log.warning( + # "Atoms with no neighbors found in %s. Please make sure it's what you expected." + # % jj + # ) + # if dt < self.min_nbor_dist: + # if math.isclose(dt, 0.0, rel_tol=1e-6): + # # it's unexpected that the distance between two atoms is zero + # # zero distance will cause nan (#874) + # raise RuntimeError( + # "Some atoms are overlapping in %s. Please check your" + # " training data to remove duplicated atoms." % jj + # ) + # self.min_nbor_dist = dt + # var = np.max(mn, axis=0) + # self.max_nbor_size = np.maximum(var, self.max_nbor_size) log.info("training data with min nbor dist: " + str(self.min_nbor_dist)) log.info("training data with max nbor size: " + str(self.max_nbor_size)) diff --git a/deepmd/utils/network.py b/deepmd/utils/network.py index a718da0b26..8471964000 100644 --- a/deepmd/utils/network.py +++ b/deepmd/utils/network.py @@ -1,12 +1,11 @@ import numpy as np +from paddle import nn -from deepmd.common import ( - get_precision, -) -from deepmd.env import ( - GLOBAL_TF_FLOAT_PRECISION, - tf, -) +from deepmd.common import get_precision +from deepmd.env import GLOBAL_PD_FLOAT_PRECISION +from deepmd.env import GLOBAL_TF_FLOAT_PRECISION +from deepmd.env import paddle +from deepmd.env import tf def one_layer_rand_seed_shift(): @@ -296,3 +295,256 @@ def variable_summaries(var: tf.Variable, name: str): tf.summary.scalar("max", tf.reduce_max(var)) tf.summary.scalar("min", tf.reduce_min(var)) tf.summary.histogram("histogram", var) + + +class OneLayer(paddle.nn.Layer): + def __init__( + self, + in_features, + out_features, + activation_fn=paddle.nn.functional.tanh, + precision=GLOBAL_PD_FLOAT_PRECISION, + stddev=1.0, + bavg=0.0, + name="linear", + seed=None, + use_timestep=False, + trainable=True, + useBN=False, + ): + super(OneLayer, self).__init__(name) + self.out_features = out_features + self.activation_fn = activation_fn + self.use_timestep = use_timestep + self.useBN = useBN + self.seed = seed + paddle.seed(seed) + + self.weight = self.create_parameter( + shape=[in_features, out_features], + dtype=precision, + is_bias=False, + attr=paddle.ParamAttr(trainable=trainable), + default_initializer=paddle.nn.initializer.Normal( + std=stddev / np.sqrt(in_features + out_features) + ), + ) + # print(bavg, stddev) + self.bias = self.create_parameter( + shape=[out_features], + dtype=precision, + is_bias=True, + attr=paddle.ParamAttr(trainable=trainable), + default_initializer=paddle.nn.initializer.Normal( + mean=bavg if isinstance(bavg, float) else bavg[0], std=stddev + ), + ) + if self.activation_fn is not None and self.use_timestep: + self.idt = self.create_parameter( + shape=[out_features], + dtype=precision, + attr=paddle.ParamAttr(trainable=trainable), + default_initializer=paddle.nn.initializer.Normal(mean=0.1, std=0.001), + ) + + def forward(self, input): + hidden = paddle.matmul(input, self.weight) + self.bias + if self.activation_fn is not None: + if self.useBN: + None + # hidden_bn = self._batch_norm(hidden, name=name+'_normalization', reuse=reuse) + # return 
activation_fn(hidden_bn)
+            else:
+                if self.use_timestep:
+                    hidden = (
+                        paddle.reshape(
+                            self.activation_fn(hidden), [-1, self.out_features]
+                        )
+                        * self.idt
+                    )
+                else:
+                    hidden = paddle.reshape(
+                        self.activation_fn(hidden), [-1, self.out_features]
+                    )
+        return hidden
+
+
+class EmbeddingNet(paddle.nn.Layer):
+    """Parameters
+    ----------
+    xx : Tensor
+        Input tensor of shape [-1,1]
+    network_size: list of int
+        Size of the embedding network. For example [16,32,64]
+    precision:
+        Precision of network weights. For example, tf.float64
+    activation_fn:
+        Activation function
+    resnet_dt: boolean
+        Using time-step in the ResNet construction
+    name_suffix: str
+        The name suffix appended to each variable.
+    stddev: float
+        Standard deviation of initializing network parameters
+    bavg: float
+        Mean of network initial bias
+    seed: int
+        Random seed for initializing network parameters
+    trainable: boolean
+        If the network is trainable
+    """
+
+    def __init__(
+        self,
+        network_size,
+        precision,
+        activation_fn=paddle.nn.functional.tanh,
+        resnet_dt=False,
+        stddev=1.0,
+        bavg=0.0,
+        seed=42,
+        trainable=True,
+        name="",
+    ):
+        super().__init__(name)
+        self.name = name
+        self.outputs_size = [1] + network_size
+        self.activation_fn = activation_fn
+        self.resnet_dt = resnet_dt
+        self.seed = seed
+        paddle.seed(seed)
+
+        outputs_size = self.outputs_size
+        weight = []
+        bias = []
+        idt = []
+        for ii in range(1, len(outputs_size)):
+            weight.append(
+                self.create_parameter(
+                    shape=[outputs_size[ii - 1], outputs_size[ii]],
+                    dtype=precision,
+                    is_bias=False,
+                    attr=paddle.ParamAttr(trainable=trainable),
+                    default_initializer=paddle.nn.initializer.Normal(
+                        std=stddev / np.sqrt(outputs_size[ii] + outputs_size[ii - 1])
+                    ),
+                )
+            )
+            # print(outputs_size[ii-1], precision, False, trainable, outputs_size[ii]+outputs_size[ii-1])
+            # exit()
+            bias.append(
+                self.create_parameter(
+                    shape=[1, outputs_size[ii]],
+                    dtype=precision,
+                    is_bias=True,
+                    attr=paddle.ParamAttr(trainable=trainable),
+                    default_initializer=paddle.nn.initializer.Normal(
+                        mean=bavg, std=stddev
+                    ),
+                )
+            )
+            if resnet_dt:
+                idt.append(
+                    self.create_parameter(
+                        shape=[1, outputs_size[ii]],
+                        dtype=precision,
+                        attr=paddle.ParamAttr(trainable=trainable),
+                        default_initializer=paddle.nn.initializer.Normal(
+                            mean=0.1, std=0.001
+                        ),
+                    )
+                )
+
+        self.weight = paddle.nn.ParameterList(weight)
+        self.bias = paddle.nn.ParameterList(bias)
+        self.idt = paddle.nn.ParameterList(idt)
+
+    def forward(self, xx):
+        # outputs_size = self.outputs_size
+        # print(self.outputs_size)
+        # for ii in range(1, len(outputs_size)):
+        #     # if self.activation_fn is not None:
+        #     hidden = paddle.reshape(
+        #         self.activation_fn(paddle.matmul(xx, self.weight[ii-1]) + self.bias[ii-1]),
+        #         [-1, outputs_size[ii]]
+        #     )
+        #     # print(__file__, 1)
+        #     # else:
+        #     #     hidden = paddle.reshape(
+        #     #         paddle.matmul(xx, self.weight[ii-1]) + self.bias[ii-1],
+        #     #         [-1, outputs_size[ii]]
+        #     #     )
+        #     #     print(__file__, 2)
+
+        #     if outputs_size[ii] == outputs_size[ii - 1]:
+        #         if self.resnet_dt:
+        #             xx += hidden * self.idt[ii]
+        #             # print(__file__, 3)
+        #         else:
+        #             xx += hidden
+        #             # print(__file__, 4)
+        #     elif outputs_size[ii] == outputs_size[ii-1] * 2:
+        #         if self.resnet_dt:
+        #             xx = paddle.concat([xx,xx], axis=1) + hidden * self.idt[ii]
+        #             # print(__file__, 5)
+        #         else:
+        #             xx = paddle.concat([xx,xx], axis=1) + hidden
+        #             # print(__file__, 6)
+        #     else:
+        #         # print(__file__, 7)
+        #         xx = hidden
+        #     # exit()
+
+        # return xx
+        # if not hasattr(self, "xx1"):
+        #     self.xx1 = xx
+        #
paddle.save(self.xx1.numpy(), f"/workspace/hesensen/deepmd_backend/debug_emb/{self.name}_xx1.npy") + # paddle.save(self.weight[0].numpy(), f"/workspace/hesensen/deepmd_backend/debug_emb/{self.name}_weight_0.npy") + # paddle.save(self.bias[0].numpy(), f"/workspace/hesensen/deepmd_backend/debug_emb/{self.name}_bias_0.npy") + + hidden = nn.functional.tanh( + nn.functional.linear(xx, self.weight[0], self.bias[0]) + ).reshape( + [-1, 25] + ) # 1 + xx = hidden # 7 + + # if not hasattr(self, "hidden1"): + # self.hidden1 = hidden + # paddle.save(self.hidden1.numpy(), f"/workspace/hesensen/deepmd_backend/debug_emb/{self.name}_hidden1.npy") + + # if not hasattr(self, "xx2"): + # self.xx2 = xx + # paddle.save(self.xx2.numpy(), f"/workspace/hesensen/deepmd_backend/debug_emb/{self.name}_xx2.npy") + + hidden = nn.functional.tanh( + nn.functional.linear(xx, self.weight[1], self.bias[1]) + ).reshape( + [-1, 50] + ) # 1 + xx = paddle.concat([xx, xx], axis=1) + hidden # 6 + + # if not hasattr(self, "hidden2"): + # self.hidden2 = hidden + # paddle.save(self.hidden2.numpy(), f"/workspace/hesensen/deepmd_backend/debug_emb/{self.name}_hidden2.npy") + + # if not hasattr(self, "xx3"): + # self.xx3 = xx + # paddle.save(self.xx3.numpy(), f"/workspace/hesensen/deepmd_backend/debug_emb/{self.name}_xx3.npy") + + hidden = nn.functional.tanh( + nn.functional.linear(xx, self.weight[2], self.bias[2]) + ).reshape( + [-1, 100] + ) # 1 + xx = paddle.concat([xx, xx], axis=1) + hidden # 6 + + # if not hasattr(self, "hidden3"): + # self.hidden3 = hidden + # paddle.save(self.hidden3.numpy(), f"/workspace/hesensen/deepmd_backend/debug_emb/{self.name}_hidden3.npy") + + # if not hasattr(self, "xx4"): + # self.xx4 = xx + # paddle.save(self.xx4.numpy(), f"/workspace/hesensen/deepmd_backend/debug_emb/{self.name}_xx4.npy") + + return xx diff --git a/deepmd/utils/type_embed.py b/deepmd/utils/type_embed.py index 7a3e0925b8..807c3db431 100644 --- a/deepmd/utils/type_embed.py +++ b/deepmd/utils/type_embed.py @@ -1,22 +1,13 @@ -from typing import ( - List, - Optional, - Union, -) +from typing import List +from typing import Optional +from typing import Union -from deepmd.common import ( - get_activation_func, - get_precision, -) -from deepmd.env import ( - tf, -) -from deepmd.utils.graph import ( - get_type_embedding_net_variables_from_graph_def, -) -from deepmd.utils.network import ( - embedding_net, -) +from deepmd.common import get_activation_func +from deepmd.common import get_precision +from deepmd.env import paddle +from deepmd.env import tf +from deepmd.utils.graph import get_type_embedding_net_variables_from_graph_def +from deepmd.utils.network import embedding_net def embed_atom_type( @@ -47,15 +38,15 @@ def embed_atom_type( The embedded type of each atom. 
It has the shape of [numb_atoms, embedding_dim] """ - te_out_dim = type_embedding.get_shape().as_list()[-1] + te_out_dim = type_embedding.shape[-1] atype = [] for ii in range(ntypes): - atype.append(tf.tile([ii], [natoms[2 + ii]])) - atype = tf.concat(atype, axis=0) - atm_embed = tf.nn.embedding_lookup( - type_embedding, tf.cast(atype, dtype=tf.int32) + atype.append(paddle.tile([ii], [natoms[2 + ii]])) + atype = paddle.concat(atype, axis=0) + atm_embed = paddle.nn.functional.embedding( + paddle.cast(atype, dtype=paddle.int32), type_embedding ) # (nf*natom)*nchnl - atm_embed = tf.reshape(atm_embed, [-1, te_out_dim]) + atm_embed = paddle.reshape(atm_embed, [-1, te_out_dim]) return atm_embed diff --git a/source/lib/paddle_src/neighbor_stat.cu b/source/lib/paddle_src/neighbor_stat.cu new file mode 100644 index 0000000000..6754f3efc9 --- /dev/null +++ b/source/lib/paddle_src/neighbor_stat.cu @@ -0,0 +1,217 @@ +// #include +// #include +// #include +#include +#include +#include +#include "paddle/extension.h" + +#include "device.h" +#include "prod_virial.h" +#include "gpu_cuda.h" + +#include "paddle/extension.h" +#include "errors.h" +#include "neighbor_list.h" +#include "device.h" + +#undef PADDLE_WITH_CUDA +// #define CHECK_INPUT(x) PD_CHECK(x.is_gpu(), #x " must be a GPU Tensor.") +#define CHECK_INPUT_CPU(x) PD_CHECK(x.is_cpu(), #x " must be a CPU Tensor.") +#define CHECK_INPUT_DIM(x, value) PD_CHECK(x.shape().size() == value, #x "'s dim should be " #value ".") +// #define CHECK_INPUT_READY(x) PD_CHECK(x.IsInitialized(), #x " must be initialized before usage.") +typedef double boxtensor_t; +typedef double compute_t; + +std::vector NeighborStatOpCPUForward( + const paddle::Tensor& coord_tensor, + const paddle::Tensor& type_tensor, + const paddle::Tensor& natoms_tensor, + const paddle::Tensor& box_tensor, + const paddle::Tensor& mesh_tensor, + float rcut +) { + CHECK_INPUT_CPU(coord_tensor); + CHECK_INPUT_CPU(type_tensor); + CHECK_INPUT_CPU(natoms_tensor); + CHECK_INPUT_CPU(box_tensor); + CHECK_INPUT_CPU(mesh_tensor); + + CHECK_INPUT_DIM(coord_tensor, 2); + CHECK_INPUT_DIM(type_tensor, 2); + CHECK_INPUT_DIM(natoms_tensor, 1); + CHECK_INPUT_DIM(box_tensor, 2); + CHECK_INPUT_DIM(mesh_tensor, 1); + PD_CHECK(natoms_tensor.shape()[0] >= 3, "number of atoms should be larger than (or equal to) 3"); + + const int64_t* natoms = natoms_tensor.data(); + int64_t nloc = natoms[0]; + int64_t nall = natoms[1]; + int64_t nsamples = coord_tensor.shape()[0]; + int64_t ntypes = natoms_tensor.shape()[0] - 2; + + PD_CHECK(nsamples == type_tensor.shape()[0], "number of samples should match"); + PD_CHECK(nsamples == box_tensor.shape()[0], "number of samples should match"); + PD_CHECK(nall * 3 == coord_tensor.shape()[1], "number of atoms should match"); + PD_CHECK(nall == type_tensor.shape()[1], "number of atoms should match"); + PD_CHECK(9 == box_tensor.shape()[1], "number of box should be 9"); + + // std::cout << "1" << std::endl; + int nei_mode = 0; + if (mesh_tensor.shape()[0] == 6) { + // manual copied pbc + assert(nloc == nall); + nei_mode = 1; + } else if (mesh_tensor.shape()[0] == 0) { + // no pbc + nei_mode = -1; + } else { + throw deepmd::deepmd_exception("invalid mesh tensor"); + } + // if region is given extended, do not use pbc + bool b_pbc = (nei_mode >= 1 || nei_mode == -1) ? false : true; + bool b_norm_atom = (nei_mode == 1) ? 
true : false; + + std::vector max_nbor_size_shape = {nloc, ntypes}; + paddle::Tensor max_nbor_size_tensor = paddle::zeros( + max_nbor_size_shape, + type_tensor.dtype(), + type_tensor.place() + ); + // std::cout << "2" << std::endl; + + const auto* coord = coord_tensor.data(); + // std::cout << "3" << std::endl; + const auto* type = type_tensor.data(); + // std::cout << "4" << std::endl; + const auto* box = box_tensor.data(); + // std::cout << "5" << std::endl; + const auto* mesh = mesh_tensor.data(); + // std::cout << "6" << std::endl; + auto *max_nbor_size = max_nbor_size_tensor.mutable_data(); + // std::cout << "7" << std::endl; + + boxtensor_t boxt[9] = {0}; + for (int dd = 0; dd < 9; ++dd) { + boxt[dd] = box[dd]; + } + SimulationRegion region; + region.reinitBox(boxt); + // set & normalize coord + std::vector d_coord3(nall * 3); + for (int ii = 0; ii < nall; ++ii) { + for (int dd = 0; dd < 3; ++dd) { + d_coord3[ii * 3 + dd] = coord[ii * 3 + dd]; + } + if (b_norm_atom) { + compute_t inter[3]; + region.phys2Inter(inter, &d_coord3[3 * ii]); + for (int dd = 0; dd < 3; ++dd) { + if (inter[dd] < 0) + inter[dd] += 1.; + else if (inter[dd] >= 1) + inter[dd] -= 1.; + } + region.inter2Phys(&d_coord3[3 * ii], inter); + } + } + + // set type + std::vector d_type(nall); + for (int ii = 0; ii < nall; ++ii) d_type[ii] = type[ii]; + + // build nlist + std::vector > d_nlist_a; + std::vector > d_nlist_r; + std::vector nlist_map; + bool b_nlist_map = false; + + if (nei_mode == 1) { + // std::cout << "I'm in nei_mode 1" << std::endl; + std::vector bk_d_coord3 = d_coord3; + std::vector bk_d_type = d_type; + std::vector ncell, ngcell; + copy_coord(d_coord3, d_type, nlist_map, ncell, ngcell, bk_d_coord3, + bk_d_type, rcut, region); + b_nlist_map = true; + std::vector nat_stt(3, 0); + std::vector ext_stt(3), ext_end(3); + for (int dd = 0; dd < 3; ++dd) { + ext_stt[dd] = -ngcell[dd]; + ext_end[dd] = ncell[dd] + ngcell[dd]; + } + ::build_nlist(d_nlist_a, d_nlist_r, d_coord3, nloc, -1, rcut, nat_stt, + ncell, ext_stt, ext_end, region, ncell); + } else if (nei_mode == -1) { + ::build_nlist(d_nlist_a, d_nlist_r, d_coord3, -1, rcut, NULL); + } else { + throw deepmd::deepmd_exception("unknow neighbor mode"); + } + + int MAX_NNEI = 0; + for (int ii = 0; ii < nloc; ii++) { + MAX_NNEI = MAX_NNEI < d_nlist_r[ii].size() ? 
d_nlist_r[ii].size() : MAX_NNEI; + } + + // allocate output tensor for deepmd-kit + std::vector min_nbor_dist_shape = {nloc * MAX_NNEI}; + paddle::Tensor min_nbor_dist_tensor = paddle::full( + min_nbor_dist_shape, + 10000.0, + coord_tensor.dtype(), + coord_tensor.place() + ); + auto* min_nbor_dist = min_nbor_dist_tensor.mutable_data(); + +#pragma omp parallel for + for (int ii = 0; ii < nloc; ii++) { + if (d_type[ii] < 0) continue; // virtual atom + for (int jj = 0; jj < d_nlist_r[ii].size(); jj++) { + int type = d_type[d_nlist_r[ii][jj]]; + if (type < 0) continue; // virtual atom + max_nbor_size[ii * ntypes + type] += 1; + compute_t rij[3] = { + d_coord3[d_nlist_r[ii][jj] * 3 + 0] - d_coord3[ii * 3 + 0], + d_coord3[d_nlist_r[ii][jj] * 3 + 1] - d_coord3[ii * 3 + 1], + d_coord3[d_nlist_r[ii][jj] * 3 + 2] - d_coord3[ii * 3 + 2]}; + min_nbor_dist[ii * MAX_NNEI + jj] = + sqrt(rij[0] * rij[0] + rij[1] * rij[1] + rij[2] * rij[2]); + } + } + return {max_nbor_size_tensor, min_nbor_dist_tensor}; +} + + +std::vector NeighborStatForward( + const paddle::Tensor& coord_tensor, /*float32*/ + const paddle::Tensor& type_tensor, /*int32*/ + const paddle::Tensor& natoms_tensor, /*int64*/ + const paddle::Tensor& box_tensor, /*float32*/ + const paddle::Tensor& mesh_tensor, /*int32*/ + float rcut +) { + if (coord_tensor.is_cpu()) { + // std::cout << coord_tensor.dtype() << std::endl; + // std::cout << type_tensor.dtype() << std::endl; + // std::cout << natoms_tensor.dtype() << std::endl; + // std::cout << box_tensor.dtype() << std::endl; + // std::cout << mesh_tensor.dtype() << std::endl; + return NeighborStatOpCPUForward( + coord_tensor, + type_tensor, + natoms_tensor, + box_tensor, + mesh_tensor, + rcut + ); + } else { + PD_THROW("Unsupported device type for forward function of custom relu operator."); + } +} + + +PD_BUILD_OP(neighbor_stat) + .Inputs({"coord", "type", "natoms", "box", "mesh"}) + .Outputs({"max_nbor_size", "min_nbor_dist"}) + .Attrs({"rcut: float"}) + .SetKernelFn(PD_KERNEL(NeighborStatForward)); diff --git a/source/lib/paddle_src/prod_env_mat.cc b/source/lib/paddle_src/prod_env_mat.cc new file mode 100644 index 0000000000..7ebfd6cdc7 --- /dev/null +++ b/source/lib/paddle_src/prod_env_mat.cc @@ -0,0 +1,321 @@ +#include "prod_env_mat.h" + +#include + +#include +#include + +#include "env_mat.h" +#include "fmt_nlist.h" + +using namespace deepmd; + +template +void deepmd::prod_env_mat_a_cpu(FPTYPE *em, + FPTYPE *em_deriv, + FPTYPE *rij, + int *nlist, + const FPTYPE *coord, + const int *type, + const InputNlist &inlist, + const int max_nbor_size, + const FPTYPE *avg, + const FPTYPE *std, + const int nloc, + const int nall, + const float rcut, + const float rcut_smth, + const std::vector sec, + const int *f_type) { + if (f_type == NULL) { + f_type = type; + } + const int nnei = sec.back(); + const int nem = nnei * 4; + + // set & normalize coord + std::vector d_coord3(nall * 3); + for (int ii = 0; ii < nall; ++ii) { + for (int dd = 0; dd < 3; ++dd) { + d_coord3[ii * 3 + dd] = coord[ii * 3 + dd]; + } + } + + // set type + std::vector d_f_type(nall); + for (int ii = 0; ii < nall; ++ii) { + d_f_type[ii] = f_type[ii]; + } + + // build nlist + std::vector > d_nlist_a(nloc); + + assert(nloc == inlist.inum); + for (unsigned ii = 0; ii < nloc; ++ii) { + d_nlist_a[ii].reserve(max_nbor_size); + } + for (unsigned ii = 0; ii < nloc; ++ii) { + int i_idx = inlist.ilist[ii]; + for (unsigned jj = 0; jj < inlist.numneigh[ii]; ++jj) { + int j_idx = inlist.firstneigh[ii][jj]; + d_nlist_a[i_idx].push_back(j_idx); + } 
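Editor's note: the `#pragma omp` loop that follows standardizes every environment-matrix entry with per-type statistics. A hedged NumPy rendering of that normalization (shapes and values invented; `avg`/`std` stand for the davg/dstd tables passed into `prod_env_mat_a_cpu`):

```python
import numpy as np

nloc, nnei = 2, 3
nem = nnei * 4                           # 4 components per neighbor slot in se_a
raw_em = np.random.rand(nloc, nem)       # d_em_a computed per local atom
atype = np.array([0, 1])                 # type[ii] for each local atom
avg = np.zeros((2, nem))                 # one row of statistics per atom type
std = np.ones((2, nem))
em = (raw_em - avg[atype]) / std[atype]  # == the em[ii * nem + jj] assignment
```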
+ } + +#pragma omp parallel for + for (int ii = 0; ii < nloc; ++ii) { + std::vector fmt_nlist_a; + int ret = format_nlist_i_cpu(fmt_nlist_a, d_coord3, d_f_type, ii, + d_nlist_a[ii], rcut, sec); + std::vector d_em_a; + std::vector d_em_a_deriv; + std::vector d_em_r; + std::vector d_em_r_deriv; + std::vector d_rij_a; + env_mat_a_cpu(d_em_a, d_em_a_deriv, d_rij_a, d_coord3, d_f_type, ii, + fmt_nlist_a, sec, rcut_smth, rcut); + + // check sizes + assert(d_em_a.size() == nem); + assert(d_em_a_deriv.size() == nem * 3); + assert(d_rij_a.size() == nnei * 3); + assert(fmt_nlist_a.size() == nnei); + // record outputs + for (int jj = 0; jj < nem; ++jj) { + if (type[ii] >= 0) { + em[ii * nem + jj] = + (d_em_a[jj] - avg[type[ii] * nem + jj]) / std[type[ii] * nem + jj]; + } else { + em[ii * nem + jj] = 0; + } + } + for (int jj = 0; jj < nem * 3; ++jj) { + if (type[ii] >= 0) { + em_deriv[ii * nem * 3 + jj] = + d_em_a_deriv[jj] / std[type[ii] * nem + jj / 3]; + } else { + em_deriv[ii * nem * 3 + jj] = 0; + } + } + for (int jj = 0; jj < nnei * 3; ++jj) { + rij[ii * nnei * 3 + jj] = d_rij_a[jj]; + } + for (int jj = 0; jj < nnei; ++jj) { + nlist[ii * nnei + jj] = fmt_nlist_a[jj]; + } + } +} + +template +void deepmd::prod_env_mat_r_cpu(FPTYPE *em, + FPTYPE *em_deriv, + FPTYPE *rij, + int *nlist, + const FPTYPE *coord, + const int *type, + const InputNlist &inlist, + const int max_nbor_size, + const FPTYPE *avg, + const FPTYPE *std, + const int nloc, + const int nall, + const float rcut, + const float rcut_smth, + const std::vector sec) { + const int nnei = sec.back(); + const int nem = nnei * 1; + + // set & normalize coord + std::vector d_coord3(nall * 3); + for (int ii = 0; ii < nall; ++ii) { + for (int dd = 0; dd < 3; ++dd) { + d_coord3[ii * 3 + dd] = coord[ii * 3 + dd]; + } + } + + // set type + std::vector d_type(nall); + for (int ii = 0; ii < nall; ++ii) { + d_type[ii] = type[ii]; + } + + // build nlist + std::vector > d_nlist_a(nloc); + + assert(nloc == inlist.inum); + for (unsigned ii = 0; ii < nloc; ++ii) { + d_nlist_a[ii].reserve(max_nbor_size); + } + for (unsigned ii = 0; ii < nloc; ++ii) { + int i_idx = inlist.ilist[ii]; + for (unsigned jj = 0; jj < inlist.numneigh[ii]; ++jj) { + int j_idx = inlist.firstneigh[ii][jj]; + d_nlist_a[i_idx].push_back(j_idx); + } + } + +#pragma omp parallel for + for (int ii = 0; ii < nloc; ++ii) { + std::vector fmt_nlist_a; + int ret = format_nlist_i_cpu(fmt_nlist_a, d_coord3, d_type, ii, + d_nlist_a[ii], rcut, sec); + std::vector d_em_a; + std::vector d_em_a_deriv; + std::vector d_em_r; + std::vector d_em_r_deriv; + std::vector d_rij_a; + env_mat_r_cpu(d_em_a, d_em_a_deriv, d_rij_a, d_coord3, d_type, ii, + fmt_nlist_a, sec, rcut_smth, rcut); + + // check sizes + assert(d_em_a.size() == nem); + assert(d_em_a_deriv.size() == nem * 3); + assert(d_rij_a.size() == nnei * 3); + assert(fmt_nlist_a.size() == nnei); + // record outputs + for (int jj = 0; jj < nem; ++jj) { + em[ii * nem + jj] = (d_em_a[jj] - avg[d_type[ii] * nem + jj]) / + std[d_type[ii] * nem + jj]; + } + for (int jj = 0; jj < nem * 3; ++jj) { + em_deriv[ii * nem * 3 + jj] = + d_em_a_deriv[jj] / std[d_type[ii] * nem + jj / 3]; + } + for (int jj = 0; jj < nnei * 3; ++jj) { + rij[ii * nnei * 3 + jj] = d_rij_a[jj]; + } + for (int jj = 0; jj < nnei; ++jj) { + nlist[ii * nnei + jj] = fmt_nlist_a[jj]; + } + } +} + +template void deepmd::prod_env_mat_a_cpu(double *em, + double *em_deriv, + double *rij, + int *nlist, + const double *coord, + const int *type, + const InputNlist &inlist, + const int max_nbor_size, + 
const double *avg, + const double *std, + const int nloc, + const int nall, + const float rcut, + const float rcut_smth, + const std::vector sec, + const int *f_type); + +template void deepmd::prod_env_mat_a_cpu(float *em, + float *em_deriv, + float *rij, + int *nlist, + const float *coord, + const int *type, + const InputNlist &inlist, + const int max_nbor_size, + const float *avg, + const float *std, + const int nloc, + const int nall, + const float rcut, + const float rcut_smth, + const std::vector sec, + const int *f_type); + +template void deepmd::prod_env_mat_r_cpu(double *em, + double *em_deriv, + double *rij, + int *nlist, + const double *coord, + const int *type, + const InputNlist &inlist, + const int max_nbor_size, + const double *avg, + const double *std, + const int nloc, + const int nall, + const float rcut, + const float rcut_smth, + const std::vector sec); + +template void deepmd::prod_env_mat_r_cpu(float *em, + float *em_deriv, + float *rij, + int *nlist, + const float *coord, + const int *type, + const InputNlist &inlist, + const int max_nbor_size, + const float *avg, + const float *std, + const int nloc, + const int nall, + const float rcut, + const float rcut_smth, + const std::vector sec); + +// #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM +void deepmd::env_mat_nbor_update(InputNlist &inlist, + InputNlist &gpu_inlist, + int &max_nbor_size, + int *&nbor_list_dev, + const int *mesh, + const int size) { + int *mesh_host = new int[size]; + memcpy_device_to_host(mesh, mesh_host, size); + memcpy(&inlist.ilist, 4 + mesh_host, sizeof(int *)); + memcpy(&inlist.numneigh, 8 + mesh_host, sizeof(int *)); + memcpy(&inlist.firstneigh, 12 + mesh_host, sizeof(int **)); + const int ago = mesh_host[0]; + if (ago == 0 || gpu_inlist.inum < inlist.inum) { + const int inum = inlist.inum; + if (gpu_inlist.inum < inum) { + delete_device_memory(gpu_inlist.ilist); + delete_device_memory(gpu_inlist.numneigh); + delete_device_memory(gpu_inlist.firstneigh); + malloc_device_memory(gpu_inlist.ilist, inum); + malloc_device_memory(gpu_inlist.numneigh, inum); + malloc_device_memory(gpu_inlist.firstneigh, inum); + } + memcpy_host_to_device(gpu_inlist.ilist, inlist.ilist, inum); + memcpy_host_to_device(gpu_inlist.numneigh, inlist.numneigh, inum); + int _max_nbor_size = max_numneigh(inlist); + if (_max_nbor_size <= 256) { + _max_nbor_size = 256; + } else if (_max_nbor_size <= 512) { + _max_nbor_size = 512; + } else if (_max_nbor_size <= 1024) { + _max_nbor_size = 1024; + } else if (_max_nbor_size <= 2048) { + _max_nbor_size = 2048; + } else { + _max_nbor_size = 4096; + } + if (nbor_list_dev == NULL || _max_nbor_size > max_nbor_size || + inum > gpu_inlist.inum) { + delete_device_memory(nbor_list_dev); + malloc_device_memory(nbor_list_dev, inum * _max_nbor_size); + } + // update info + gpu_inlist.inum = inum; + max_nbor_size = _max_nbor_size; + + // copy nbor list from host to the device + std::vector nbor_list_host(inum * max_nbor_size, 0); + int **_firstneigh = (int **)malloc(sizeof(int *) * inum); + for (int ii = 0; ii < inum; ii++) { + _firstneigh[ii] = nbor_list_dev + ii * max_nbor_size; + for (int jj = 0; jj < inlist.numneigh[ii]; jj++) { + nbor_list_host[ii * max_nbor_size + jj] = inlist.firstneigh[ii][jj]; + } + } + memcpy_host_to_device(nbor_list_dev, &nbor_list_host[0], + inum * max_nbor_size); + memcpy_host_to_device(gpu_inlist.firstneigh, _firstneigh, inum); + free(_firstneigh); + } + delete[] mesh_host; +} +// #endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM diff --git 
a/source/lib/paddle_src/prod_env_mat.cu b/source/lib/paddle_src/prod_env_mat.cu new file mode 100644 index 0000000000..81270a0c81 --- /dev/null +++ b/source/lib/paddle_src/prod_env_mat.cu @@ -0,0 +1,1324 @@ +#include +#include +#include +#include "paddle/extension.h" + +#define GOOGLE_CUDA 1 + +#include +#include "utilities.h" +#include "coord.h" +#include "fmt_nlist.h" +#include "region.h" +#include "neighbor_list.h" +#include "prod_env_mat.h" +#include "gpu_cuda.h" +#include + +typedef long long int_64; + +#define CHECK_INPUT(x) PD_CHECK(x.is_gpu(), #x " must be a GPU Tensor.") +#define CHECK_INPUT_DIM(x, value) PD_CHECK(x.shape().size() == value, #x "'s dim should be " #value ".") +// #define CHECK_INPUT(x) PD_CHECK(x.is_cpu(), #x " must be a CPU Tensor.") + +__device__ inline double _sqrt(double x) { return sqrt(x); } +__device__ inline float _sqrt(float x) { return sqrtf(x); } +__device__ inline double _rsqrt(double x) { return rsqrt(x); } +__device__ inline float _rsqrt(float x) { return rsqrtf(x); } + +template +static int +_norm_copy_coord_gpu( + std::vector* tensor_list, + FPTYPE *&coord_cpy, + int *&type_cpy, + int *&idx_mapping, + int &nall, + int &mem_cpy, + const FPTYPE *coord, + const FPTYPE *box, + const int *type, + const int &nloc, + const int &max_cpy_trial, + const float &rcut_r); + +template +static int +_build_nlist_gpu( + std::vector *tensor_list, + int *&ilist, + int *&numneigh, + int **&firstneigh, + int *&jlist, + int &max_nnei, + int &mem_nnei, + const FPTYPE *coord, + const int &nloc, + const int &new_nall, + const int &max_nnei_trial, + const float &rcut_r); + +static void +_map_nlist_gpu( + int *nlist, + const int *idx_mapping, + const int &nloc, + const int &nnei); + +template +static void +_prepare_coord_nlist_gpu( + std::vector *tensor_list, + FPTYPE const **coord, + FPTYPE *&coord_cpy, + int const **type, + int *&type_cpy, + int *&idx_mapping, + deepmd::InputNlist &inlist, + int *&ilist, + int *&numneigh, + int **&firstneigh, + int *&jlist, + int *&nbor_list_dev, + int &new_nall, + int &mem_cpy, + int &mem_nnei, + int &max_nbor_size, + const FPTYPE *box, + const int *mesh_tensor_data, + const int mesh_tensor_size, + const int &nloc, + const int &nei_mode, + const float &rcut_r, + const int &max_cpy_trial, + const int &max_nnei_trial); + +template +__device__ inline uint_64 encoding_nbor_info(const int type, + const FPTYPE dist, + const int index) { + // nbor info checking: + // the type of nbor atom must be smaller than 128 + // the distance of center atom between nbor atom must be smaller than 128 + // the index of nbor atom(including ghost region) must be smaller than + // 16777216(1 << 24) + if (type >= 128 || dist >= (FPTYPE)128.0 || index >= (1 << 24)) { + asm("trap;"); + } + return ((uint_64)type << 57) + + (uint_64)((double)dist * ((uint_64)1 << 50)) / (1 << 24) * (1 << 24) + + index; +} + +__device__ inline void decoding_nbor_info(int& type, + int& index, + const uint_64 key) { + type = key >> 57; + index = key & 0xFFFFFF; +} + +template +__global__ void get_i_idx(FPTYPE* i_idx, const int nloc, const FPTYPE* ilist) { + const unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= nloc) { + return; + } + i_idx[ilist[idx]] = idx; +} + +// common part of prod_env_mat +template +__launch_bounds__(BLOCK_THREADS) __global__ + void BlockSortKernel(Key* d_in, + Key* d_out) // Tile of output +{ + enum { TILE_SIZE = BLOCK_THREADS * ITEMS_PER_THREAD }; + // Specialize BlockLoad type for our thread block (uses warp-striped loads for + // 
coalescing, then transposes in shared memory to a blocked arrangement) + typedef cub::BlockLoad + BlockLoadT; + // Specialize BlockRadixSort type for our thread block + typedef cub::BlockRadixSort + BlockRadixSortT; + // Shared memory + __shared__ union TempStorage { + typename BlockLoadT::TempStorage load; + typename BlockRadixSortT::TempStorage sort; + } temp_storage; + // Per-thread tile items + Key items[ITEMS_PER_THREAD]; + // Our current block's offset + int_64 block_offset = (int_64)blockIdx.x * TILE_SIZE; + // Load items into a blocked arrangement + BlockLoadT(temp_storage.load).Load(d_in + block_offset, items); + // Barrier for smem reuse + __syncthreads(); + // Sort keys + BlockRadixSortT(temp_storage.sort).SortBlockedToStriped(items); + // Store output in striped fashion + cub::StoreDirectStriped(threadIdx.x, d_out + block_offset, + items); +} + + +template +__device__ inline FPTYPE dev_dot(FPTYPE* arr1, FPTYPE* arr2) { + return arr1[0] * arr2[0] + arr1[1] * arr2[1] + arr1[2] * arr2[2]; +} + +template +__device__ inline void spline5_switch( + FPTYPE& vv, FPTYPE& dd, FPTYPE& xx, const float& rmin, const float& rmax) { + if (xx < rmin) { + dd = (FPTYPE)0.; + vv = (FPTYPE)1.; + } else if (xx < rmax) { + FPTYPE uu = (xx - rmin) / (rmax - rmin); + FPTYPE du = (FPTYPE)1. / (rmax - rmin); + vv = uu * uu * uu * + ((FPTYPE)-6. * uu * uu + (FPTYPE)15. * uu - (FPTYPE)10.) + + (FPTYPE)1.; + dd = ((FPTYPE)3. * uu * uu * + ((FPTYPE)-6. * uu * uu + (FPTYPE)15. * uu - (FPTYPE)10.) + + uu * uu * uu * ((FPTYPE)-12. * uu + (FPTYPE)15.)) * + du; + } else { + dd = (FPTYPE)0.; + vv = (FPTYPE)0.; + } +} + +template +__global__ void format_nlist_fill_a(uint_64* key, + const FPTYPE* coord, + const int* type, + const int* numneigh, + int** firstneigh, + const float rcut, + int* i_idx, + const int MAX_NBOR_SIZE) { + // <<>> + const int_64 idx = blockIdx.x; + const unsigned int idy = blockIdx.y * blockDim.y + threadIdx.y; + + const int nsize = numneigh[i_idx[idx]]; + if (idy >= nsize) { + return; + } + + const int* nei_idx = firstneigh[i_idx[idx]]; + // dev_copy(nei_idx, &jlist[jrange[i_idx]], nsize); + uint_64* key_in = key + idx * MAX_NBOR_SIZE; + FPTYPE diff[3]; + const int& j_idx = nei_idx[idy]; + if (type[j_idx] < 0) return; + for (int dd = 0; dd < 3; dd++) { + diff[dd] = coord[j_idx * 3 + dd] - coord[idx * 3 + dd]; + } + FPTYPE rr = _sqrt(dev_dot(diff, diff)); + if (rr <= rcut) { + key_in[idy] = encoding_nbor_info(type[j_idx], rr, j_idx); + } +} + +template +__global__ void fill_nei_iter(int* nei_iter_dev, + const FPTYPE* key, + const int nloc, + const int max_nbor_size, + const int sec_size) { + int_64 row = blockIdx.x; + int col = blockIdx.y * blockDim.x + threadIdx.x; + const FPTYPE* key_out = key + nloc * max_nbor_size + row * max_nbor_size; + int nei_type_cur = -1, nbor_idx_cur = 0; + int nei_type_pre = -1, nbor_idx_pre = 0; + if (col < max_nbor_size && key_out[col] != key_out[max_nbor_size - 1]) { + if (col >= 1) + decoding_nbor_info(nei_type_pre, nbor_idx_pre, key_out[col - 1]); + decoding_nbor_info(nei_type_cur, nbor_idx_cur, key_out[col]); + } + if (nei_type_cur != nei_type_pre) { + nei_iter_dev[row * sec_size + nei_type_cur] = col; + } +} + +template +__global__ void format_nlist_fill_b(int* nlist, + const int nlist_size, + const int nloc, + FPTYPE* key, + const int* sec, + const int sec_size, + int* nei_iter_dev, + const int max_nbor_size) { + int_64 row = blockIdx.x; + int col = blockIdx.y * blockDim.x + threadIdx.x; + int* nei_iter = nei_iter_dev + row * sec_size; + FPTYPE* key_out = key 
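// The switching function used throughout this file is the standard quintic
// smoothing s(u) = u^3 * (-6 u^2 + 15 u - 10) + 1 with
// u = (r - rmin) / (rmax - rmin): it equals 1 at r = rmin, 0 at r = rmax, and
// has vanishing first and second derivatives at both ends. Host-side
// reference (hypothetical helper):
static double spline5_ref(double r, double rmin, double rmax) {
  if (r < rmin) return 1.0;
  if (r >= rmax) return 0.0;
  const double u = (r - rmin) / (rmax - rmin);
  return u * u * u * (-6.0 * u * u + 15.0 * u - 10.0) + 1.0;
}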
+ nloc * max_nbor_size + row * max_nbor_size; + int* row_nlist = nlist + row * nlist_size; + if (col < max_nbor_size) { + if (key_out[col] != key_out[max_nbor_size - 1]) { + int nei_type = 0, nbor_idx = 0; + decoding_nbor_info(nei_type, nbor_idx, key_out[col]); + int out_indx = col - nei_iter[nei_type] + sec[nei_type]; + if (out_indx < sec[nei_type + 1]) { + row_nlist[out_indx] = nbor_idx; + } + } + } +} + +template +__global__ void encoding_decoding_nbor_info(uint_64* key, + int* out_type, + int* out_index, + const int* in_type, + const FPTYPE* in_dist, + const int* in_index, + const int size_of_array) { + const unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= size_of_array) { + return; + } + + key[idx] = encoding_nbor_info(in_type[idx], in_dist[idx], in_index[idx]); + decoding_nbor_info(out_type[idx], out_index[idx], key[idx]); +} + +template +void format_nbor_list_256(uint_64* key, + const FPTYPE* coord, + const int* type, + const deepmd::InputNlist& gpu_inlist, + const int& nloc, + const float& rcut, + int* i_idx) { + const int LEN = 256; + const int MAX_NBOR_SIZE = 256; + const int nblock = (MAX_NBOR_SIZE + LEN - 1) / LEN; + dim3 block_grid(nloc, nblock); + dim3 thread_grid(1, LEN); + format_nlist_fill_a<<>>( + key, coord, type, gpu_inlist.numneigh, gpu_inlist.firstneigh, rcut, i_idx, + MAX_NBOR_SIZE); + DPErrcheck(cudaGetLastError()); + DPErrcheck(cudaDeviceSynchronize()); + const int ITEMS_PER_THREAD = 4; + const int BLOCK_THREADS = MAX_NBOR_SIZE / ITEMS_PER_THREAD; + // BlockSortKernel<<>> ( + BlockSortKernel + <<>>(key, key + nloc * MAX_NBOR_SIZE); + DPErrcheck(cudaGetLastError()); + DPErrcheck(cudaDeviceSynchronize()); +} + +template +void format_nbor_list_512(uint_64* key, + const FPTYPE* coord, + const int* type, + const deepmd::InputNlist& gpu_inlist, + const int& nloc, + const float& rcut, + int* i_idx) { + const int LEN = 256; + const int MAX_NBOR_SIZE = 512; + const int nblock = (MAX_NBOR_SIZE + LEN - 1) / LEN; + dim3 block_grid(nloc, nblock); + dim3 thread_grid(1, LEN); + format_nlist_fill_a<<>>( + key, coord, type, gpu_inlist.numneigh, gpu_inlist.firstneigh, rcut, i_idx, + MAX_NBOR_SIZE); + DPErrcheck(cudaGetLastError()); + DPErrcheck(cudaDeviceSynchronize()); + const int ITEMS_PER_THREAD = 4; + const int BLOCK_THREADS = MAX_NBOR_SIZE / ITEMS_PER_THREAD; + // BlockSortKernel<<>> ( + BlockSortKernel + <<>>(key, key + nloc * MAX_NBOR_SIZE); + DPErrcheck(cudaGetLastError()); + DPErrcheck(cudaDeviceSynchronize()); +} + +template +void format_nbor_list_1024(uint_64* key, + const FPTYPE* coord, + const int* type, + const deepmd::InputNlist& gpu_inlist, + const int& nloc, + const float& rcut, + int* i_idx) { + const int LEN = 256; + const int MAX_NBOR_SIZE = 1024; + const int nblock = (MAX_NBOR_SIZE + LEN - 1) / LEN; + dim3 block_grid(nloc, nblock); + dim3 thread_grid(1, LEN); + format_nlist_fill_a<<>>( + key, coord, type, gpu_inlist.numneigh, gpu_inlist.firstneigh, rcut, i_idx, + MAX_NBOR_SIZE); + DPErrcheck(cudaGetLastError()); + DPErrcheck(cudaDeviceSynchronize()); + const int ITEMS_PER_THREAD = 8; + const int BLOCK_THREADS = MAX_NBOR_SIZE / ITEMS_PER_THREAD; + // BlockSortKernel<<>> ( + BlockSortKernel + <<>>(key, key + nloc * MAX_NBOR_SIZE); + DPErrcheck(cudaGetLastError()); + DPErrcheck(cudaDeviceSynchronize()); +} + +template +void format_nbor_list_2048(uint_64* key, + const FPTYPE* coord, + const int* type, + const deepmd::InputNlist& gpu_inlist, + const int& nloc, + const float& rcut, + int* i_idx) { + const int LEN = 256; + const int MAX_NBOR_SIZE 
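// CPU reference for format_nlist_fill_b above: after the radix sort,
// neighbors of type t occupy a contiguous run of columns starting at
// nei_iter[t], and each is copied into output slot col - nei_iter[t] + sec[t];
// anything past the per-type capacity sec[t+1] is silently truncated to the
// selection size. Hypothetical sketch over already-sorted host vectors:
#include <vector>
static void fill_sectioned_nlist(std::vector<int> &row_nlist,  // sec.back() slots, pre-filled with -1
                                 const std::vector<int> &sorted_type,
                                 const std::vector<int> &sorted_idx,
                                 const std::vector<int> &sec) {
  std::vector<int> first_col(sec.size() - 1, -1);
  for (int col = 0; col < (int)sorted_type.size(); ++col)
    if (first_col[sorted_type[col]] < 0) first_col[sorted_type[col]] = col;
  for (int col = 0; col < (int)sorted_type.size(); ++col) {
    const int t = sorted_type[col];
    const int out = col - first_col[t] + sec[t];
    if (out < sec[t + 1]) row_nlist[out] = sorted_idx[col];
  }
}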
= 2048; + const int nblock = (MAX_NBOR_SIZE + LEN - 1) / LEN; + dim3 block_grid(nloc, nblock); + dim3 thread_grid(1, LEN); + format_nlist_fill_a<<>>( + key, coord, type, gpu_inlist.numneigh, gpu_inlist.firstneigh, rcut, i_idx, + MAX_NBOR_SIZE); + DPErrcheck(cudaGetLastError()); + DPErrcheck(cudaDeviceSynchronize()); + const int ITEMS_PER_THREAD = 8; + const int BLOCK_THREADS = MAX_NBOR_SIZE / ITEMS_PER_THREAD; + // BlockSortKernel<<>> ( + BlockSortKernel + <<>>(key, key + nloc * MAX_NBOR_SIZE); + DPErrcheck(cudaGetLastError()); + DPErrcheck(cudaDeviceSynchronize()); +} + +template +void format_nbor_list_4096(uint_64* key, + const FPTYPE* coord, + const int* type, + const deepmd::InputNlist& gpu_inlist, + const int& nloc, + const float& rcut, + int* i_idx) { + const int LEN = 256; + const int MAX_NBOR_SIZE = 4096; + const int nblock = (MAX_NBOR_SIZE + LEN - 1) / LEN; + dim3 block_grid(nloc, nblock); + dim3 thread_grid(1, LEN); + format_nlist_fill_a<<>>( + key, coord, type, gpu_inlist.numneigh, gpu_inlist.firstneigh, rcut, i_idx, + MAX_NBOR_SIZE); + DPErrcheck(cudaGetLastError()); + DPErrcheck(cudaDeviceSynchronize()); + const int ITEMS_PER_THREAD = 16; + const int BLOCK_THREADS = MAX_NBOR_SIZE / ITEMS_PER_THREAD; + // BlockSortKernel<<>> ( + BlockSortKernel + <<>>(key, key + nloc * MAX_NBOR_SIZE); + DPErrcheck(cudaGetLastError()); + DPErrcheck(cudaDeviceSynchronize()); +} + + +template +__global__ void compute_env_mat_a(FPTYPE* em, + FPTYPE* em_deriv, + FPTYPE* rij, + const FPTYPE* coord, + const FPTYPE* avg, + const FPTYPE* std, + const int* type, + const int* nlist, + const int nnei, + const float rmin, + const float rmax) { + // <<>> + const int_64 bid = blockIdx.x; + const unsigned int tid = threadIdx.x; + if (type[bid] < 0) return; + if (tid >= nnei) { + return; + } + const int ndescrpt = nnei * 4; + const int* row_nlist = nlist + bid * nnei; + FPTYPE* row_rij = rij + bid * nnei * 3; + FPTYPE* row_descript = em + bid * nnei * 4; + FPTYPE* row_descript_deriv = em_deriv + bid * nnei * 12; + for (int ii = tid; ii < nnei; ii += THREADS_PER_BLOCK) { + const int idx_value = ii * 4; // 4 components + const int idx_deriv = ii * 12; // 4 components time 3 directions + if (row_nlist[ii] >= 0) { + FPTYPE rr[3] = {0}; + FPTYPE dd[4] = {0}; + FPTYPE vv[12] = {0}; + const int j_idx = row_nlist[ii]; + for (int kk = 0; kk < 3; kk++) { + rr[kk] = coord[j_idx * 3 + kk] - coord[bid * 3 + kk]; + row_rij[ii * 3 + kk] = rr[kk]; + } + // const FPTYPE * rr = &row_rij[ii * 3]; + FPTYPE nr2 = dev_dot(rr, rr); + FPTYPE inr = _rsqrt(nr2); + FPTYPE nr = nr2 * inr; + FPTYPE inr2 = inr * inr; + FPTYPE inr4 = inr2 * inr2; + FPTYPE inr3 = inr4 * nr; + FPTYPE sw, dsw; + spline5_switch(sw, dsw, nr, rmin, rmax); + dd[0] = ((FPTYPE)1. / nr); //* sw; + dd[1] = (rr[0] / nr2); //* sw; + dd[2] = (rr[1] / nr2); //* sw; + dd[3] = (rr[2] / nr2); //* sw; + vv[0] = (rr[0] * inr3 * sw - + dd[0] * dsw * rr[0] * + inr); // avg[type[(idx_deriv + 0) / (ndescrpt * 3)] * + // ndescrpt + ((idx_deriv + 0) % (ndescrpt * 3)) / 3]; + vv[1] = (rr[1] * inr3 * sw - + dd[0] * dsw * rr[1] * + inr); // avg[type[(idx_deriv + 1) / (ndescrpt * 3)] * + // ndescrpt + ((idx_deriv + 1) % (ndescrpt * 3)) / 3]; + vv[2] = (rr[2] * inr3 * sw - + dd[0] * dsw * rr[2] * + inr); // avg[type[(idx_deriv + 2) / (ndescrpt * 3)] * + // ndescrpt + ((idx_deriv + 2) % (ndescrpt * 3)) / 3]; + // ****deriv of component x/r2 + vv[3] = (((FPTYPE)2. 
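// The five format_nbor_list_* variants differ only in sort-tile geometry;
// ITEMS_PER_THREAD grows with the bucket so that BLOCK_THREADS
// (= MAX_NBOR_SIZE / ITEMS_PER_THREAD) stays within 256 threads per block.
// Summary table (derived from the constants in the kernels above):
struct SortGeometry { int bucket, items_per_thread, block_threads; };
constexpr SortGeometry kSortGeometry[] = {{256, 4, 64},   {512, 4, 128},
                                          {1024, 8, 128}, {2048, 8, 256},
                                          {4096, 16, 256}};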
* rr[0] * rr[0] * inr4 - inr2) * sw - + dd[1] * dsw * rr[0] * + inr); // avg[type[(idx_deriv + 3) / (ndescrpt * 3)] * + // ndescrpt + ((idx_deriv + 3) % (ndescrpt * 3)) / 3]; + vv[4] = (((FPTYPE)2. * rr[0] * rr[1] * inr4) * sw - + dd[1] * dsw * rr[1] * + inr); // avg[type[(idx_deriv + 4) / (ndescrpt * 3)] * + // ndescrpt + ((idx_deriv + 4) % (ndescrpt * 3)) / 3]; + vv[5] = (((FPTYPE)2. * rr[0] * rr[2] * inr4) * sw - + dd[1] * dsw * rr[2] * + inr); // avg[type[(idx_deriv + 5) / (ndescrpt * 3)] * + // ndescrpt + ((idx_deriv + 5) % (ndescrpt * 3)) / 3]; + // ***deriv of component y/r2 + vv[6] = (((FPTYPE)2. * rr[1] * rr[0] * inr4) * sw - + dd[2] * dsw * rr[0] * + inr); // avg[type[(idx_deriv + 6) / (ndescrpt * 3)] * + // ndescrpt + ((idx_deriv + 6) % (ndescrpt * 3)) / 3]; + vv[7] = (((FPTYPE)2. * rr[1] * rr[1] * inr4 - inr2) * sw - + dd[2] * dsw * rr[1] * + inr); // avg[type[(idx_deriv + 7) / (ndescrpt * 3)] * + // ndescrpt + ((idx_deriv + 7) % (ndescrpt * 3)) / 3]; + vv[8] = (((FPTYPE)2. * rr[1] * rr[2] * inr4) * sw - + dd[2] * dsw * rr[2] * + inr); // avg[type[(idx_deriv + 8) / (ndescrpt * 3)] * + // ndescrpt + ((idx_deriv + 8) % (ndescrpt * 3)) / 3]; + // ***deriv of component z/r2 + vv[9] = (((FPTYPE)2. * rr[2] * rr[0] * inr4) * sw - + dd[3] * dsw * rr[0] * + inr); // avg[type[(idx_deriv + 9) / (ndescrpt * 3)] * + // ndescrpt + ((idx_deriv + 9) % (ndescrpt * 3)) / 3]; + vv[10] = + (((FPTYPE)2. * rr[2] * rr[1] * inr4) * sw - + dd[3] * dsw * rr[1] * + inr); // avg[type[(idx_deriv + 10) / (ndescrpt * 3)] * ndescrpt + // + ((idx_deriv + 10) % (ndescrpt * 3)) / 3]; + vv[11] = + (((FPTYPE)2. * rr[2] * rr[2] * inr4 - inr2) * sw - + dd[3] * dsw * rr[2] * + inr); // avg[type[(idx_deriv + 11) / (ndescrpt * 3)] * ndescrpt + // + ((idx_deriv + 11) % (ndescrpt * 3)) / 3]; + // 4 value components + dd[0] *= sw; // * em[idx * ndescrpt + idx_value + 0]);// - avg[type[idx] + // * ndescrpt + idx_value + 0]) / std[type[idx] * ndescrpt + + // idx_value + 0]; + dd[1] *= sw; // * em[idx * ndescrpt + idx_value + 1]);// - avg[type[idx] + // * ndescrpt + idx_value + 1]) / std[type[idx] * ndescrpt + + // idx_value + 1]; + dd[2] *= sw; // * em[idx * ndescrpt + idx_value + 2]);// - avg[type[idx] + // * ndescrpt + idx_value + 2]) / std[type[idx] * ndescrpt + + // idx_value + 2]; + dd[3] *= sw; // * em[idx * ndescrpt + idx_value + 3]);// - avg[type[idx] + // * ndescrpt + idx_value + 3]) / std[type[idx] * ndescrpt + + // idx_value + 3]; + for (int ii = 0; ii < 12; ii++) { + row_descript_deriv[idx_deriv + ii] = + vv[ii] / std[type[bid] * ndescrpt + idx_value + ii / 3]; + } + for (int ii = 0; ii < 4; ii++) { + row_descript[idx_value + ii] = + (dd[ii] - avg[type[bid] * ndescrpt + idx_value + ii]) / + std[type[bid] * ndescrpt + idx_value + ii]; + } + } else { + // TODO: move it to the memset. 
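// CPU reference for the four se_a descriptor channels filled above: each
// in-range neighbor contributes sw(r) * (1/r, x/r^2, y/r^2, z/r^2), which is
// then standardized per type and slot as (d - avg) / std; the 12 derivative
// entries are the (x, y, z) gradients of those four channels divided by std
// only. Scalar sketch for a single pair (hypothetical helper):
#include <cmath>
static void se_a_channels(const double rr[3], double sw, double out[4]) {
  const double r2 = rr[0] * rr[0] + rr[1] * rr[1] + rr[2] * rr[2];
  const double r = std::sqrt(r2);
  out[0] = sw / r;           // radial channel 1/r
  out[1] = sw * rr[0] / r2;  // angular channel x/r^2
  out[2] = sw * rr[1] / r2;  // angular channel y/r^2
  out[3] = sw * rr[2] / r2;  // angular channel z/r^2
}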
+ row_descript[idx_value] -= avg[type[bid] * ndescrpt + idx_value] / + std[type[bid] * ndescrpt + idx_value]; + } + } +} + +template +__global__ void compute_env_mat_r(FPTYPE* em, + FPTYPE* em_deriv, + FPTYPE* rij, + const FPTYPE* coord, + const FPTYPE* avg, + const FPTYPE* std, + const int* type, + const int* nlist, + const int nnei, + const float rmin, + const float rmax) { + // <<>> + const int_64 bid = blockIdx.x; + const unsigned int tid = threadIdx.x; + if (tid >= nnei) { + return; + } + const int ndescrpt = nnei; + const int* row_nlist = nlist + bid * nnei; + FPTYPE* row_rij = rij + bid * nnei * 3; + FPTYPE* row_em = em + bid * nnei; + FPTYPE* row_em_deriv = em_deriv + bid * nnei * 3; + for (int ii = tid; ii < nnei; ii += THREADS_PER_BLOCK) { + const int idx_value = ii; // 4 components + const int idx_deriv = ii * 3; // 4 components time 3 directions + if (row_nlist[ii] >= 0) { + FPTYPE rr[3] = {0}; + FPTYPE vv[3] = {0}; + FPTYPE dd = 0; + const int& j_idx = row_nlist[ii]; + for (int kk = 0; kk < 3; kk++) { + rr[kk] = coord[j_idx * 3 + kk] - coord[bid * 3 + kk]; + row_rij[ii * 3 + kk] = rr[kk]; + } + // const FPTYPE * rr = &row_rij[ii * 3]; + FPTYPE nr2 = dev_dot(rr, rr); + FPTYPE inr = _rsqrt(nr2); + FPTYPE nr = nr2 * inr; + FPTYPE inr2 = inr * inr; + FPTYPE inr4 = inr2 * inr2; + FPTYPE inr3 = inr4 * nr; + FPTYPE sw, dsw; + spline5_switch(sw, dsw, nr, rmin, rmax); + dd = ((FPTYPE)1. / nr); //* sw; + vv[0] = (rr[0] * inr3 * sw - + dd * dsw * rr[0] * + inr); // avg[type[(idx_deriv + 0) / (ndescrpt * 3)] * + // ndescrpt + ((idx_deriv + 0) % (ndescrpt * 3)) / 3]; + vv[1] = (rr[1] * inr3 * sw - + dd * dsw * rr[1] * + inr); // avg[type[(idx_deriv + 1) / (ndescrpt * 3)] * + // ndescrpt + ((idx_deriv + 1) % (ndescrpt * 3)) / 3]; + vv[2] = (rr[2] * inr3 * sw - + dd * dsw * rr[2] * + inr); // avg[type[(idx_deriv + 2) / (ndescrpt * 3)] * + // ndescrpt + ((idx_deriv + 2) % (ndescrpt * 3)) / 3]; + + // 4 value components + dd *= sw; // * em[idx * ndescrpt + idx_value + 0]);// - avg[type[idx] * + // ndescrpt + idx_value + 0]) / std[type[idx] * ndescrpt + + // idx_value + 0]; + for (int ii = 0; ii < 3; ii++) { + row_em_deriv[idx_deriv + ii] = + vv[ii] / std[type[bid] * ndescrpt + idx_value + ii / 3]; + } + row_em[idx_value] = (dd - avg[type[bid] * ndescrpt + idx_value]) / + std[type[bid] * ndescrpt + idx_value]; + } else { + // TODO: move it to the memset. 
+ row_em[idx_value] -= avg[type[bid] * ndescrpt + idx_value] / + std[type[bid] * ndescrpt + idx_value]; + } + } +} + +namespace deepmd { +template +void format_nbor_list_gpu_cuda(int* nlist, + const FPTYPE* coord, + const int* type, + const InputNlist& gpu_inlist, + int* array_int, + uint_64* array_longlong, + const int max_nbor_size, + const int nloc, + const int nall, + const float rcut, + const std::vector sec) { + const int LEN = 256; + const int nnei = sec.back(); + const int nblock = (nloc + LEN - 1) / LEN; + int* sec_dev = array_int; + int* nei_iter = array_int + sec.size(); // = new int[sec_size]; + int* i_idx = array_int + sec.size() + nloc * sec.size(); + uint_64* key = array_longlong; + assert(max_nbor_size == 256 || max_nbor_size == 512 || + max_nbor_size == 1024 || max_nbor_size == 2048 || + max_nbor_size == 4096); + DPErrcheck(cudaMemset(nlist, -1, sizeof(int) * int_64(nloc) * nnei)); + DPErrcheck(cudaMemset(key, 0xffffffff, + sizeof(uint_64) * int_64(nloc) * max_nbor_size)); + DPErrcheck(cudaMemcpy(sec_dev, &sec[0], sizeof(int) * sec.size(), + cudaMemcpyHostToDevice)); + + get_i_idx<<>>(i_idx, nloc, gpu_inlist.ilist); + DPErrcheck(cudaGetLastError()); + DPErrcheck(cudaDeviceSynchronize()); + + if (max_nbor_size == 256) { + format_nbor_list_256(key, coord, type, gpu_inlist, nloc, rcut, i_idx); + } else if (max_nbor_size == 512) { + format_nbor_list_512(key, coord, type, gpu_inlist, nloc, rcut, i_idx); + } else if (max_nbor_size == 1024) { + format_nbor_list_1024(key, coord, type, gpu_inlist, nloc, rcut, i_idx); + } else if (max_nbor_size == 2048) { + format_nbor_list_2048(key, coord, type, gpu_inlist, nloc, rcut, i_idx); + } else if (max_nbor_size == 4096) { + format_nbor_list_4096(key, coord, type, gpu_inlist, nloc, rcut, i_idx); + } + + fill_nei_iter<<>>( + nei_iter, key, nloc, max_nbor_size, sec.size()); + + format_nlist_fill_b<<>>( + nlist, nnei, nloc, key, sec_dev, sec.size(), nei_iter, max_nbor_size); + DPErrcheck(cudaGetLastError()); + DPErrcheck(cudaDeviceSynchronize()); +} +} + +namespace deepmd { + +template +void prod_env_mat_a_gpu_cuda(FPTYPE* em, + FPTYPE* em_deriv, + FPTYPE* rij, + int* nlist, + const FPTYPE* coord, + const int* type, + const InputNlist& gpu_inlist, + int* array_int, + uint_64* array_longlong, + const int max_nbor_size, + const FPTYPE* avg, + const FPTYPE* std, + const int nloc, + const int nall, + const float rcut, + const float rcut_smth, + const std::vector sec, + const int* f_type) { + if (f_type == NULL) { + f_type = type; + } + const int nnei = sec.back(); + const int ndescrpt = nnei * 4; + DPErrcheck(cudaMemset(em, 0, sizeof(FPTYPE) * int_64(nloc) * ndescrpt)); + DPErrcheck( + cudaMemset(em_deriv, 0, sizeof(FPTYPE) * int_64(nloc) * ndescrpt * 3)); + DPErrcheck(cudaMemset(rij, 0, sizeof(FPTYPE) * int_64(nloc) * nnei * 3)); + + format_nbor_list_gpu_cuda(nlist, coord, f_type, gpu_inlist, array_int, + array_longlong, max_nbor_size, nloc, nall, rcut, + sec); + nborErrcheck(cudaGetLastError()); + nborErrcheck(cudaDeviceSynchronize()); + + compute_env_mat_a<<>>( + em, em_deriv, rij, coord, avg, std, type, nlist, nnei, rcut_smth, rcut); + DPErrcheck(cudaGetLastError()); + DPErrcheck(cudaDeviceSynchronize()); +} + +template +void prod_env_mat_r_gpu_cuda(FPTYPE* em, + FPTYPE* em_deriv, + FPTYPE* rij, + int* nlist, + const FPTYPE* coord, + const int* type, + const deepmd::InputNlist& gpu_inlist, + int* array_int, + uint_64* array_longlong, + const int max_nbor_size, + const FPTYPE* avg, + const FPTYPE* std, + const int nloc, + const int nall, + 
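// Scratch-buffer layout consumed by format_nbor_list_gpu_cuda above:
// array_int is partitioned as [sec | nei_iter (nloc x sec.size()) | i_idx],
// and array_longlong holds one 64-bit key per (atom, slot) pair twice over,
// for the unsorted and sorted halves. Hypothetical sizing helpers:
#include <cstddef>
static size_t array_int_size(int nloc, size_t sec_size) {
  return sec_size + (size_t)nloc * sec_size + (size_t)nloc;
}
static size_t array_longlong_size(int nloc, int max_nbor_size) {
  return (size_t)nloc * (size_t)max_nbor_size * 2;  // keys + sorted keys
}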
const float rcut, + const float rcut_smth, + const std::vector sec) { + const int nnei = sec.back(); + const int ndescrpt = nnei * 1; + DPErrcheck(cudaMemset(em, 0, sizeof(FPTYPE) * int_64(nloc) * ndescrpt)); + DPErrcheck( + cudaMemset(em_deriv, 0, sizeof(FPTYPE) * int_64(nloc) * ndescrpt * 3)); + DPErrcheck(cudaMemset(rij, 0, sizeof(FPTYPE) * int_64(nloc) * nnei * 3)); + + format_nbor_list_gpu_cuda(nlist, coord, type, gpu_inlist, array_int, + array_longlong, max_nbor_size, nloc, nall, rcut, + sec); + nborErrcheck(cudaGetLastError()); + nborErrcheck(cudaDeviceSynchronize()); + + compute_env_mat_r<<>>( + em, em_deriv, rij, coord, avg, std, type, nlist, nnei, rcut_smth, rcut); + DPErrcheck(cudaGetLastError()); + DPErrcheck(cudaDeviceSynchronize()); +} +} + + +template +void prod_env_mat_a_cuda_forward_kernel( + int nsamples, int nloc, int ndescrpt, int nnei, int nall, int mem_cpy, int mem_nnei, + int max_nbor_size, int nei_mode, float rcut_a, float rcut_r, float rcut_r_smth, int max_cpy_trial, + int max_nnei_trial, bool b_nlist_map, const std::vector& sec_a, + const std::vector& sec_r, deepmd::InputNlist gpu_inlist, int* nbor_list_dev, int* array_int, unsigned long long* array_longlong, + data_t *p_em, data_t *p_em_deriv, data_t *p_rij, int *p_nlist, + const data_t *p_coord, const data_t *p_box, const data_t *avg, + const data_t *std, const int *p_type, const paddle::Tensor& mesh_tensor) +{ + + for (int ff = 0; ff < nsamples; ++ff) + { + data_t *em = p_em + ff * nloc * ndescrpt; + data_t *em_deriv = p_em_deriv + ff * nloc * ndescrpt * 3; + data_t *rij = p_rij + ff * nloc * nnei * 3; + int *nlist = p_nlist + ff * nloc * nnei; + const data_t *coord = p_coord + ff * nall * 3; + const data_t *box = p_box + ff * 9; + const int *type = p_type + ff * nall; + + + int *idx_mapping = NULL; + int *ilist = NULL, *numneigh = NULL; + int **firstneigh = NULL; + deepmd::malloc_device_memory(firstneigh, nloc); + int *jlist = NULL; + data_t *coord_cpy; + int *type_cpy; + int frame_nall = nall; + int mesh_tensor_size = static_cast(mesh_tensor.size()); + std::vector tensor_list; + _prepare_coord_nlist_gpu( + &tensor_list, &coord, coord_cpy, &type, type_cpy, idx_mapping, + gpu_inlist, ilist, numneigh, firstneigh, jlist, nbor_list_dev, + frame_nall, mem_cpy, mem_nnei, max_nbor_size, + box, mesh_tensor.data(), mesh_tensor_size, nloc, nei_mode, rcut_r, max_cpy_trial, max_nnei_trial); + // allocate temp memory, temp memory must not be used after this operation! 
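// Per-frame pointer strides used by the sample loop above (derived from the
// offsets in the code): em advances by nloc * ndescrpt per frame, em_deriv by
// nloc * ndescrpt * 3, rij by nloc * nnei * 3, nlist by nloc * nnei, coord by
// nall * 3, box by 9, and type by nall, where ndescrpt = 4 * nnei for se_a.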
+ std::vector<int64_t> int_temp_shape{int(sec_a.size()) + nloc * int(sec_a.size()) + nloc}; + auto int_temp = paddle::empty( + int_temp_shape, + paddle::DataType::INT32, + paddle::GPUPlace() + ); + + array_int = int_temp.mutable_data<int>(); + + deepmd::malloc_device_memory(array_longlong, nloc * GPU_MAX_NBOR_SIZE * 2); + // launch the gpu (nv) compute function + + deepmd::prod_env_mat_a_gpu_cuda( + em, em_deriv, rij, nlist, + coord, type, gpu_inlist, array_int, array_longlong, max_nbor_size, avg, std, nloc, frame_nall, rcut_r, rcut_r_smth, sec_a); + if (b_nlist_map) + _map_nlist_gpu(nlist, idx_mapping, nloc, nnei); + deepmd::delete_device_memory(firstneigh); + deepmd::delete_device_memory(array_longlong); + array_longlong = NULL; + } +} + +void cum_sum(std::vector<int>& sec, const std::vector<int>& n_sel) { + sec.resize(n_sel.size() + 1); + sec[0] = 0; + for (int ii = 1; ii < int(sec.size()); ++ii) { + sec[ii] = sec[ii - 1] + n_sel[ii - 1]; + } +} + + +std::vector<paddle::Tensor> prod_env_mat_a_cuda_forward( + const paddle::Tensor& coord_tensor, + const paddle::Tensor& atype_tensor, + const paddle::Tensor& box_tensor, + const paddle::Tensor& mesh_tensor, + const paddle::Tensor& t_avg_tensor, + const paddle::Tensor& t_std_tensor, + const paddle::Tensor& natoms_tensor, + float rcut_a, + float rcut_r, + float rcut_r_smth, + std::vector<int> sel_a, + std::vector<int> sel_r +) +{ + std::vector<int> sec_a; + std::vector<int> sec_r; + int ndescrpt, ndescrpt_a, ndescrpt_r; + int nnei, nnei_a, nnei_r, max_nbor_size; + int mem_cpy, max_cpy_trial; + int mem_nnei, max_nnei_trial; + std::string device; + int *array_int = NULL; + unsigned long long *array_longlong = NULL; + deepmd::InputNlist gpu_inlist; + int *nbor_list_dev = NULL; + float nloc_f, nall_f; + + cum_sum(sec_a, sel_a); + cum_sum(sec_r, sel_r); + ndescrpt_a = sec_a.back() * 4; + ndescrpt_r = sec_r.back() * 1; + ndescrpt = ndescrpt_a + ndescrpt_r; + nnei_a = sec_a.back(); + nnei_r = sec_r.back(); + nnei = nnei_a + nnei_r; + max_nbor_size = 1024; + max_cpy_trial = 100; + mem_cpy = 256; + max_nnei_trial = 100; + mem_nnei = 256; + + auto natoms = natoms_tensor.data<int>(); + int nloc = natoms[0]; // TODO: reading natoms[0] through this pointer caused a segmentation fault + int nall = natoms[1]; // TODO: reading natoms[1] through this pointer caused a segmentation fault + // int ntypes = natoms_tensor.shape()[0] - 2; + int nsamples = coord_tensor.shape()[0]; + + int nei_mode = 0; + bool b_nlist_map = false; + if (mesh_tensor.shape()[0] == 16) { + // neighbor list supplied by LAMMPS + nei_mode = 3; + } else if (mesh_tensor.shape()[0] == 6) { + // manually copied pbc + assert(nloc == nall); + nei_mode = 1; + b_nlist_map = true; + } else if (mesh_tensor.shape()[0] == 0) { + // no pbc + assert(nloc ==
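// Worked example of the cum_sum bookkeeping above: for a two-type system with
// sel_a = {46, 92}, cum_sum gives sec_a = {0, 46, 138}, so nnei_a = 138 and
// ndescrpt_a = 4 * 138 = 552; slots [sec_a[t], sec_a[t+1]) hold type-t
// neighbors in every per-atom row of the descriptor and neighbor list.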
nall); + nei_mode = -1; + } else { + PD_THROW("invalid mesh tensor"); + } + + // create output tensors + auto descrpt_tensor = paddle::empty( + {nsamples, nloc * ndescrpt}, + coord_tensor.dtype(), + coord_tensor.place() + ); + // std::cout << "descrpt_tensor.shape = "; + // for (auto &x: descrpt_tensor.shape()) + // std::cout << x << " "; + // std::cout << std::endl; + + auto descrpt_deriv_tensor = paddle::empty( + {nsamples, nloc * ndescrpt * 3}, + coord_tensor.dtype(), + coord_tensor.place() + ); + // std::cout << "descrpt_deriv_tensor.shape = "; + // for (auto &x: descrpt_deriv_tensor.shape()) + // std::cout << x << " "; + // std::cout << std::endl; + + auto rij_tensor = paddle::empty( + {nsamples, nloc * nnei * 3}, + coord_tensor.dtype(), + coord_tensor.place() + ); + // std::cout << "rij_tensor.shape = "; + // for (auto &x: rij_tensor.shape()) + // std::cout << x << " "; + // std::cout << std::endl; + + auto nlist_tensor = paddle::empty( + {nsamples, nloc * nnei}, + coord_tensor.dtype(), + coord_tensor.place() + ); + // std::cout << "nlist_tensor.shape = "; + // for (auto &x: nlist_tensor.shape()) + // std::cout << x << " "; + // std::cout << std::endl; + + // loop over samples + PD_DISPATCH_FLOATING_TYPES( + coord_tensor.type(), "prod_env_mat_a_cuda_forward_kernel", ([&] { + prod_env_mat_a_cuda_forward_kernel( + nsamples, nloc, ndescrpt, nnei, nall, mem_cpy, mem_nnei, max_nbor_size, + nei_mode, rcut_a, rcut_r, rcut_r_smth, max_cpy_trial, max_nnei_trial, b_nlist_map, sec_a, sec_r, + gpu_inlist, nbor_list_dev, array_int, array_longlong, + descrpt_tensor.mutable_data(), + descrpt_deriv_tensor.mutable_data(), + rij_tensor.mutable_data(), + nlist_tensor.mutable_data(), + coord_tensor.data(), + box_tensor.copy_to(paddle::CPUPlace(), false).data(), + t_avg_tensor.data(), + t_std_tensor.data(), + atype_tensor.data(), + mesh_tensor); + })); + return {descrpt_tensor, descrpt_deriv_tensor, rij_tensor, nlist_tensor}; +} + +template +static int +_norm_copy_coord_gpu( + std::vector* tensor_list, + FPTYPE *&coord_cpy, + int *&type_cpy, + int *&idx_mapping, + int &nall, + int &mem_cpy, + const FPTYPE *coord, + const FPTYPE *box, + const int *type, + const int &nloc, + const int &max_cpy_trial, + const float &rcut_r) +{ + // Tensor FPTYPE_temp; + std::vector FPTYPE_temp_shape{nall*3}; + paddle::Tensor tmp_coord_tensor = paddle::Tensor(paddle::PlaceType::kGPU, FPTYPE_temp_shape); + FPTYPE *tmp_coord = tmp_coord_tensor.mutable_data(paddle::PlaceType::kGPU); + tensor_list->push_back(tmp_coord_tensor); + cudaMemcpy(tmp_coord, coord, sizeof(FPTYPE) * nall * 3, cudaMemcpyDeviceToDevice); + + deepmd::Region region; + deepmd::init_region_cpu(region, box); + FPTYPE box_info[18]; + std::copy(region.boxt, region.boxt + 9, box_info); + std::copy(region.rec_boxt, region.rec_boxt + 9, box_info + 9); + int cell_info[23]; + deepmd::compute_cell_info(cell_info, rcut_r, region); + const int loc_cellnum = cell_info[21]; + const int total_cellnum = cell_info[22]; + + //Tensor double_temp; + std::vector double_temp_shape {18}; + paddle::Tensor double_temp_tensor = paddle::Tensor(paddle::PlaceType::kGPU, double_temp_shape); + FPTYPE *box_info_dev = double_temp_tensor.mutable_data(paddle::PlaceType::kGPU); + tensor_list->push_back(double_temp_tensor); + + //Tensor int_temp; + std::vector int_temp_shape {23+nloc*3+loc_cellnum+total_cellnum*3+total_cellnum*3+loc_cellnum+1+total_cellnum+1+nloc}; + paddle::Tensor int_temp_tensor = paddle::Tensor(paddle::PlaceType::kGPU, int_temp_shape); + int *cell_info_dev = 
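// The length of the mesh tensor selects the neighboring strategy (see the
// branch in prod_env_mat_a_cuda_forward above); a standalone summary
// (hypothetical helper):
static int select_nei_mode(int mesh_numel) {
  if (mesh_numel == 16) return 3;  // neighbor list supplied by LAMMPS
  if (mesh_numel == 6) return 1;   // PBC: normalize/copy coords, map nlist back
  if (mesh_numel == 0) return -1;  // open boundary: nloc must equal nall
  return -2;                       // invalid; the op throws in this case
}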
int_temp_tensor.mutable_data(paddle::PlaceType::kGPU); + int *int_data_dev = cell_info_dev + 23; + tensor_list->push_back(int_temp_tensor); + + deepmd::memcpy_host_to_device(box_info_dev, box_info, 18); + deepmd::memcpy_host_to_device(cell_info_dev, cell_info, 23); + + deepmd::Region region_dev; + FPTYPE *new_boxt = region_dev.boxt; + FPTYPE *new_rec_boxt = region_dev.rec_boxt; + region_dev.boxt = box_info_dev; + region_dev.rec_boxt = box_info_dev + 9; + + deepmd::normalize_coord_gpu(tmp_coord, nall, region_dev); + + + int tt; + paddle::Tensor cpy_temp_tensor = paddle::Tensor(paddle::PlaceType::kGPU); + paddle::Tensor t_temp_tensor = paddle::Tensor(paddle::PlaceType::kGPU); + for (tt = 0; tt < max_cpy_trial; ++tt) + { + std::vector cpy_temp_shape {mem_cpy * 3}; + std::vector t_temp_shape {mem_cpy * 2}; + cpy_temp_tensor.reshape(cpy_temp_shape); + coord_cpy = cpy_temp_tensor.mutable_data(paddle::PlaceType::kGPU); + t_temp_tensor.reshape(t_temp_shape); + type_cpy = t_temp_tensor.mutable_data(paddle::PlaceType::kGPU); + + idx_mapping = type_cpy + mem_cpy; + int ret = deepmd::copy_coord_gpu( + coord_cpy, type_cpy, idx_mapping, &nall, int_data_dev, + tmp_coord, type, nloc, mem_cpy, loc_cellnum, total_cellnum, cell_info_dev, region_dev); + if (ret == 0) + { + break; + } + else + { + mem_cpy *= 2; + } + } + tensor_list->push_back(cpy_temp_tensor); + tensor_list->push_back(t_temp_tensor); + region_dev.boxt = new_boxt; + region_dev.rec_boxt = new_rec_boxt; + + return (tt != max_cpy_trial); +} + +template +static int +_build_nlist_gpu( + std::vector *tensor_list, + int *&ilist, + int *&numneigh, + int **&firstneigh, + int *&jlist, + int &max_nnei, + int &mem_nnei, + const FPTYPE *coord, + const int &nloc, + const int &new_nall, + const int &max_nnei_trial, + const float &rcut_r) +{ + //Tensor nlist_temp; + std::vector nlist_temp_shape {nloc * 2}; + paddle::Tensor nlist_temp_tensor = paddle::Tensor(paddle::PlaceType::kGPU, nlist_temp_shape); + ilist = nlist_temp_tensor.mutable_data(paddle::PlaceType::kGPU); + tensor_list->push_back(nlist_temp_tensor); + numneigh = ilist + nloc; + //Tensor jlist_temp; + int *ind_data = NULL; + + std::vector firstneigh_host(nloc); + int tt; + paddle::Tensor jlist_temp_tensor = paddle::Tensor(paddle::PlaceType::kGPU); + for (tt = 0; tt < max_nnei_trial; ++tt) + { + std::vector jlist_temp_shape {3 * nloc * mem_nnei}; + jlist_temp_tensor.reshape(jlist_temp_shape); + jlist = jlist_temp_tensor.mutable_data(paddle::PlaceType::kGPU); + ind_data = jlist + nloc * mem_nnei; + for (int ii = 0; ii < nloc; ++ii) + { + firstneigh_host[ii] = jlist + ii * mem_nnei; + } + deepmd::memcpy_host_to_device(firstneigh, firstneigh_host); + deepmd::InputNlist inlist(nloc, ilist, numneigh, firstneigh); + int ret = deepmd::build_nlist_gpu( + inlist, &max_nnei, ind_data, + coord, nloc, new_nall, mem_nnei, rcut_r); + if (ret == 0) + { + break; + } + else + { + mem_nnei *= 2; + } + } + tensor_list->push_back(jlist_temp_tensor); + return (tt != max_nnei_trial); +} + +static void +_map_nlist_gpu( + int *nlist, + const int *idx_mapping, + const int &nloc, + const int &nnei) +{ + deepmd::use_nlist_map(nlist, idx_mapping, nloc, nnei); +} + +template +static void +_prepare_coord_nlist_gpu( + std::vector *tensor_list, + FPTYPE const **coord, + FPTYPE *&coord_cpy, + int const **type, + int *&type_cpy, + int *&idx_mapping, + deepmd::InputNlist &inlist, + int *&ilist, + int *&numneigh, + int **&firstneigh, + int *&jlist, + int *&nbor_list_dev, + int &new_nall, + int &mem_cpy, + int &mem_nnei, + int 
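// Both _norm_copy_coord_gpu and _build_nlist_gpu use the same grow-and-retry
// idiom: try with the current capacity and double it on failure, up to a
// fixed trial count. Generic sketch (hypothetical; TryFn is bool(int)):
template <typename TryFn>
static bool retry_doubling(TryFn try_once, int &capacity, int max_trials) {
  for (int tt = 0; tt < max_trials; ++tt) {
    if (try_once(capacity)) return true;  // fits, like ret == 0 above
    capacity *= 2;                        // grow and retry, like mem_cpy *= 2
  }
  return false;  // mirrors the (tt != max_cpy_trial) failure return
}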
&max_nbor_size, + const FPTYPE *box, + const int *mesh_tensor_data, + const int mesh_tensor_size, + const int &nloc, + const int &nei_mode, + const float &rcut_r, + const int &max_cpy_trial, + const int &max_nnei_trial) +{ + inlist.inum = nloc; + if (nei_mode != 3) + { + // build the neighbor list ourselves + // normalize and copy coord + if (nei_mode == 1) + { + int copy_ok = _norm_copy_coord_gpu( + tensor_list, coord_cpy, type_cpy, idx_mapping, new_nall, mem_cpy, + *coord, box, *type, nloc, max_cpy_trial, rcut_r); + PD_CHECK(copy_ok, "cannot allocate mem for copied coords"); + *coord = coord_cpy; + *type = type_cpy; + } + + // build nlist + int build_ok = _build_nlist_gpu( + tensor_list, ilist, numneigh, firstneigh, jlist, max_nbor_size, mem_nnei, + *coord, nloc, new_nall, max_nnei_trial, rcut_r); + PD_CHECK(build_ok, "cannot allocate mem for nlist"); + if (max_nbor_size <= 1024) + { + max_nbor_size = 1024; + } + else if (max_nbor_size <= 2048) + { + max_nbor_size = 2048; + } + else + { + max_nbor_size = 4096; + } + inlist.ilist = ilist; + inlist.numneigh = numneigh; + inlist.firstneigh = firstneigh; + } + else + { + // update the neighbor list from the mesh tensor + deepmd::InputNlist inlist_temp; + inlist_temp.inum = nloc; + deepmd::env_mat_nbor_update( + inlist_temp, inlist, max_nbor_size, nbor_list_dev, + mesh_tensor_data, mesh_tensor_size); + PD_CHECK((max_numneigh(inlist_temp) <= GPU_MAX_NBOR_SIZE), "max neighbor size of LAMMPS atoms " + std::to_string(max_numneigh(inlist_temp)) + " exceeds " + std::to_string(GPU_MAX_NBOR_SIZE) + ", which is not supported by deepmd-kit."); + } +} + + +std::vector<paddle::Tensor> ProdEnvMatAForward( + const paddle::Tensor& coord_tensor, + const paddle::Tensor& atype_tensor, + const paddle::Tensor& box_tensor, + const paddle::Tensor& mesh_tensor, + const paddle::Tensor& t_avg_tensor, + const paddle::Tensor& t_std_tensor, + const paddle::Tensor& natoms_tensor, + float rcut_a, + float rcut_r, + float rcut_r_smth, + std::vector<int> sel_a, + std::vector<int> sel_r +) { + if (coord_tensor.is_gpu()) { + return prod_env_mat_a_cuda_forward( + coord_tensor, + atype_tensor, + box_tensor, + mesh_tensor, + t_avg_tensor, + t_std_tensor, + natoms_tensor, + rcut_a, + rcut_r, + rcut_r_smth, + sel_a, + sel_r + ); + } else { + PD_THROW("Unsupported device type for the forward function of the prod_env_mat_a operator."); + } +} + + +std::vector<std::vector<int64_t>> ProdEnvMatAInferShape( + std::vector<int64_t> coord_shape, + std::vector<int64_t> atype_shape, + std::vector<int64_t> box_shape, + std::vector<int64_t> mesh_shape, + std::vector<int64_t> t_avg_shape, + std::vector<int64_t> t_std_shape, + std::vector<int64_t> natoms_shape, + float rcut_a, + float rcut_r, + float rcut_r_smth, + const std::vector<int>& sel_a, + const std::vector<int>& sel_r +) { + int64_t nloc = /*natoms[0]*/ 192; + int64_t nall = /*natoms[1]*/ 192; + + std::vector<int> sec_a; + std::vector<int> sec_r; + cum_sum(sec_a, sel_a); + cum_sum(sec_r, sel_r); + + int64_t nsamples = coord_shape[0]; + int64_t ndescrpt_a = sec_a.back() * 4; + int64_t ndescrpt_r = sec_r.back() * 1; + int64_t ndescrpt = ndescrpt_a + ndescrpt_r; + + int64_t nnei_a = sec_a.back(); + int64_t nnei_r = sec_r.back(); + int64_t nnei = nnei_a + nnei_r; + + std::vector<int64_t> descrpt_shape = {nsamples, nloc * ndescrpt}; + std::vector<int64_t> descrpt_deriv_shape = {nsamples, nloc * ndescrpt * 3}; + std::vector<int64_t> rij_shape = {nsamples, nloc * nnei * 3}; + std::vector<int64_t> nlist_shape = {nsamples, nloc * nnei}; + return {descrpt_shape, descrpt_deriv_shape, rij_shape, nlist_shape};
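// Shape-inference caveat for ProdEnvMatAInferShape above: InferShape runs at
// graph-build time and sees only shapes, never the values stored in natoms,
// hence the hard-coded placeholder nloc = nall = 192; only the sel-derived
// factors are truly static. For example, with sel_a = {46, 92} and sel_r = {}:
//   ndescrpt = 4 * 138 = 552
//   descrpt: {nsamples, nloc * 552}, descrpt_deriv: {nsamples, nloc * 1656}
//   rij: {nsamples, nloc * 414}, nlist: {nsamples, nloc * 138}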
+} + +std::vector ProdEnvMatAInferDtype( + paddle::DataType coord_dtype, + paddle::DataType atype_dtype, + paddle::DataType box_dtype, + paddle::DataType mesh_dtype, + paddle::DataType t_avg_dtype, + paddle::DataType t_std_dtype, + paddle::DataType natoms_dtype +) { + return {coord_dtype, coord_dtype, coord_dtype, coord_dtype}; +} + + +PD_BUILD_OP(prod_env_mat_a) + .Inputs({"coord", "atype", "box", "mesh", "t_avg", "t_std", "natoms"}) + .Outputs({"descrpt", "descrpt_deriv", "rij", "nlist"}) + .Attrs({"rcut_a: float", "rcut_r: float", "rcut_r_smth: float", "sel_a: std::vector", "sel_r: std::vector"}) + .SetKernelFn(PD_KERNEL(ProdEnvMatAForward)) + .SetInferShapeFn(PD_INFER_SHAPE(ProdEnvMatAInferShape)) + .SetInferDtypeFn(PD_INFER_DTYPE(ProdEnvMatAInferDtype)); \ No newline at end of file diff --git a/source/lib/paddle_src/prod_env_mat.h b/source/lib/paddle_src/prod_env_mat.h new file mode 100644 index 0000000000..3052dd2230 --- /dev/null +++ b/source/lib/paddle_src/prod_env_mat.h @@ -0,0 +1,140 @@ +#pragma once +#include + +#include "device.h" +#include "neighbor_list.h" + +namespace deepmd { + +template +void prod_env_mat_a_cpu(FPTYPE *em, + FPTYPE *em_deriv, + FPTYPE *rij, + int *nlist, + const FPTYPE *coord, + const int *type, + const InputNlist &inlist, + const int max_nbor_size, + const FPTYPE *avg, + const FPTYPE *std, + const int nloc, + const int nall, + const float rcut, + const float rcut_smth, + const std::vector sec, + const int *f_type = NULL); + +template +void prod_env_mat_r_cpu(FPTYPE *em, + FPTYPE *em_deriv, + FPTYPE *rij, + int *nlist, + const FPTYPE *coord, + const int *type, + const InputNlist &inlist, + const int max_nbor_size, + const FPTYPE *avg, + const FPTYPE *std, + const int nloc, + const int nall, + const float rcut, + const float rcut_smth, + const std::vector sec); + +#if GOOGLE_CUDA +template +void prod_env_mat_a_gpu_cuda(FPTYPE *em, + FPTYPE *em_deriv, + FPTYPE *rij, + int *nlist, + const FPTYPE *coord, + const int *type, + const InputNlist &gpu_inlist, + int *array_int, + unsigned long long *array_longlong, + const int max_nbor_size, + const FPTYPE *avg, + const FPTYPE *std, + const int nloc, + const int nall, + const float rcut, + const float rcut_smth, + const std::vector sec, + const int *f_type = NULL); + +template +void prod_env_mat_r_gpu_cuda(FPTYPE *em, + FPTYPE *em_deriv, + FPTYPE *rij, + int *nlist, + const FPTYPE *coord, + const int *type, + const InputNlist &gpu_inlist, + int *array_int, + unsigned long long *array_longlong, + const int max_nbor_size, + const FPTYPE *avg, + const FPTYPE *std, + const int nloc, + const int nall, + const float rcut, + const float rcut_smth, + const std::vector sec); + +void env_mat_nbor_update(InputNlist &inlist, + InputNlist &gpu_inlist, + int &max_nbor_size, + int *&nbor_list_dev, + const int *mesh, + const int size); +#endif // GOOGLE_CUDA + +#if TENSORFLOW_USE_ROCM +template +void prod_env_mat_a_gpu_rocm(FPTYPE *em, + FPTYPE *em_deriv, + FPTYPE *rij, + int *nlist, + const FPTYPE *coord, + const int *type, + const InputNlist &gpu_inlist, + int *array_int, + unsigned long long *array_longlong, + const int max_nbor_size, + const FPTYPE *avg, + const FPTYPE *std, + const int nloc, + const int nall, + const float rcut, + const float rcut_smth, + const std::vector sec, + const int *f_type = NULL); + +template +void prod_env_mat_r_gpu_rocm(FPTYPE *em, + FPTYPE *em_deriv, + FPTYPE *rij, + int *nlist, + const FPTYPE *coord, + const int *type, + const InputNlist &gpu_inlist, + int *array_int, + unsigned long long 
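// Registration note: Paddle attribute strings use the "name: type" syntax;
// with the template arguments written out (an assumption here, matching the
// std::vector<int> sel_a / sel_r parameters of the kernel), the Attrs line
// reads:
//   .Attrs({"rcut_a: float", "rcut_r: float", "rcut_r_smth: float",
//           "sel_a: std::vector<int>", "sel_r: std::vector<int>"})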
*array_longlong, + const int max_nbor_size, + const FPTYPE *avg, + const FPTYPE *std, + const int nloc, + const int nall, + const float rcut, + const float rcut_smth, + const std::vector sec); + +void env_mat_nbor_update(InputNlist &inlist, + InputNlist &gpu_inlist, + int &max_nbor_size, + int *&nbor_list_dev, + const int *mesh, + const int size); +#endif // TENSORFLOW_USE_ROCM + +} // namespace deepmd diff --git a/source/lib/paddle_src/prod_force.cu b/source/lib/paddle_src/prod_force.cu new file mode 100644 index 0000000000..4416cef082 --- /dev/null +++ b/source/lib/paddle_src/prod_force.cu @@ -0,0 +1,303 @@ +#include "paddle/extension.h" + +#include "device.h" +#include "prod_force.h" +#include "gpu_cuda.h" + +#define CHECK_INPUT(x) PD_CHECK(x.is_gpu(), #x " must be a GPU Tensor.") +#define CHECK_INPUT_DIM(x, value) PD_CHECK(x.shape().size() == value, #x "'s dim should be " #value ".") + +template +__global__ void force_deriv_wrt_center_atom(FPTYPE* force, + const FPTYPE* net_deriv, + const FPTYPE* in_deriv, + const int ndescrpt) { + __shared__ FPTYPE data[THREADS_PER_BLOCK * 3]; + int_64 bid = blockIdx.x; + unsigned int tid = threadIdx.x; + for (int ii = tid; ii < THREADS_PER_BLOCK * 3; ii += THREADS_PER_BLOCK) { + data[ii] = 0.f; + } + for (int ii = tid; ii < ndescrpt; ii += THREADS_PER_BLOCK) { + for (int jj = 0; jj < 3; jj++) { + data[jj * THREADS_PER_BLOCK + tid] += + net_deriv[bid * ndescrpt + ii] * + in_deriv[bid * ndescrpt * 3 + ii * 3 + jj]; + } + } + __syncthreads(); + // do reduction in shared memory + for (int ii = THREADS_PER_BLOCK >> 1; ii > 0; ii >>= 1) { + if (tid < ii) { + for (int jj = 0; jj < 3; jj++) { + data[jj * THREADS_PER_BLOCK + tid] += + data[jj * THREADS_PER_BLOCK + tid + ii]; + } + } + __syncthreads(); + } + // write result for this block to global memory + if (tid == 0) { + force[bid * 3 + 0] -= data[THREADS_PER_BLOCK * 0]; + force[bid * 3 + 1] -= data[THREADS_PER_BLOCK * 1]; + force[bid * 3 + 2] -= data[THREADS_PER_BLOCK * 2]; + } +} + +template +__global__ void force_deriv_wrt_neighbors_a(FPTYPE* force, + const FPTYPE* net_deriv, + const FPTYPE* in_deriv, + const int* nlist, + const int nloc, + const int nnei) { + // idy -> nnei + const int_64 idx = blockIdx.x; + const unsigned int idy = blockIdx.y * blockDim.x + threadIdx.x; + const unsigned int idz = threadIdx.y; + const int ndescrpt = nnei * 4; + if (idy >= nnei) { + return; + } + // deriv wrt neighbors + int j_idx = nlist[idx * nnei + idy]; + if (j_idx < 0) { + return; + } + FPTYPE force_tmp = 0.f; + for (int idw = 0; idw < 4; ++idw) { + force_tmp += net_deriv[idx * ndescrpt + idy * 4 + idw] * + in_deriv[idx * ndescrpt * 3 + (idy * 4 + idw) * 3 + idz]; + } + atomicAdd(force + j_idx * 3 + idz, force_tmp); +} + +template +__global__ void force_deriv_wrt_neighbors_r(FPTYPE* force, + const FPTYPE* net_deriv, + const FPTYPE* in_deriv, + const int* nlist, + const int nloc, + const int nnei) { + // idy -> nnei + const int_64 idx = blockIdx.x; + const unsigned int idy = blockIdx.y * blockDim.x + threadIdx.x; + const unsigned int idz = threadIdx.y; + const int ndescrpt = nnei * 1; + if (idy >= nnei) { + return; + } + // deriv wrt neighbors + int j_idx = nlist[idx * nnei + idy]; + if (j_idx < 0) { + return; + } + atomicAdd(force + j_idx * 3 + idz, + net_deriv[idx * ndescrpt + idy] * + in_deriv[idx * ndescrpt * 3 + idy * 3 + idz]); +} + +namespace deepmd { +template +void prod_force_a_gpu_cuda(FPTYPE* force, + const FPTYPE* net_deriv, + const FPTYPE* in_deriv, + const int* nlist, + const int nloc, + const int 
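// CPU reference for the shared-memory reduction above: the force on the
// center atom is F_i[d] = - sum_k net_deriv[i, k] * in_deriv[i, k, d] over
// all ndescrpt descriptor slots k. Hypothetical scalar sketch (assumes the
// caller zeroed force, as cudaMemset does in prod_force_a_gpu_cuda):
static void center_force_ref(double force[3], const double *net_deriv_row,
                             const double *in_deriv_row, int ndescrpt) {
  for (int k = 0; k < ndescrpt; ++k)
    for (int d = 0; d < 3; ++d)
      force[d] -= net_deriv_row[k] * in_deriv_row[k * 3 + d];
}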
nall, + const int nnei) { + const int ndescrpt = nnei * 4; + DPErrcheck(cudaMemset(force, 0, sizeof(FPTYPE) * nall * 3)); + + force_deriv_wrt_center_atom + <<>>(force, net_deriv, in_deriv, ndescrpt); + DPErrcheck(cudaGetLastError()); + DPErrcheck(cudaDeviceSynchronize()); + + const int LEN = 64; + const int nblock = (nnei + LEN - 1) / LEN; + dim3 block_grid(nloc, nblock); + dim3 thread_grid(LEN, 3); + force_deriv_wrt_neighbors_a<<>>( + force, net_deriv, in_deriv, nlist, nloc, nnei); + DPErrcheck(cudaGetLastError()); + DPErrcheck(cudaDeviceSynchronize()); +} + +template +void prod_force_r_gpu_cuda(FPTYPE* force, + const FPTYPE* net_deriv, + const FPTYPE* in_deriv, + const int* nlist, + const int nloc, + const int nall, + const int nnei) { + const int ndescrpt = nnei * 1; + DPErrcheck(cudaMemset(force, 0, sizeof(FPTYPE) * nall * 3)); + + force_deriv_wrt_center_atom + <<>>(force, net_deriv, in_deriv, ndescrpt); + DPErrcheck(cudaGetLastError()); + DPErrcheck(cudaDeviceSynchronize()); + + const int LEN = 64; + const int nblock = (nnei + LEN - 1) / LEN; + dim3 block_grid(nloc, nblock); + dim3 thread_grid(LEN, 3); + force_deriv_wrt_neighbors_r<<>>( + force, net_deriv, in_deriv, nlist, nloc, nnei); + DPErrcheck(cudaGetLastError()); + DPErrcheck(cudaDeviceSynchronize()); +} + +template void prod_force_a_gpu_cuda(float* force, + const float* net_deriv, + const float* in_deriv, + const int* nlist, + const int nloc, + const int nall, + const int nnei); +template void prod_force_a_gpu_cuda(double* force, + const double* net_deriv, + const double* in_deriv, + const int* nlist, + const int nloc, + const int nall, + const int nnei); +template void prod_force_r_gpu_cuda(float* force, + const float* net_deriv, + const float* in_deriv, + const int* nlist, + const int nloc, + const int nall, + const int nnei); +template void prod_force_r_gpu_cuda(double* force, + const double* net_deriv, + const double* in_deriv, + const int* nlist, + const int nloc, + const int nall, + const int nnei); +} // namespace deepmd + + +template +void PdProdForceSeAOpForwardCUDAKernel( + int nloc, int nall, int nframes, int ndescrpt, int nnei, + data_t* p_force, const data_t* p_net_deriv, const data_t* p_in_deriv, const int* p_nlist +) { + for(int kk = 0; kk < nframes; ++kk){ + data_t * force = p_force + kk * nall * 3; + const data_t * net_deriv = p_net_deriv + kk * nloc * ndescrpt; + const data_t * in_deriv = p_in_deriv + kk * nloc * ndescrpt * 3; + const int * nlist = p_nlist + kk * nloc * nnei; + deepmd::prod_force_a_gpu_cuda( + force, + net_deriv, in_deriv, nlist, nloc, nall, nnei + ); + } +} + + +std::vector PdProdForceSeAOpCUDAForward( + const paddle::Tensor& net_deriv_tensor, + const paddle::Tensor& in_deriv_tensor, + const paddle::Tensor& nlist_tensor, + const paddle::Tensor& natoms_tensor, + int n_a_sel, + int n_r_sel +) { + CHECK_INPUT(net_deriv_tensor); + CHECK_INPUT(in_deriv_tensor); + CHECK_INPUT(nlist_tensor); + // CHECK_INPUT(natoms_tensor); + CHECK_INPUT_DIM(net_deriv_tensor, 2); + CHECK_INPUT_DIM(in_deriv_tensor, 2); + CHECK_INPUT_DIM(natoms_tensor, 1); + + PD_CHECK(natoms_tensor.shape()[0] >= 3, "number of atoms should be larger than (or equal to) 3"); + const int* natoms = natoms_tensor.data(); + int nloc = natoms[0]; + int nall = natoms[1]; + int nframes = net_deriv_tensor.shape()[0]; + int ndescrpt = net_deriv_tensor.shape()[1] / nloc; + int nnei = nlist_tensor.shape()[1] / nloc; + + PD_CHECK(nframes == in_deriv_tensor.shape()[0], "number of samples should match"); + PD_CHECK(nframes == 
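// CPU reference for the atomicAdd scatter above: each center atom i adds
// sum_w net_deriv[i, jj*4 + w] * in_deriv[i, (jj*4 + w)*3 + d] to neighbor
// j = nlist[i, jj]; the GPU needs atomicAdd because several centers update
// the same neighbor concurrently. Hypothetical sketch for one (i, jj) pair:
static void neighbor_force_ref(double force_j[3], const double *net_deriv_row,
                               const double *in_deriv_row, int jj) {
  for (int d = 0; d < 3; ++d) {
    double acc = 0.0;
    for (int w = 0; w < 4; ++w)
      acc += net_deriv_row[jj * 4 + w] * in_deriv_row[(jj * 4 + w) * 3 + d];
    force_j[d] += acc;
  }
}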
nlist_tensor.shape()[0],"number of samples should match"); + PD_CHECK(nloc * ndescrpt * 3 == in_deriv_tensor.shape()[1], "number of descriptors should match"); + + std::vector force_shape {nframes, 3 * nall}; + paddle::Tensor force_tensor = paddle::Tensor(paddle::PlaceType::kGPU, force_shape); + + assert (nframes == force_shape[0]); + assert (nframes == net_deriv_tensor.shape()[0]); + assert (nframes == in_deriv_tensor.shape()[0]); + assert (nframes == nlist_tensor.shape()[0]); + assert (nall * 3 == force_shape[1]); + assert (nloc * ndescrpt == net_deriv_tensor.shape()[1]); + assert (nloc * ndescrpt * 3 == in_deriv_tensor.shape()[1]); + assert (nloc * nnei == nlist_tensor.shape()[1]); + assert (nnei * 4 == ndescrpt); + + PD_DISPATCH_FLOATING_TYPES( + net_deriv_tensor.type(), "pd_prod_force_se_a_cpu_forward_kernel", ([&] { + PdProdForceSeAOpForwardCUDAKernel( + nloc, nall, nframes, ndescrpt, nnei, + force_tensor.mutable_data(), net_deriv_tensor.data(), + in_deriv_tensor.data(), nlist_tensor.data()); + })); + + return {force_tensor}; +} + + +std::vector PdProdForceSeAForward( + const paddle::Tensor& net_deriv_tensor, + const paddle::Tensor& in_deriv_tensor, + const paddle::Tensor& nlist_tensor, + const paddle::Tensor& natoms_tensor, + int n_a_sel, + int n_r_sel +) { + // if(net_deriv_tensor.place() == paddle::PlaceType::kCPU){ + // return PdProdForceSeAOpCPUForward(net_deriv_tensor, in_deriv_tensor, nlist_tensor, natoms_tensor, n_a_sel, n_r_sel); + // }else if(net_deriv_tensor.place() == paddle::PlaceType::kGPU){ + return PdProdForceSeAOpCUDAForward(net_deriv_tensor, in_deriv_tensor, nlist_tensor, natoms_tensor, n_a_sel, n_r_sel); + // }else{ + // PD_THROW("No Such kernel for PdFrodForceSeAForward!"); + // } +} + +std::vector> PdProdForceSeAInferShape( + std::vector net_deriv_shape, + std::vector in_deriv_shape, + std::vector nlist_shape, + std::vector natoms_shape, + const int &n_a_sel, + const int &n_r_sel +) { + // int64_t nloc = /*natoms[0]*/ 192; + int64_t nall = /*natoms[1]*/ 192; + int64_t nframes = net_deriv_shape[0]; + std::vector force_shape = {nframes, 3 * nall}; + return {force_shape}; +} + +std::vector PdProdForceSeAInferDtype( + paddle::DataType net_deriv_dtype, + paddle::DataType in_deriv_dtype, + paddle::DataType nlist_dtype, + paddle::DataType natoms_dtype +) { + return {net_deriv_dtype}; +} + + +PD_BUILD_OP(prod_force_se_a) + .Inputs({"net_deriv", "in_deriv", "nlist", "natoms"}) + .Outputs({"force"}) + .Attrs({"n_a_sel: int", "n_r_sel: int"}) + .SetKernelFn(PD_KERNEL(PdProdForceSeAForward)) + .SetInferShapeFn(PD_INFER_SHAPE(PdProdForceSeAInferShape)) + .SetInferDtypeFn(PD_INFER_DTYPE(PdProdForceSeAInferDtype)); diff --git a/source/lib/paddle_src/prod_force_grad.cu b/source/lib/paddle_src/prod_force_grad.cu new file mode 100644 index 0000000000..a1dad3dc3c --- /dev/null +++ b/source/lib/paddle_src/prod_force_grad.cu @@ -0,0 +1,275 @@ +#include "paddle/extension.h" + +#include "device.h" +#include "prod_force_grad.h" +#include "gpu_cuda.h" + +#define CHECK_INPUT_DIM(x, value) PD_CHECK(x.shape().size() == value, #x "'s dim should be " #value ".") +#define CHECK_INPUT(x) PD_CHECK(x.is_gpu(), #x " must be a GPU Tensor.") + +template +__device__ inline FPTYPE dev_dot(const FPTYPE* arr1, const FPTYPE* arr2) { + return arr1[0] * arr2[0] + arr1[1] * arr2[1] + arr1[2] * arr2[2]; +} + +template +__global__ void force_grad_wrt_center_atom(FPTYPE* grad_net, + const FPTYPE* grad, + const FPTYPE* env_deriv, + const int ndescrpt) { + __shared__ FPTYPE grad_one[3]; + int_64 center_idx = 
blockIdx.x; + unsigned int tid = threadIdx.x; + if (tid < 3) { + grad_one[tid] = grad[center_idx * 3 + tid]; + } + __syncthreads(); + unsigned int descrpt_idx = blockIdx.y * blockDim.x + tid; + if (descrpt_idx < ndescrpt) { + grad_net[center_idx * ndescrpt + descrpt_idx] -= dev_dot( + grad_one, env_deriv + center_idx * ndescrpt * 3 + descrpt_idx * 3); + } +} + +template +__global__ void force_grad_wrt_neighbors_a(FPTYPE* grad_net, + const FPTYPE* grad, + const FPTYPE* env_deriv, + const int* nlist, + const int nloc, + const int nnei) { + // idy -> nnei + const int_64 idx = blockIdx.x * blockDim.x + threadIdx.x; + const unsigned int idy = blockIdx.y; + const unsigned int idw = threadIdx.y; + if (idx >= nloc) { + return; + } + int j_idx = nlist[idx * nnei + idy]; + if (j_idx < 0) { + return; + } + if (j_idx >= nloc) j_idx = j_idx % nloc; + grad_net[idx * nnei * 4 + idy * 4 + idw] += dev_dot( + grad + j_idx * 3, env_deriv + idx * nnei * 4 * 3 + idy * 4 * 3 + idw * 3); +} + +template +__global__ void force_grad_wrt_neighbors_r(FPTYPE* grad_net, + const FPTYPE* grad, + const FPTYPE* env_deriv, + const int* nlist, + const int nloc, + const int nnei) { + // idy -> nnei + const int_64 idx = blockIdx.x * blockDim.x + threadIdx.x; + const unsigned int idy = blockIdx.y; + if (idx >= nloc) { + return; + } + int j_idx = nlist[idx * nnei + idy]; + if (j_idx < 0) { + return; + } + if (j_idx >= nloc) j_idx = j_idx % nloc; + grad_net[idx * nnei + idy] += + dev_dot(grad + j_idx * 3, env_deriv + idx * nnei * 3 + idy * 3); +} + +namespace deepmd { +template +void prod_force_grad_a_gpu_cuda(FPTYPE* grad_net, + const FPTYPE* grad, + const FPTYPE* env_deriv, + const int* nlist, + const int nloc, + const int nnei) { + const int ndescrpt = nnei * 4; + DPErrcheck(cudaMemset(grad_net, 0, sizeof(FPTYPE) * nloc * ndescrpt)); + const int nblock = (ndescrpt + TPB - 1) / TPB; + dim3 block_grid(nloc, nblock); + dim3 thread_grid(TPB, 1); + force_grad_wrt_center_atom<<>>(grad_net, grad, + env_deriv, ndescrpt); + DPErrcheck(cudaGetLastError()); + DPErrcheck(cudaDeviceSynchronize()); + + const int LEN = 128; + const int nblock_ = (nloc + LEN - 1) / LEN; + dim3 block_grid_(nblock_, nnei); + dim3 thread_grid_(LEN, 4); + force_grad_wrt_neighbors_a<<>>( + grad_net, grad, env_deriv, nlist, nloc, nnei); + DPErrcheck(cudaGetLastError()); + DPErrcheck(cudaDeviceSynchronize()); +} + +template +void prod_force_grad_r_gpu_cuda(FPTYPE* grad_net, + const FPTYPE* grad, + const FPTYPE* env_deriv, + const int* nlist, + const int nloc, + const int nnei) { + const int ndescrpt = nnei * 1; + DPErrcheck(cudaMemset(grad_net, 0, sizeof(FPTYPE) * nloc * ndescrpt)); + const int nblock = (ndescrpt + TPB - 1) / TPB; + dim3 block_grid(nloc, nblock); + dim3 thread_grid(TPB, 1); + force_grad_wrt_center_atom<<>>(grad_net, grad, + env_deriv, ndescrpt); + DPErrcheck(cudaGetLastError()); + DPErrcheck(cudaDeviceSynchronize()); + + const int LEN = 128; + const int nblock_ = (nloc + LEN - 1) / LEN; + dim3 block_grid_(nblock_, nnei); + dim3 thread_grid_(LEN, 1); + force_grad_wrt_neighbors_r<<>>( + grad_net, grad, env_deriv, nlist, nloc, nnei); + DPErrcheck(cudaGetLastError()); + DPErrcheck(cudaDeviceSynchronize()); +} + +template void prod_force_grad_a_gpu_cuda(float* grad_net, + const float* grad, + const float* env_deriv, + const int* nlist, + const int nloc, + const int nnei); +template void prod_force_grad_a_gpu_cuda(double* grad_net, + const double* grad, + const double* env_deriv, + const int* nlist, + const int nloc, + const int nnei); +template void 
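// In the neighbor kernels above, ghost atoms (j_idx >= nloc) are folded back
// onto their local images with j_idx % nloc: the incoming force gradient only
// covers the nloc local atoms, and under PBC a ghost copy shares the gradient
// of the atom it replicates (the same convention as the CPU kernels). Scalar
// sketch of one neighbor dot product (hypothetical helper):
static double grad_net_slot(const double *grad /* nloc x 3, folded */, int j_loc,
                            const double *env_deriv_slot /* 3 entries */) {
  return grad[j_loc * 3 + 0] * env_deriv_slot[0] +
         grad[j_loc * 3 + 1] * env_deriv_slot[1] +
         grad[j_loc * 3 + 2] * env_deriv_slot[2];
}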
prod_force_grad_r_gpu_cuda(float* grad_net, + const float* grad, + const float* env_deriv, + const int* nlist, + const int nloc, + const int nnei); +template void prod_force_grad_r_gpu_cuda(double* grad_net, + const double* grad, + const double* env_deriv, + const int* nlist, + const int nloc, + const int nnei); +} // namespace deepmd + +template +void PdProdForceSeAOpCUDABackwardKernel( + int nloc, int nframes, int ndescrpt, int nnei, + const data_t* p_grad, const data_t* p_net_deriv, const data_t* p_in_deriv, + const int* p_nlist, data_t* p_grad_net +) { + for (int_64 kk = 0; kk < nframes; ++kk) { + data_t* grad_net = p_grad_net + kk * nloc * ndescrpt; + const data_t* grad = p_grad + kk * nloc * 3; + const data_t* in_deriv = p_in_deriv + kk * nloc * ndescrpt * 3; + const int* nlist = p_nlist + kk * nloc * nnei; + deepmd::prod_force_grad_a_gpu_cuda( + grad_net, grad, in_deriv, nlist, nloc, nnei + ); + } +} + + +std::vector PdProdForceSeAOpCUDABackward( + const paddle::Tensor& force_grad_tensor, + const paddle::Tensor& net_deriv_tensor, + const paddle::Tensor& in_deriv_tensor, + const paddle::Tensor& nlist_tensor, + const paddle::Tensor& natoms_tensor, + int n_a_sel, + int n_r_sel +) { + auto grad_shape = force_grad_tensor.shape(); + auto net_deriv_shape = net_deriv_tensor.shape(); + auto in_deriv_shape = in_deriv_tensor.shape(); + auto nlist_shape = nlist_tensor.shape(); + auto natoms_shape = natoms_tensor.shape(); + + CHECK_INPUT_DIM(force_grad_tensor, 2); + CHECK_INPUT_DIM(net_deriv_tensor, 2); + CHECK_INPUT_DIM(in_deriv_tensor, 2); + CHECK_INPUT_DIM(nlist_tensor, 2); + CHECK_INPUT_DIM(natoms_tensor, 1); + + PD_CHECK(natoms_shape[0] >= 3, "number of atoms should be larger than (or equal to) 3"); + + const int* natoms = nullptr; + // if(natoms_tensor.place() != paddle::PlaceType::kCPU){ + // natoms = natoms_tensor.copy_to(paddle::PlaceType::kCPU).data(); + // }else{ + natoms = natoms_tensor.data(); + // } + int nframes = net_deriv_shape[0]; + int nloc = natoms[0]; + int ndescrpt = net_deriv_shape[1] / nloc; + int nnei = nlist_shape[1] / nloc; + + PD_CHECK(nframes == grad_shape[0], "number of frames should match"); + PD_CHECK(nframes == in_deriv_shape[0], "number of samples should match"); + PD_CHECK(nframes == nlist_shape[0],"number of samples should match"); + PD_CHECK(nloc * 3 == grad_shape[1], "input grad shape should be 3 x natoms"); + PD_CHECK(nloc * ndescrpt * 3 == in_deriv_shape[1], "number of descriptors should match"); + PD_CHECK(nnei == (n_a_sel + n_r_sel), "number of neighbors should match"); + + std::vector grad_net_shape {nframes, nloc * ndescrpt}; + // paddle::Tensor grad_net_tensor = paddle::Tensor(paddle::PlaceType::kCPU, grad_net_shape); + paddle::Tensor grad_net_tensor = paddle::empty( + grad_net_shape, + force_grad_tensor.dtype(), + force_grad_tensor.place() + ); + + // if(force_grad_tensor.place() == paddle::PlaceType::kCPU){ + // PD_DISPATCH_FLOATING_TYPES( + // force_grad_tensor.type(), "pd_prod_force_se_a_cpu_backward_kernel", ([&] { + // PdProdForceSeAOpCPUBackwardKernel( + // nloc, nframes, ndescrpt, nnei, + // force_grad_tensor.data(), + // net_deriv_tensor.data(), + // in_deriv_tensor.data(), + // nlist_tensor.data(), + // grad_net_tensor.mutable_data()); + // })); + // }else{ + PD_DISPATCH_FLOATING_TYPES( + force_grad_tensor.type(), "pd_prod_force_se_a_cuda_backward_kernel", ([&] { + PdProdForceSeAOpCUDABackwardKernel( + nloc, nframes, ndescrpt, nnei, + force_grad_tensor.data(), + net_deriv_tensor.data(), + in_deriv_tensor.data(), + nlist_tensor.data(), + 
+
+std::vector<paddle::Tensor> PdProdForceSeABackward(
+    const paddle::Tensor& force_grad_tensor,
+    const paddle::Tensor& net_deriv_tensor,
+    const paddle::Tensor& in_deriv_tensor,
+    const paddle::Tensor& nlist_tensor,
+    const paddle::Tensor& natoms_tensor,
+    int n_a_sel, int n_r_sel) {
+  return PdProdForceSeAOpCUDABackward(
+      force_grad_tensor, net_deriv_tensor, in_deriv_tensor, nlist_tensor,
+      natoms_tensor, n_a_sel, n_r_sel);
+}
+
+PD_BUILD_GRAD_OP(prod_force_se_a)
+    .Inputs({paddle::Grad("force"), "net_deriv", "in_deriv", "nlist",
+             "natoms"})
+    .Outputs({paddle::Grad("net_deriv")})
+    .Attrs({"n_a_sel: int", "n_r_sel: int"})
+    .SetKernelFn(PD_KERNEL(PdProdForceSeABackward));
diff --git a/source/lib/paddle_src/prod_virial.cc b/source/lib/paddle_src/prod_virial.cc
new file mode 100644
index 0000000000..8769ccf8f1
--- /dev/null
+++ b/source/lib/paddle_src/prod_virial.cc
@@ -0,0 +1,219 @@
+#include "custom_op.h"
+
+REGISTER_OP("ProdVirial")
+    .Attr("T: {float, double} = DT_DOUBLE")
+    .Input("net_deriv: T")
+    .Input("in_deriv: T")
+    .Input("rij: T")
+    .Input("nlist: int32")
+    .Input("axis: int32")
+    .Input("natoms: int32")
+    .Attr("n_a_sel: int")
+    .Attr("n_r_sel: int")
+    .Output("virial: T")
+    .Output("atom_virial: T");
+
+using namespace tensorflow;
+
+using CPUDevice = Eigen::ThreadPoolDevice;
+
+template <typename Device, typename FPTYPE>
+class ProdVirialOp : public OpKernel {
+ public:
+  explicit ProdVirialOp(OpKernelConstruction* context) : OpKernel(context) {
+    OP_REQUIRES_OK(context, context->GetAttr("n_a_sel", &n_a_sel));
+    OP_REQUIRES_OK(context, context->GetAttr("n_r_sel", &n_r_sel));
+    n_a_shift = n_a_sel * 4;
+  }
+
+  void Compute(OpKernelContext* context) override {
+    deepmd::safe_compute(
+        context, [this](OpKernelContext* context) { this->_Compute(context); });
+  }
+
+  void _Compute(OpKernelContext* context) {
+    // Grab the input tensor
+    const Tensor& net_deriv_tensor = context->input(0);
+    const Tensor& in_deriv_tensor = context->input(1);
+    const Tensor& rij_tensor = context->input(2);
+    const Tensor& nlist_tensor = context->input(3);
+    const Tensor& axis_tensor = context->input(4);
+    const Tensor& natoms_tensor = context->input(5);
+
+    // set size of the sample
+    OP_REQUIRES(context, (net_deriv_tensor.shape().dims() == 2),
+                errors::InvalidArgument("Dim of net deriv should be 2"));
+    OP_REQUIRES(context, (in_deriv_tensor.shape().dims() == 2),
+                errors::InvalidArgument("Dim of input deriv should be 2"));
+    OP_REQUIRES(context, (rij_tensor.shape().dims() == 2),
+                errors::InvalidArgument("Dim of rij should be 2"));
+    OP_REQUIRES(context, (nlist_tensor.shape().dims() == 2),
+                errors::InvalidArgument("Dim of nlist should be 2"));
+    OP_REQUIRES(context, (axis_tensor.shape().dims() == 2),
+                errors::InvalidArgument("Dim of axis should be 2"));
+    OP_REQUIRES(context, (natoms_tensor.shape().dims() == 1),
+                errors::InvalidArgument("Dim of natoms should be 1"));
+
+    OP_REQUIRES(context, (natoms_tensor.shape().dim_size(0) >= 3),
+                errors::InvalidArgument(
+                    "number of atoms should be larger than (or equal to) 3"));
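+    // By DeePMD convention natoms packs [nloc, nall, per-type counts...],
+    // which is why at least three entries are required above; only the
+    // first two entries are used here.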
+    auto natoms = natoms_tensor.flat<int>();
+
+    int nframes = net_deriv_tensor.shape().dim_size(0);
+    int nloc = natoms(0);
+    int nall = natoms(1);
+    int ndescrpt = net_deriv_tensor.shape().dim_size(1) / nloc;
+    int nnei = nlist_tensor.shape().dim_size(1) / nloc;
+
+    // check the sizes
+    OP_REQUIRES(context, (nframes == in_deriv_tensor.shape().dim_size(0)),
+                errors::InvalidArgument("number of samples should match"));
+    OP_REQUIRES(context, (nframes == rij_tensor.shape().dim_size(0)),
+                errors::InvalidArgument("number of samples should match"));
+    OP_REQUIRES(context, (nframes == nlist_tensor.shape().dim_size(0)),
+                errors::InvalidArgument("number of samples should match"));
+    OP_REQUIRES(context, (nframes == axis_tensor.shape().dim_size(0)),
+                errors::InvalidArgument("number of samples should match"));
+
+    OP_REQUIRES(context,
+                (nloc * ndescrpt * 12 == in_deriv_tensor.shape().dim_size(1)),
+                errors::InvalidArgument("number of descriptors should match"));
+    OP_REQUIRES(context, (nloc * nnei * 3 == rij_tensor.shape().dim_size(1)),
+                errors::InvalidArgument("dim of rij should be nnei * 3"));
+    OP_REQUIRES(context, (nnei == n_a_sel + n_r_sel),
+                errors::InvalidArgument("number of neighbors should match"));
+    OP_REQUIRES(
+        context, (nloc * 4 == axis_tensor.shape().dim_size(1)),
+        errors::InvalidArgument("number of axis type+id should be 2+2"));
+
+    // Create an output tensor
+    TensorShape virial_shape;
+    virial_shape.AddDim(nframes);
+    virial_shape.AddDim(9);
+    Tensor* virial_tensor = NULL;
+    OP_REQUIRES_OK(context,
+                   context->allocate_output(0, virial_shape, &virial_tensor));
+    TensorShape atom_virial_shape;
+    atom_virial_shape.AddDim(nframes);
+    atom_virial_shape.AddDim(9 * nall);
+    Tensor* atom_virial_tensor = NULL;
+    OP_REQUIRES_OK(context, context->allocate_output(1, atom_virial_shape,
+                                                     &atom_virial_tensor));
+
+    // flat the tensors
+    auto net_deriv = net_deriv_tensor.flat<FPTYPE>();
+    auto in_deriv = in_deriv_tensor.flat<FPTYPE>();
+    auto rij = rij_tensor.flat<FPTYPE>();
+    auto nlist = nlist_tensor.flat<int>();
+    auto axis = axis_tensor.flat<int>();
+    auto virial = virial_tensor->flat<FPTYPE>();
+    auto atom_virial = atom_virial_tensor->flat<FPTYPE>();
+
+    // loop over samples
+#pragma omp parallel for
+    for (int kk = 0; kk < nframes; ++kk) {
+      int net_iter = kk * nloc * ndescrpt;
+      int in_iter = kk * nloc * ndescrpt * 12;
+      int rij_iter = kk * nloc * nnei * 3;
+      int nlist_iter = kk * nloc * nnei;
+      int axis_iter = kk * nloc * 4;
+      int virial_iter = kk * 9;
+      int atom_virial_iter = kk * nall * 9;
+
+      for (int ii = 0; ii < 9; ++ii) {
+        virial(virial_iter + ii) = 0.;
+      }
+      for (int ii = 0; ii < 9 * nall; ++ii) {
+        atom_virial(atom_virial_iter + ii) = 0.;
+      }
+
+      // compute virial of a frame
+      for (int ii = 0; ii < nloc; ++ii) {
+        int i_idx = ii;
+
+        // set axes
+        int axis0_type = axis(axis_iter + i_idx * 4 + 0);
+        int axis1_type = axis(axis_iter + i_idx * 4 + 2);
+        int axis_0 = axis(axis_iter + i_idx * 4 + 1);
+        int axis_1 = axis(axis_iter + i_idx * 4 + 3);
+        if (axis0_type == 1) axis_0 += n_a_sel;
+        if (axis1_type == 1) axis_1 += n_a_sel;
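+
+        // The per-frame in_deriv holds 12 values per descriptor entry;
+        // columns 3-5 are used when the neighbor is the first axis atom,
+        // 6-8 when it is the second, and 9-11 otherwise, as the three
+        // branches below show.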
+        // deriv wrt neighbors
+        for (int jj = 0; jj < nnei; ++jj) {
+          int j_idx = nlist(nlist_iter + i_idx * nnei + jj);
+          if (j_idx < 0) continue;
+          if (jj == axis_0) {
+            for (int aa = 0; aa < ndescrpt; ++aa) {
+              FPTYPE pref = -1.0 * net_deriv(net_iter + i_idx * ndescrpt + aa);
+              for (int dd0 = 0; dd0 < 3; ++dd0) {
+                for (int dd1 = 0; dd1 < 3; ++dd1) {
+                  FPTYPE tmp_v =
+                      pref * rij(rij_iter + i_idx * nnei * 3 + jj * 3 + dd1) *
+                      in_deriv(in_iter + i_idx * ndescrpt * 12 + aa * 12 + 3 +
+                               dd0);
+                  virial(virial_iter + dd0 * 3 + dd1) += tmp_v;
+                  atom_virial(atom_virial_iter + j_idx * 9 + dd0 * 3 + dd1) +=
+                      tmp_v;
+                }
+              }
+            }
+          } else if (jj == axis_1) {
+            for (int aa = 0; aa < ndescrpt; ++aa) {
+              FPTYPE pref = -1.0 * net_deriv(net_iter + i_idx * ndescrpt + aa);
+              for (int dd0 = 0; dd0 < 3; ++dd0) {
+                for (int dd1 = 0; dd1 < 3; ++dd1) {
+                  FPTYPE tmp_v =
+                      pref * rij(rij_iter + i_idx * nnei * 3 + jj * 3 + dd1) *
+                      in_deriv(in_iter + i_idx * ndescrpt * 12 + aa * 12 + 6 +
+                               dd0);
+                  virial(virial_iter + dd0 * 3 + dd1) += tmp_v;
+                  atom_virial(atom_virial_iter + j_idx * 9 + dd0 * 3 + dd1) +=
+                      tmp_v;
+                }
+              }
+            }
+          } else {
+            int aa_start, aa_end;
+            make_descript_range(aa_start, aa_end, jj);
+            for (int aa = aa_start; aa < aa_end; ++aa) {
+              FPTYPE pref = -1.0 * net_deriv(net_iter + i_idx * ndescrpt + aa);
+              for (int dd0 = 0; dd0 < 3; ++dd0) {
+                for (int dd1 = 0; dd1 < 3; ++dd1) {
+                  FPTYPE tmp_v =
+                      pref * rij(rij_iter + i_idx * nnei * 3 + jj * 3 + dd1) *
+                      in_deriv(in_iter + i_idx * ndescrpt * 12 + aa * 12 + 9 +
+                               dd0);
+                  virial(virial_iter + dd0 * 3 + dd1) += tmp_v;
+                  atom_virial(atom_virial_iter + j_idx * 9 + dd0 * 3 + dd1) +=
+                      tmp_v;
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+
+ private:
+  int n_r_sel, n_a_sel, n_a_shift;
+  inline void make_descript_range(int& idx_start, int& idx_end,
+                                  const int& nei_idx) {
+    if (nei_idx < n_a_sel) {
+      idx_start = nei_idx * 4;
+      idx_end = nei_idx * 4 + 4;
+    } else {
+      idx_start = n_a_shift + (nei_idx - n_a_sel);
+      idx_end = n_a_shift + (nei_idx - n_a_sel) + 1;
+    }
+  }
+};
+
+#define REGISTER_CPU(T)                                                    \
+  REGISTER_KERNEL_BUILDER(                                                 \
+      Name("ProdVirial").Device(DEVICE_CPU).TypeConstraint<T>("T"),        \
+      ProdVirialOp<CPUDevice, T>);
+REGISTER_CPU(float);
+REGISTER_CPU(double);
diff --git a/source/lib/paddle_src/prod_virial.cu b/source/lib/paddle_src/prod_virial.cu
new file mode 100644
index 0000000000..fe7abee63b
--- /dev/null
+++ b/source/lib/paddle_src/prod_virial.cu
@@ -0,0 +1,496 @@
+#include <vector>
+
+#include "paddle/extension.h"
+
+#include "device.h"
+#include "prod_virial.h"
+#include "gpu_cuda.h"
+
+#define CHECK_INPUT(x) PD_CHECK(x.is_gpu(), #x " must be a GPU Tensor.")
+#define CHECK_INPUT_DIM(x, value) \
+  PD_CHECK(x.shape().size() == value, #x "'s dim should be " #value ".")
+#define CHECK_INPUT_READY(x) \
+  PD_CHECK(x.IsInitialized(), #x " must be initialized before usage.")
+
+template <typename FPTYPE, int THREADS_PER_BLOCK>
+__global__ void atom_virial_reduction(FPTYPE* virial,
+                                      const FPTYPE* atom_virial,
+                                      const int nall) {
+  unsigned int bid = blockIdx.x;
+  unsigned int tid = threadIdx.x;
+  __shared__ FPTYPE data[THREADS_PER_BLOCK];
+  data[tid] = (FPTYPE)0.;
+  for (int ii = tid; ii < nall; ii += THREADS_PER_BLOCK) {
+    data[tid] += atom_virial[ii * 9 + bid];
+  }
+  __syncthreads();
+  // do reduction in shared memory
+  for (int ii = THREADS_PER_BLOCK >> 1; ii > 0; ii >>= 1) {
+    if (tid < ii) {
+      data[tid] += data[tid + ii];
+    }
+    __syncthreads();
+  }
+  // write result for this block to global memory
+  if (tid == 0) virial[bid] = data[0];
+}
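+
+// In the neighbor kernels below, one block row per local atom along x and
+// threads cover (neighbor, virial component) pairs; contributions go into
+// the per-atom virial with atomicAdd and are then folded into the
+// 9-component frame virial by atom_virial_reduction above, whose
+// shared-memory tree reduction assumes THREADS_PER_BLOCK is a power of two.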
+
+template <typename FPTYPE>
+__global__ void virial_deriv_wrt_neighbors_a(FPTYPE* virial,
+                                             FPTYPE* atom_virial,
+                                             const FPTYPE* net_deriv,
+                                             const FPTYPE* in_deriv,
+                                             const FPTYPE* rij,
+                                             const int* nlist,
+                                             const int nloc,
+                                             const int nnei) {
+  // idx -> nloc
+  // idy -> nnei
+  // idz = dd0 * 3 + dd1
+  // dd0 = idz / 3
+  // dd1 = idz % 3
+  const int_64 idx = blockIdx.x;
+  const unsigned int idy = blockIdx.y * blockDim.x + threadIdx.x;
+  const unsigned int idz = threadIdx.y;
+  const int ndescrpt = nnei * 4;
+  if (idy >= nnei) {
+    return;
+  }
+  int j_idx = nlist[idx * nnei + idy];
+  if (j_idx < 0) {
+    return;
+  }
+  // accumulate the four descriptor entries of this neighbor locally, then
+  // issue a single atomicAdd per (neighbor, component) pair
+  FPTYPE virial_tmp = (FPTYPE)0.;
+  for (int idw = 0; idw < 4; ++idw) {
+    virial_tmp += net_deriv[idx * ndescrpt + idy * 4 + idw] *
+                  rij[idx * nnei * 3 + idy * 3 + idz % 3] *
+                  in_deriv[idx * ndescrpt * 3 + (idy * 4 + idw) * 3 + idz / 3];
+  }
+  atomicAdd(atom_virial + j_idx * 9 + idz, virial_tmp);
+}
+
+template <typename FPTYPE>
+__global__ void virial_deriv_wrt_neighbors_r(FPTYPE* virial,
+                                             FPTYPE* atom_virial,
+                                             const FPTYPE* net_deriv,
+                                             const FPTYPE* in_deriv,
+                                             const FPTYPE* rij,
+                                             const int* nlist,
+                                             const int nloc,
+                                             const int nnei) {
+  // idx -> nloc
+  // idy -> nnei
+  // idz = dd0 * 3 + dd1
+  // dd0 = idz / 3
+  // dd1 = idz % 3
+  const int_64 idx = blockIdx.x;
+  const unsigned int idy = blockIdx.y * blockDim.x + threadIdx.x;
+  const unsigned int idz = threadIdx.y;
+  const int ndescrpt = nnei * 1;
+
+  if (idy >= nnei) {
+    return;
+  }
+  int j_idx = nlist[idx * nnei + idy];
+  if (j_idx < 0) {
+    return;
+  }
+  atomicAdd(atom_virial + j_idx * 9 + idz,
+            net_deriv[idx * ndescrpt + idy] *
+                rij[idx * nnei * 3 + idy * 3 + idz % 3] *
+                in_deriv[idx * ndescrpt * 3 + idy * 3 + idz / 3]);
+}
+
+namespace deepmd {
+template <typename FPTYPE>
+void prod_virial_a_gpu_cuda(FPTYPE* virial, FPTYPE* atom_virial,
+                            const FPTYPE* net_deriv, const FPTYPE* in_deriv,
+                            const FPTYPE* rij, const int* nlist,
+                            const int nloc, const int nall, const int nnei) {
+  DPErrcheck(cudaMemset(virial, 0, sizeof(FPTYPE) * 9));
+  DPErrcheck(cudaMemset(atom_virial, 0, sizeof(FPTYPE) * 9 * nall));
+
+  const int LEN = 16;
+  int nblock = (nnei + LEN - 1) / LEN;
+  dim3 block_grid(nloc, nblock);
+  dim3 thread_grid(LEN, 9);
+  // compute virial of a frame
+  virial_deriv_wrt_neighbors_a<<<block_grid, thread_grid>>>(
+      virial, atom_virial, net_deriv, in_deriv, rij, nlist, nloc, nnei);
+  DPErrcheck(cudaGetLastError());
+  DPErrcheck(cudaDeviceSynchronize());
+  // reduction atom_virial to virial
+  atom_virial_reduction<FPTYPE, TPB><<<9, TPB>>>(virial, atom_virial, nall);
+  DPErrcheck(cudaGetLastError());
+  DPErrcheck(cudaDeviceSynchronize());
+}
+
+template <typename FPTYPE>
+void prod_virial_r_gpu_cuda(FPTYPE* virial, FPTYPE* atom_virial,
+                            const FPTYPE* net_deriv, const FPTYPE* in_deriv,
+                            const FPTYPE* rij, const int* nlist,
+                            const int nloc, const int nall, const int nnei) {
+  DPErrcheck(cudaMemset(virial, 0, sizeof(FPTYPE) * 9));
+  DPErrcheck(cudaMemset(atom_virial, 0, sizeof(FPTYPE) * 9 * nall));
+
+  const int LEN = 16;
+  int nblock = (nnei + LEN - 1) / LEN;
+  dim3 block_grid(nloc, nblock);
+  dim3 thread_grid(LEN, 9);
+  // compute virial of a frame
+  virial_deriv_wrt_neighbors_r<<<block_grid, thread_grid>>>(
+      virial, atom_virial, net_deriv, in_deriv, rij, nlist, nloc, nnei);
+  DPErrcheck(cudaGetLastError());
+  DPErrcheck(cudaDeviceSynchronize());
+  // reduction atom_virial to virial
+  atom_virial_reduction<FPTYPE, TPB><<<9, TPB>>>(virial, atom_virial, nall);
+  DPErrcheck(cudaGetLastError());
+  DPErrcheck(cudaDeviceSynchronize());
+}
+}  // namespace deepmd
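+
+// Per-frame host wrapper: strides the flat (nframes, ...) buffers frame by
+// frame and launches the single-frame implementation above for each one.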
+template <typename data_t>
+void PdProdVirialSeAOpForwardCUDAKernel(
+    int nloc, int nall, int ndescrpt, int nnei, int nframes,
+    data_t* p_virial, data_t* p_atom_virial, const data_t* p_net_deriv,
+    const data_t* p_in_deriv, const data_t* p_rij, const int* p_nlist) {
+  for (int kk = 0; kk < nframes; ++kk) {
+    data_t* virial = p_virial + kk * 9;
+    data_t* atom_virial = p_atom_virial + kk * nall * 9;
+    const data_t* net_deriv = p_net_deriv + kk * nloc * ndescrpt;
+    const data_t* in_deriv = p_in_deriv + kk * nloc * ndescrpt * 3;
+    const data_t* rij = p_rij + kk * nloc * nnei * 3;
+    const int* nlist = p_nlist + kk * nloc * nnei;
+    deepmd::prod_virial_a_gpu_cuda(virial, atom_virial, net_deriv, in_deriv,
+                                   rij, nlist, nloc, nall, nnei);
+  }
+}
+
+std::vector<paddle::Tensor> PdProdVirialSeAOpCUDAForward(
+    const paddle::Tensor& net_deriv_tensor,
+    const paddle::Tensor& in_deriv_tensor,
+    const paddle::Tensor& rij_tensor,
+    const paddle::Tensor& nlist_tensor,
+    const paddle::Tensor& natoms_tensor,
+    int n_a_sel, int n_r_sel) {
+  CHECK_INPUT(net_deriv_tensor);
+  CHECK_INPUT(in_deriv_tensor);
+  CHECK_INPUT(rij_tensor);
+  CHECK_INPUT(nlist_tensor);
+  // CHECK_INPUT(natoms_tensor);  // TODO: for now the Python side must pass
+  // natoms as a CPU tensor; copy_to from GPU returns a pointer whose data is
+  // wrong.
+
+  CHECK_INPUT_DIM(net_deriv_tensor, 2);
+  CHECK_INPUT_DIM(in_deriv_tensor, 2);
+  CHECK_INPUT_DIM(rij_tensor, 2);
+  CHECK_INPUT_DIM(nlist_tensor, 2);
+  CHECK_INPUT_DIM(natoms_tensor, 1);
+
+  PD_CHECK(natoms_tensor.shape()[0] >= 3,
+           "number of atoms should be larger than (or equal to) 3");
+  const int* natoms = natoms_tensor.data<int>();
+  int nloc = natoms[0];
+  int nall = natoms[1];
+  int nnei = nlist_tensor.shape()[1] / nloc;
+  int nframes = net_deriv_tensor.shape()[0];
+  int ndescrpt = net_deriv_tensor.shape()[1] / nloc;
+  PD_CHECK(nframes == in_deriv_tensor.shape()[0],
+           "number of samples should match");
+  PD_CHECK(nframes == rij_tensor.shape()[0], "number of samples should match");
+  PD_CHECK(nframes == nlist_tensor.shape()[0],
+           "number of samples should match");
+  PD_CHECK(nloc * ndescrpt * 3 == in_deriv_tensor.shape()[1],
+           "number of descriptors should match");
+  PD_CHECK((nloc * nnei * 3) == rij_tensor.shape()[1],
+           "dim of rij should be nnei * 3");
+
+  std::vector<int64_t> virial_shape{nframes, 9};
+  std::vector<int64_t> atom_virial_shape{nframes, 9 * nall};
+  paddle::Tensor virial_tensor =
+      paddle::Tensor(paddle::PlaceType::kGPU, virial_shape);
+  paddle::Tensor atom_virial_tensor =
+      paddle::Tensor(paddle::PlaceType::kGPU, atom_virial_shape);
+
+  PD_DISPATCH_FLOATING_TYPES(
+      net_deriv_tensor.type(), "pd_prod_virial_se_a_cuda_forward_kernel",
+      ([&] {
+        PdProdVirialSeAOpForwardCUDAKernel<data_t>(
+            nloc, nall, ndescrpt, nnei, nframes,
+            virial_tensor.mutable_data<data_t>(),
+            atom_virial_tensor.mutable_data<data_t>(),
+            net_deriv_tensor.data<data_t>(), in_deriv_tensor.data<data_t>(),
+            rij_tensor.data<data_t>(), nlist_tensor.data<int>());
+      }));
+
+  return {virial_tensor, atom_virial_tensor};
+}
+
+std::vector<paddle::Tensor> PdProdVirialSeAForward(
+    const paddle::Tensor& net_deriv_tensor,
+    const paddle::Tensor& in_deriv_tensor,
+    const paddle::Tensor& rij_tensor,
+    const paddle::Tensor& nlist_tensor,
+    const paddle::Tensor& natoms_tensor,
+    int n_a_sel, int n_r_sel) {
+  if (net_deriv_tensor.is_gpu()) {
+    return PdProdVirialSeAOpCUDAForward(net_deriv_tensor, in_deriv_tensor,
+                                        rij_tensor, nlist_tensor,
+                                        natoms_tensor, n_a_sel, n_r_sel);
+  } else {
+    PD_THROW(
+        "Unsupported device type for the forward function of the "
+        "prod_virial_se_a operator.");
+  }
+}
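+
+// The shape inference below hard-codes nall = 192 (note the /*natoms[1]*/
+// hint), presumably the size of the system it was tested on; the
+// atom_virial shape will be wrong for any other system until nall is
+// derived from natoms at runtime.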
+std::vector<std::vector<int64_t>> PdProdVirialSeAInferShape(
+    std::vector<int64_t> net_deriv_shape,
+    std::vector<int64_t> in_deriv_shape,
+    std::vector<int64_t> rij_shape,
+    std::vector<int64_t> nlist_shape,
+    std::vector<int64_t> natoms_shape,
+    const int& n_a_sel,
+    const int& n_r_sel) {
+  // int64_t nloc = /*natoms[0]*/ 192;
+  int64_t nall = /*natoms[1]*/ 192;
+  int64_t nframes = net_deriv_shape[0];
+
+  std::vector<int64_t> virial_shape = {nframes, 9};
+  std::vector<int64_t> atom_virial_shape = {nframes, 9 * nall};
+
+  return {virial_shape, atom_virial_shape};
+}
+
+std::vector<paddle::DataType> PdProdVirialSeAInferDtype(
+    paddle::DataType net_deriv_dtype,
+    paddle::DataType in_deriv_dtype,
+    paddle::DataType rij_dtype,
+    paddle::DataType nlist_dtype,
+    paddle::DataType natoms_dtype) {
+  return {net_deriv_dtype, net_deriv_dtype};
+}
+
+PD_BUILD_OP(prod_virial_se_a)
+    .Inputs({"net_deriv", "in_deriv", "rij", "nlist", "natoms"})
+    .Outputs({"virial", "atom_virial"})
+    .Attrs({"n_a_sel: int", "n_r_sel: int"})
+    .SetKernelFn(PD_KERNEL(PdProdVirialSeAForward))
+    .SetInferShapeFn(PD_INFER_SHAPE(PdProdVirialSeAInferShape))
+    .SetInferDtypeFn(PD_INFER_DTYPE(PdProdVirialSeAInferDtype));
diff --git a/source/lib/paddle_src/prod_virial.h b/source/lib/paddle_src/prod_virial.h
new file mode 100644
index 0000000000..c51e333a47
--- /dev/null
+++ b/source/lib/paddle_src/prod_virial.h
@@ -0,0 +1,75 @@
+#pragma once
+
+namespace deepmd {
+
+template <typename FPTYPE>
+void prod_virial_a_cpu(FPTYPE* virial, FPTYPE* atom_virial,
+                       const FPTYPE* net_deriv, const FPTYPE* env_deriv,
+                       const FPTYPE* rij, const int* nlist,
+                       const int nloc, const int nall, const int nnei);
+
+template <typename FPTYPE>
+void prod_virial_r_cpu(FPTYPE* virial, FPTYPE* atom_virial,
+                       const FPTYPE* net_deriv, const FPTYPE* env_deriv,
+                       const FPTYPE* rij, const int* nlist,
+                       const int nloc, const int nall, const int nnei);
+
+#if GOOGLE_CUDA
+template <typename FPTYPE>
+void prod_virial_a_gpu_cuda(FPTYPE* virial, FPTYPE* atom_virial,
+                            const FPTYPE* net_deriv, const FPTYPE* env_deriv,
+                            const FPTYPE* rij, const int* nlist,
+                            const int nloc, const int nall, const int nnei);
+
+template <typename FPTYPE>
+void prod_virial_r_gpu_cuda(FPTYPE* virial, FPTYPE* atom_virial,
+                            const FPTYPE* net_deriv, const FPTYPE* env_deriv,
+                            const FPTYPE* rij, const int* nlist,
+                            const int nloc, const int nall, const int nnei);
+#endif  // GOOGLE_CUDA
+
+#if TENSORFLOW_USE_ROCM
+template <typename FPTYPE>
+void prod_virial_a_gpu_rocm(FPTYPE* virial, FPTYPE* atom_virial,
+                            const FPTYPE* net_deriv, const FPTYPE* env_deriv,
+                            const FPTYPE* rij, const int* nlist,
+                            const int nloc, const int nall, const int nnei);
+
+template <typename FPTYPE>
+void prod_virial_r_gpu_rocm(FPTYPE* virial, FPTYPE* atom_virial,
+                            const FPTYPE* net_deriv, const FPTYPE* env_deriv,
+                            const FPTYPE* rij, const int* nlist,
+                            const int nloc, const int nall, const int nnei);
+#endif  // TENSORFLOW_USE_ROCM
+
+}  // namespace deepmd
diff --git a/source/lib/paddle_src/prod_virial_grad.cc b/source/lib/paddle_src/prod_virial_grad.cc
new file mode 100644
index 0000000000..14ba158cc1
--- /dev/null
+++ b/source/lib/paddle_src/prod_virial_grad.cc
@@ -0,0 +1,138 @@
+#include "prod_virial_grad.h"
+
+#include "errors.h"
+
+inline void make_index_range(int& idx_start, int& idx_end,
+                             const int& nei_idx, const int& nnei) {
+  if (nei_idx < nnei) {
+    idx_start = nei_idx * 4;
+    idx_end = nei_idx * 4 + 4;
+  } else {
+    throw deepmd::deepmd_exception("should not reach here");
+  }
+}
+
+template <typename FPTYPE>
+void deepmd::prod_virial_grad_a_cpu(FPTYPE* grad_net,
+                                    const FPTYPE* grad,
+                                    const FPTYPE* env_deriv,
+                                    const FPTYPE* rij,
+                                    const int* nlist,
+                                    const int nloc,
+                                    const int nnei) {
+  const int ndescrpt = nnei * 4;
+
+  // reset the frame to 0
+  for (int ii = 0; ii < nloc; ++ii) {
+    for (int aa = 0; aa < ndescrpt; ++aa) {
+      grad_net[ii * ndescrpt + aa] = 0;
+    }
+  }
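+
+  // What the loop below computes, for local atom i, neighbor jj and
+  // descriptor entry aa:
+  //   grad_net[i][aa] += sum_{dd0,dd1} grad[dd0*3+dd1]
+  //                      * rij[i][jj][dd1] * env_deriv[i][aa][dd0]
+  // written as "-= -1.0 * ..." apparently to mirror the sign convention of
+  // the TensorFlow implementation of this gradient.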
+// compute grad of one frame
+#pragma omp parallel for
+  for (int ii = 0; ii < nloc; ++ii) {
+    int i_idx = ii;
+
+    // loop over neighbors
+    for (int jj = 0; jj < nnei; ++jj) {
+      int j_idx = nlist[i_idx * nnei + jj];
+      if (j_idx < 0) continue;
+      int aa_start, aa_end;
+      make_index_range(aa_start, aa_end, jj, nnei);
+      for (int aa = aa_start; aa < aa_end; ++aa) {
+        for (int dd0 = 0; dd0 < 3; ++dd0) {
+          for (int dd1 = 0; dd1 < 3; ++dd1) {
+            grad_net[i_idx * ndescrpt + aa] -=
+                -1.0 * grad[dd0 * 3 + dd1] *
+                rij[i_idx * nnei * 3 + jj * 3 + dd1] *
+                env_deriv[i_idx * ndescrpt * 3 + aa * 3 + dd0];
+          }
+        }
+      }
+    }
+  }
+}
+
+template void deepmd::prod_virial_grad_a_cpu<double>(
+    double* grad_net, const double* grad, const double* env_deriv,
+    const double* rij, const int* nlist, const int nloc, const int nnei);
+
+template void deepmd::prod_virial_grad_a_cpu<float>(
+    float* grad_net, const float* grad, const float* env_deriv,
+    const float* rij, const int* nlist, const int nloc, const int nnei);
+
+template <typename FPTYPE>
+void deepmd::prod_virial_grad_r_cpu(FPTYPE* grad_net,
+                                    const FPTYPE* grad,
+                                    const FPTYPE* env_deriv,
+                                    const FPTYPE* rij,
+                                    const int* nlist,
+                                    const int nloc,
+                                    const int nnei)
+//
+//  grad_net:  nloc x ndescrpt
+//  grad:      9
+//  env_deriv: nloc x ndescrpt x 3
+//  rij:       nloc x nnei x 3
+//  nlist:     nloc x nnei
+//
+{
+  const int ndescrpt = nnei * 1;
+
+  // reset the frame to 0
+  for (int ii = 0; ii < nloc; ++ii) {
+    for (int aa = 0; aa < ndescrpt; ++aa) {
+      grad_net[ii * ndescrpt + aa] = 0;
+    }
+  }
+
+// compute grad of one frame
+#pragma omp parallel for
+  for (int ii = 0; ii < nloc; ++ii) {
+    int i_idx = ii;
+
+    // loop over neighbors
+    for (int jj = 0; jj < nnei; ++jj) {
+      int j_idx = nlist[i_idx * nnei + jj];
+      if (j_idx < 0) continue;
+      for (int dd0 = 0; dd0 < 3; ++dd0) {
+        for (int dd1 = 0; dd1 < 3; ++dd1) {
+          grad_net[i_idx * ndescrpt + jj] -=
+              -1.0 * grad[dd0 * 3 + dd1] *
+              rij[i_idx * nnei * 3 + jj * 3 + dd1] *
+              env_deriv[i_idx * ndescrpt * 3 + jj * 3 + dd0];
+        }
+      }
+    }
+  }
+}
+
+template void deepmd::prod_virial_grad_r_cpu<double>(
+    double* grad_net, const double* grad, const double* env_deriv,
+    const double* rij, const int* nlist, const int nloc, const int nnei);
+
+template void deepmd::prod_virial_grad_r_cpu<float>(
+    float* grad_net, const float* grad, const float* env_deriv,
+    const float* rij, const int* nlist, const int nloc, const int nnei);
diff --git a/source/lib/paddle_src/prod_virial_grad.cu b/source/lib/paddle_src/prod_virial_grad.cu
new file mode 100644
index 0000000000..6205c4bdf8
--- /dev/null
+++ b/source/lib/paddle_src/prod_virial_grad.cu
@@ -0,0 +1,536 @@
+#include <vector>
+
+#include "paddle/extension.h"
+
+#include "device.h"
+#include "prod_virial.h"
+#include "gpu_cuda.h"
+
+#define CHECK_INPUT(x) PD_CHECK(x.is_gpu(), #x " must be a GPU Tensor.")
+#define CHECK_INPUT_DIM(x, value) \
+  PD_CHECK(x.shape().size() == value, #x "'s dim should be " #value ".")
+#define CHECK_INPUT_READY(x) \
+  PD_CHECK(x.IsInitialized(), #x " must be initialized before usage.")
+
+/* The backward (gradient) implementation follows. */
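+
+// The virial gradient is a 9-component tensor; each (neighbor, descriptor
+// slot) pair contributes the 3x3 outer product of rij and env_deriv, which
+// is contracted against that 9-vector. dev_dot9 below performs the
+// 9-element dot product.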
+template <typename FPTYPE>
+__device__ inline FPTYPE dev_dot9(const FPTYPE* arr1, const FPTYPE* arr2) {
+  FPTYPE result = (FPTYPE)0.0;
+  for (int ii = 0; ii < 9; ii++) {
+    result += arr1[ii] * arr2[ii];
+  }
+  return result;
+}
+
+template <typename FPTYPE>
+__global__ void virial_grad_wrt_neighbors_a(FPTYPE* grad_net,
+                                            const FPTYPE* grad,
+                                            const FPTYPE* env_deriv,
+                                            const FPTYPE* rij,
+                                            const int* nlist,
+                                            const int nloc,
+                                            const int nnei) {
+  // idy -> nnei
+  const unsigned int tid = threadIdx.x;
+  const int_64 idx = blockIdx.x * blockDim.x + tid;
+  const unsigned int idy = blockIdx.y;
+  const unsigned int idw = threadIdx.y;
+  const int ndescrpt = nnei * 4;
+  __shared__ FPTYPE grad_one[9];
+  if (tid < 9) {
+    grad_one[tid] = grad[tid];
+  }
+  __syncthreads();
+  if (idx >= nloc) {
+    return;
+  }
+  int j_idx = nlist[idx * nnei + idy];
+  if (j_idx < 0) {
+    return;
+  }
+  FPTYPE tmp[9];
+  for (int dd0 = 0; dd0 < 3; ++dd0) {
+    for (int dd1 = 0; dd1 < 3; ++dd1) {
+      tmp[dd0 * 3 + dd1] =
+          rij[idx * nnei * 3 + idy * 3 + dd1] *
+          env_deriv[idx * ndescrpt * 3 + idy * 4 * 3 + idw * 3 + dd0];
+    }
+  }
+  grad_net[idx * ndescrpt + idy * 4 + idw] -=
+      (FPTYPE)-1.0 * dev_dot9(grad_one, tmp);
+}
+
+namespace deepmd {
+template <typename FPTYPE>
+void prod_virial_grad_a_gpu_cuda(FPTYPE* grad_net,
+                                 const FPTYPE* grad,
+                                 const FPTYPE* env_deriv,
+                                 const FPTYPE* rij,
+                                 const int* nlist,
+                                 const int nloc,
+                                 const int nnei) {
+  const int ndescrpt = nnei * 4;
+  DPErrcheck(cudaMemset(grad_net, 0, sizeof(FPTYPE) * nloc * ndescrpt));
+  const int LEN = 128;
+  const int nblock = (nloc + LEN - 1) / LEN;
+  dim3 block_grid(nblock, nnei);
+  dim3 thread_grid(LEN, 4);
+  virial_grad_wrt_neighbors_a<<<block_grid, thread_grid>>>(
+      grad_net, grad, env_deriv, rij, nlist, nloc, nnei);
+  DPErrcheck(cudaGetLastError());
+  DPErrcheck(cudaDeviceSynchronize());
+}
+
+template void prod_virial_grad_a_gpu_cuda<float>(
+    float* grad_net, const float* grad, const float* env_deriv,
+    const float* rij, const int* nlist, const int nloc, const int nnei);
+template void prod_virial_grad_a_gpu_cuda<double>(
+    double* grad_net, const double* grad, const double* env_deriv,
+    const double* rij, const int* nlist, const int nloc, const int nnei);
+}  // namespace deepmd
+
+// Note: the name is kept from the force op, but this is the virial backward
+// host kernel; it strides the per-frame buffers and launches the CUDA
+// implementation above.
+template <typename data_t>
+void PdProdForceSeAOpGPUBackwardKernel(
+    int nloc, int nframes, int ndescrpt, int nnei,
+    const data_t* virial_grad, const data_t* net_deriv,
+    const data_t* in_deriv, const data_t* rij, const int* nlist,
+    data_t* grad_net) {
+  data_t* p_grad_net = grad_net;
+  const data_t* p_grad = virial_grad;
+  const data_t* p_in_deriv = in_deriv;
+  const data_t* p_rij = rij;
+  const int* p_nlist = nlist;
+  for (int_64 kk = 0; kk < nframes; ++kk) {
+    data_t* grad_net = p_grad_net + kk * nloc * ndescrpt;
+    const data_t* virial_grad = p_grad + kk * 9;
+    const data_t* in_deriv = p_in_deriv + kk * nloc * ndescrpt * 3;
+    const data_t* rij = p_rij + kk * nloc * nnei * 3;
+    const int* nlist = p_nlist + kk * nloc * nnei;
+    deepmd::prod_virial_grad_a_gpu_cuda(grad_net, virial_grad, in_deriv, rij,
+                                        nlist, nloc, nnei);
+  }
+}
+
+std::vector<paddle::Tensor> PdProdVirialSeAOpCUDABackward(
+    const paddle::Tensor& virial_grad_tensor,
+    const paddle::Tensor& net_deriv_tensor,
+    const paddle::Tensor& in_deriv_tensor,
+    const paddle::Tensor& rij_tensor,
+    const paddle::Tensor& nlist_tensor,
+    const paddle::Tensor& natoms_tensor,
+    int n_a_sel, int n_r_sel) {
+  auto grad_shape = virial_grad_tensor.shape();
+  auto net_deriv_shape = net_deriv_tensor.shape();
+  auto in_deriv_shape = in_deriv_tensor.shape();
+  auto rij_shape = rij_tensor.shape();
+  auto nlist_shape = nlist_tensor.shape();
+  auto natoms_shape = natoms_tensor.shape();
+
+  CHECK_INPUT_DIM(virial_grad_tensor, 2);
+  CHECK_INPUT_DIM(net_deriv_tensor, 2);
+  CHECK_INPUT_DIM(in_deriv_tensor, 2);
+  CHECK_INPUT_DIM(rij_tensor, 2);
+  CHECK_INPUT_DIM(nlist_tensor, 2);
+  CHECK_INPUT_DIM(natoms_tensor, 1);
+
+  PD_CHECK(natoms_shape[0] >= 3,
+           "number of atoms should be larger than (or equal to) 3");
+
+  // natoms is expected to be a CPU tensor; see the TODO on the forward op.
+  const int* natoms = natoms_tensor.data<int>();
+  int nframes = net_deriv_shape[0];
+  int nloc = natoms[0];
+  int ndescrpt = net_deriv_shape[1] / nloc;
+  int nnei = nlist_shape[1] / nloc;
+
+  PD_CHECK(nframes == grad_shape[0], "number of frames should match");
+  PD_CHECK(nframes == in_deriv_shape[0], "number of samples should match");
+  PD_CHECK(nframes == rij_shape[0], "number of frames should match");
+  PD_CHECK(nframes == nlist_shape[0], "number of samples should match");
+  PD_CHECK(9 == grad_shape[1], "input grad shape should be nframes x 9");
+  PD_CHECK(nloc * ndescrpt * 3 == in_deriv_shape[1],
+           "number of descriptors should match");
+  PD_CHECK(nloc * nnei * 3 == rij_shape[1], "dim of rij should be nnei * 3");
+  PD_CHECK(nnei == (n_a_sel + n_r_sel), "number of neighbors should match");
+
+  std::vector<int64_t> grad_net_shape{nframes, nloc * ndescrpt};
+  paddle::Tensor grad_net_tensor = paddle::empty(
+      grad_net_shape, virial_grad_tensor.dtype(), virial_grad_tensor.place());
+
+  PD_DISPATCH_FLOATING_TYPES(
+      virial_grad_tensor.type(), "pd_prod_virial_se_a_cuda_backward_kernel",
+      ([&] {
+        PdProdForceSeAOpGPUBackwardKernel<data_t>(
+            nloc, nframes, ndescrpt, nnei,
+            virial_grad_tensor.data<data_t>(), net_deriv_tensor.data<data_t>(),
+            in_deriv_tensor.data<data_t>(), rij_tensor.data<data_t>(),
+            nlist_tensor.data<int>(), grad_net_tensor.mutable_data<data_t>());
+      }));
+  return {grad_net_tensor};
+}
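+
+// Only the gradient w.r.t. net_deriv is produced; the remaining inputs are
+// treated as constants by the grad op registered below.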
+std::vector<paddle::Tensor> PdProdVirialSeABackward(
+    const paddle::Tensor& virial_grad_tensor,
+    const paddle::Tensor& net_deriv_tensor,
+    const paddle::Tensor& in_deriv_tensor,
+    const paddle::Tensor& rij_tensor,
+    const paddle::Tensor& nlist_tensor,
+    const paddle::Tensor& natoms_tensor,
+    int n_a_sel, int n_r_sel) {
+  return PdProdVirialSeAOpCUDABackward(
+      virial_grad_tensor, net_deriv_tensor, in_deriv_tensor, rij_tensor,
+      nlist_tensor, natoms_tensor, n_a_sel, n_r_sel);
+}
+
+PD_BUILD_GRAD_OP(prod_virial_se_a)
+    .Inputs({paddle::Grad("virial"), "net_deriv", "in_deriv", "rij", "nlist",
+             "natoms"})
+    .Outputs({paddle::Grad("net_deriv")})
+    .Attrs({"n_a_sel: int", "n_r_sel: int"})
+    .SetKernelFn(PD_KERNEL(PdProdVirialSeABackward));
diff --git a/source/lib/paddle_src/prod_virial_grad.h b/source/lib/paddle_src/prod_virial_grad.h
new file mode 100644
index 0000000000..0e2cc46baa
--- /dev/null
+++ b/source/lib/paddle_src/prod_virial_grad.h
@@ -0,0 +1,63 @@
+#pragma once
+
+namespace deepmd {
+
+template <typename FPTYPE>
+void prod_virial_grad_a_cpu(FPTYPE* grad_net, const FPTYPE* grad,
+                            const FPTYPE* env_deriv, const FPTYPE* rij,
+                            const int* nlist, const int nloc, const int nnei);
+
+template <typename FPTYPE>
+void prod_virial_grad_r_cpu(FPTYPE* grad_net, const FPTYPE* grad,
+                            const FPTYPE* env_deriv, const FPTYPE* rij,
+                            const int* nlist, const int nloc, const int nnei);
+
+#if GOOGLE_CUDA
+template <typename FPTYPE>
+void prod_virial_grad_a_gpu_cuda(FPTYPE* grad_net, const FPTYPE* grad,
+                                 const FPTYPE* env_deriv, const FPTYPE* rij,
+                                 const int* nlist, const int nloc,
+                                 const int nnei);
+
+template <typename FPTYPE>
+void prod_virial_grad_r_gpu_cuda(FPTYPE* grad_net, const FPTYPE* grad,
+                                 const FPTYPE* env_deriv, const FPTYPE* rij,
+                                 const int* nlist, const int nloc,
+                                 const int nnei);
+#endif  // GOOGLE_CUDA
+
+#if TENSORFLOW_USE_ROCM
+template <typename FPTYPE>
+void prod_virial_grad_a_gpu_rocm(FPTYPE* grad_net, const FPTYPE* grad,
+                                 const FPTYPE* env_deriv, const FPTYPE* rij,
+                                 const int* nlist, const int nloc,
+                                 const int nnei);
+
+template <typename FPTYPE>
+void prod_virial_grad_r_gpu_rocm(FPTYPE* grad_net, const FPTYPE* grad,
+                                 const FPTYPE* env_deriv, const FPTYPE* rij,
+                                 const int* nlist, const int nloc,
+                                 const int nnei);
+#endif  // TENSORFLOW_USE_ROCM
+
+}  // namespace deepmd
diff --git a/source/lib/paddle_src/setup_ins.py b/source/lib/paddle_src/setup_ins.py
new file mode 100644
index 0000000000..f510bd62f4
--- /dev/null
+++ b/source/lib/paddle_src/setup_ins.py
@@ -0,0 +1,54 @@
+from paddle.utils import cpp_extension
+
+cpp_extension.setup(
+    name="paddle_deepmd_lib",
+    ext_modules=cpp_extension.CppExtension(
+        sources=[
+            "../src/coord.cc",
+            "../src/env_mat_nvnmd.cc",
+            "../src/env_mat.cc",
+            "../src/ewald.cc",
+            "../src/fmt_nlist.cc",
+            "../src/gelu.cc",
+            "../src/map_aparam.cc",
+            "../src/neighbor_list.cc",
+            "../src/pair_tab.cc",
+            "../src/prod_env_mat_nvnmd.cc",
+            "../src/prod_env_mat.cc",
+            # "../src/prod_force_grad.cc",
+            # "../src/prod_force.cc",
+            # "../src/prod_virial_grad.cc",
+            # "../src/prod_virial.cc",
+            "../src/region.cc",
+            "../src/SimulationRegion.cpp",
+            "../src/soft_min_switch_force_grad.cc",
+            "../src/soft_min_switch_force.cc",
+            "../src/soft_min_switch_virial_grad.cc",
+            "../src/soft_min_switch_virial.cc",
+            "../src/soft_min_switch.cc",
+            "../src/tabulate.cc",
+            "../src/utilities.cc",
+            "../src/cuda/coord.cu",
+            "../src/cuda/gelu.cu",
+            "../src/cuda/neighbor_list.cu",
+            # "../src/cuda/prod_force_grad.cu",
+            # "../src/cuda/prod_force.cu",
"../src/cuda/prod_virial_grad.cu", + # "../src/cuda/prod_virial.cu", + "../src/cuda/region.cu", + "../src/cuda/tabulate.cu", + "./prod_env_mat.cu", + "./prod_virial_grad.cu", + "./prod_virial_grad.cc", + "./prod_virial.cu", + "./prod_force.cu", + # "./prod_force_grad.cc", + "./prod_force_grad.cu", + "./neighbor_stat.cu", + ], + include_dirs=[ + "/workspace/hesensen/deepmd_backend/deepmd-kit-tf/source/lib/include" + ], + library_dirs=["/usr/local/cuda-11/lib64"], + ), +)