diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 1922ea33f8..0fb064c9db 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -48,10 +48,10 @@ repos:
     -   id: clang-format
         exclude: ^source/3rdparty|source/lib/src/cuda/cudart/.+\.inc
 # CSS
--   repo: https://github.com/pre-commit/mirrors-csslint
-    rev: v1.0.5
-    hooks:
-    -   id: csslint
+# -   repo: https://github.com/pre-commit/mirrors-csslint
+#     rev: v1.0.5
+#     hooks:
+#     -   id: csslint
 # Shell
 - repo: https://github.com/scop/pre-commit-shfmt
   rev: v3.6.0-2
diff --git a/README.md b/README.md
index 4cb4fbb24a..40dad38b23 100644
--- a/README.md
+++ b/README.md
@@ -1,9 +1,198 @@
+# DeePMD-kit (PaddlePaddle backend)
+
+> [!IMPORTANT]
+> 本项目为 DeePMD-kit 的 PaddlePaddle 版本，主要修改了部分代码，使其可以运行在 PaddlePaddle 上。目前支持的功能包括 water_se_e2_a 案例的单卡 GPU 训练、单卡 GPU 评估、导出静态图模型、接入 LAMMPS(GPU) 推理共 4 部分。
+
+## 1. 环境安装
+
+1. 安装 tensorflow 2.12
+
+    由于 DeePMD-kit 大量代码基于 tensorflow 编写，暂时没有完全迁移到 PaddlePaddle 上，因此运行前需要安装 tensorflow 2.12。
+
+    ``` sh
+    # Current stable release for CPU and GPU(CPU和GPU使用同一个命令，不再以安装tensorflow-gpu的形式安装GPU版本)
+    pip install tensorflow==2.12 -i https://pypi.tuna.tsinghua.edu.cn/simple
+    ```
+
+2. 安装 paddlepaddle-develop
+
+    参考 [Paddle 官网](https://www.paddlepaddle.org.cn/install/quick?docurl=/documentation/docs/zh/develop/install/pip/linux-pip.html)，安装对应机器环境的 GPU 版 paddlepaddle-develop
+
+3. 安装 deepmd-kit
+
+
+    ``` sh
+    git clone https://github.com/HydrogenSulfate/deepmd-kit.git -b add_ddle_backend_polish_ver
+    cd deepmd-kit
+    # 以 editable 的方式安装，方便调试
+    pip install -e . -i https://pypi.tuna.tsinghua.edu.cn/simple
+    ```
+
+## 2. 运行具体功能
+
+### 2.1 安装 python 自定义算子
+
+**在运行训练、评估、导出静态图模型这 3 个功能之前，需要先安装 python 端的自定义算子库 `paddle_deepmd_lib`**，LAMMPS 推理功能由于单独使用自定义算子的源代码进行联合编译，因此不需要安装 python 端的自定义算子。
+
+``` sh
+cd ./source/lib/paddle_src
+python setup_ins.py install
+```
+
+安装完毕之后建议运行如下命令测试一下 python 端自定义算子在 CPU、GPU 上的正确性：
+
+``` sh
+wget -nc https://paddle-org.bj.bcebos.com/paddlescience/deepmd/deepmd_custom_op_test_data.tar
+tar -xf deepmd_custom_op_test_data.tar
+export UNITTEST_DIR=$PWD/deepmd_custom_op_test_data
+python ./custom_op_test.py
+```
+
+除少量 `deprecated` 相关的警告外，如果输出全部都是 True，则说明 python 端自定义算子安装成功并且运行正常。
+
+### 2.2 训练
+
+``` sh
+# 进入案例目录
+cd examples/water/se_e2_a
+# 运行 GPU 训练
+dp train ./input.json
+
+# 运行 CPU 训练(速度极慢，不推荐运行，仅作为跑通测试)
+dp train ./input.json --cpu
+```
+
+### 2.3 评估
+
+``` sh
+# 进入案例目录
+cd examples/water/se_e2_a
+# 设置好权重文件路径
+WEIGHT_PATH="path/to/your_model.pdparams"
+# 运行评估
+dp test -m ${WEIGHT_PATH} -s ../data/data_3/ -n 30
+```
+
+### 2.4 导出静态图模型
+
+``` sh
+# 进入案例目录
+cd examples/water/se_e2_a
+# 设置权重文件路径
+WEIGHT_PATH="path/to/your_model.pdparams"
+# 设置导出的静态图模型路径前缀(不需要加.pdmodel或.pdiparams后缀)
+DUMP_PATH="path/to/your_dump"
+# 导出静态图模型
+dp freeze -i ${WEIGHT_PATH} -o ${DUMP_PATH}
+```
+
+### 2.5 在 LAMMPS(GPU) 中推理
+
+1. 修改 `examples/water/lmp/in.lammps` 文件，将 `pair_style deepmd` 后面的路径改为 **2.4 导出静态图模型** 这一章节内设置好的 DUMP_PATH 的值
+
+    ``` text
+    pair_style  deepmd "path/to/your_dump"
+    ```
+
+2. 编译 Paddle，得到未裁剪算子的 Paddle 推理库(LAMMPS 推理涉及到 `xxx_grad` 反向算子，因而在此需要手动编译 Paddle，得到未裁剪的 Paddle 推理库)
+
+    ``` sh
+    git clone https://github.com/PaddlePaddle/Paddle.git -b develop
+    cd Paddle
+    mkdir build
+    cd build
+    # 推荐使用 Anaconda 安装 python3.9 环境，并在该环境下执行编译命令
+    cmake .. -DPY_VERSION=3.9 -DWITH_GPU=ON -DWITH_DISTRIBUTE=ON -DWITH_TESTING=ON -DCMAKE_BUILD_TYPE=Release
+    make -j$(nproc)
+
+    # 编译完成后，确认 paddle_inference_install_dir 推理库是否存在
+    ls paddle_inference_install_dir
+    ```
+
+3. Paddle 推理库和 LAMMPS 联合编译安装，并运行推理
+
+    ``` sh
+    # 下载并解压 lammps 源码
+    wget https://github.com/lammps/lammps/archive/stable_2Aug2023_update1.tar.gz
+    tar xf stable_2Aug2023_update1.tar.gz
+    # LAMMPS_DIR 设置为 LAMMPS 的安装目录
+    export LAMMPS_DIR="/path/to/lammps-stable_2Aug2023_update1"
+
+    # 设置推理时的 GPU 卡号
+    export CUDA_VISIBLE_DEVICES=0
+    # PADDLE_DIR 设置为第二步 clone下来的 Paddle 目录
+    export PADDLE_DIR="/path/to/Paddle"
+    # DEEPMD_DIR 设置为本项目的根目录
+    export DEEPMD_DIR="/path/to/deepmd-kit"
+    # PADDLE_INFERENCE_DIR 设置为第二步编译得到的 Paddle 推理库目录
+    export PADDLE_INFERENCE_DIR="/path/to/paddle_inference_install_dir"
+    # TENSORFLOW_DIR 设置为 tensorflow 的安装目录，可用 pip show tensorflow 确定
+    export TENSORFLOW_DIR="/path/to/tensorflow"
+
+    export LD_LIBRARY_PATH=${PADDLE_DIR}/paddle/fluid/pybind/:$LD_LIBRARY_PATH
+    export LD_LIBRARY_PATH=${DEEPMD_DIR}/deepmd/op:$LD_LIBRARY_PATH
+    export LD_LIBRARY_PATH=${PADDLE_INFERENCE_DIR}/paddle/lib:$LD_LIBRARY_PATH
+    export LD_LIBRARY_PATH=${PADDLE_INFERENCE_DIR}/third_party/install/mkldnn/lib:$LD_LIBRARY_PATH
+    export LD_LIBRARY_PATH=${PADDLE_INFERENCE_DIR}/third_party/install/mklml/lib:$LD_LIBRARY_PATH
+    export LD_LIBRARY_PATH=${DEEPMD_DIR}/source/build:$LD_LIBRARY_PATH
+    export LIBRARY_PATH=${DEEPMD_DIR}/deepmd/op:$LIBRARY_PATH
+
+    cd ${DEEPMD_DIR}/source
+    # rm -rf build # 若改动CMakeLists.txt，则需要打开该注释
+    mkdir build
+    cd build
+
+    # DEEPMD_INSTALL_DIR 设置为 deepmd-lammps 的目标安装目录，可自行设置任意路径
+    export DEEPMD_INSTALL_DIR="/path/to/deepmd_root"
+
+    # 开始编译
+    cmake -DCMAKE_INSTALL_PREFIX=${DEEPMD_INSTALL_DIR} -DPADDLE_ROOT=${PADDLE_INFERENCE_DIR} \
+        -DUSE_CUDA_TOOLKIT=TRUE \
+        -DTENSORFLOW_ROOT=${TENSORFLOW_DIR} \
+        -DPADDLE_LIB=${PADDLE_INFERENCE_DIR} \
+        -DFLOAT_PREC=low ..
+    make -j4 && make install
+    make lammps
+
+    cd ${LAMMPS_DIR}/src/
+    \cp -r ${DEEPMD_DIR}/source/build/USER-DEEPMD .
+    make yes-kspace
+    make yes-extra-fix
+    make yes-user-deepmd
+    make serial -j
+    export PATH=${LAMMPS_DIR}/src:$PATH
+
+    cd ${DEEPMD_DIR}/examples/water/lmp
+
+    lmp_serial -in in.lammps
+    ```
+
+4. [可选]直接运行推理
+
+    若已完成 **3. Paddle 推理库和 LAMMPS 联合编译安装，并运行推理**，且没有对 C++ 代码进行修改，则无需重新联合编译 Paddle 推理库和 LAMMPS，直接运行以下命令即可开始推理。
+
+    ``` sh
+    # 设置推理时的 GPU 卡号
+    export CUDA_VISIBLE_DEVICES=0
+    # LAMMPS_DIR 设置为 LAMMPS 的安装目录
+    export LAMMPS_DIR="/path/to/lammps-stable_2Aug2023_update1"
+
+    cd ${LAMMPS_DIR}/src/
+    export PATH=${LAMMPS_DIR}/src:$PATH
+    # 注意: DEEPMD_DIR 需已设置为本项目的根目录(与第 3 步相同)
+    cd ${DEEPMD_DIR}/examples/water/lmp
+
+    lmp_serial -in in.lammps
+    ```
+
+--------------------------------------------------------------------------------
+
 [<picture><source media="(prefers-color-scheme: dark)" srcset="./doc/_static/logo-dark.svg"><source media="(prefers-color-scheme: light)" srcset="./doc/_static/logo.svg"><img alt="DeePMD-kit logo" src="./doc/_static/logo.svg"></picture>](./doc/logo.md)
 
 --------------------------------------------------------------------------------
 
-<span style="font-size:larger;">DeePMD-kit Manual</span>
-========
+# DeePMD-kit Manual
+
 [![GitHub release](https://img.shields.io/github/release/deepmodeling/deepmd-kit.svg?maxAge=86400)](https://github.com/deepmodeling/deepmd-kit/releases)
 [![doi:10.1016/j.cpc.2018.03.016](https://img.shields.io/badge/DOI-10.1016%2Fj.cpc.2018.03.016-blue)](https://doi.org/10.1016/j.cpc.2020.107206)
 [![Citations](https://citations.njzjz.win/10.1016/j.cpc.2018.03.016)](https://badge.dimensions.ai/details/doi/10.1016/j.cpc.2018.03.016)
@@ -14,46 +203,55 @@
 [![Documentation Status](https://readthedocs.org/projects/deepmd/badge/)](https://deepmd.readthedocs.io/)
 
 # Table of contents
+
 - [About DeePMD-kit](#about-deepmd-kit)
- 	- [Highlights in v2.0](#highlights-in-deepmd-kit-v2.0)
- 	- [Highlighted features](#highlighted-features)
- 	- [License and credits](#license-and-credits)
- 	- [Deep Potential in a nutshell](#deep-potential-in-a-nutshell)
+  - [Highlights in v2.0](#highlights-in-deepmd-kit-v2.0)
+  - [Highlighted features](#highlighted-features)
+  - [License and credits](#license-and-credits)
+  - [Deep Potential in a nutshell](#deep-potential-in-a-nutshell)
 - [Download and install](#download-and-install)
 - [Use DeePMD-kit](#use-deepmd-kit)
 - [Code structure](#code-structure)
 - [Troubleshooting](#troubleshooting)
 
 # About DeePMD-kit
+
 DeePMD-kit is a package written in Python/C++, designed to minimize the effort required to build deep learning-based model of interatomic potential energy and force field and to perform molecular dynamics (MD). This brings new hopes to addressing the accuracy-versus-efficiency dilemma in molecular simulations. Applications of DeePMD-kit span from finite molecules to extended systems and from metallic systems to chemically bonded systems.
 
 For more information, check the [documentation](https://deepmd.readthedocs.io/).
 
 # Highlights in DeePMD-kit v2.0
-* [Model compression](doc/freeze/compress.md). Accelerate the efficiency of model inference 4-15 times.
-* [New descriptors](doc/model/overall.md). Including [`se_e2_r`](doc/model/train-se-e2-r.md) and [`se_e3`](doc/model/train-se-e3.md).
-* [Hybridization of descriptors](doc/model/train-hybrid.md). Hybrid descriptor constructed from the concatenation of several descriptors.
-* [Atom type embedding](doc/model/train-se-e2-a-tebd.md). Enable atom-type embedding to decline training complexity and refine performance.
-* Training and inference of the dipole (vector) and polarizability (matrix).
-* Split of training and validation dataset.
-* Optimized training on GPUs.
+
+- [Model compression](doc/freeze/compress.md). Accelerate the efficiency of model inference 4-15 times.
+
+- [New descriptors](doc/model/overall.md). Including [`se_e2_r`](doc/model/train-se-e2-r.md) and [`se_e3`](doc/model/train-se-e3.md).
+- [Hybridization of descriptors](doc/model/train-hybrid.md). Hybrid descriptor constructed from the concatenation of several descriptors.
+- [Atom type embedding](doc/model/train-se-e2-a-tebd.md). Enable atom-type embedding to decline training complexity and refine performance.
+- Training and inference of the dipole (vector) and polarizability (matrix).
+- Split of training and validation dataset.
+- Optimized training on GPUs.
 
 ## Highlighted features
-* **interfaced with TensorFlow**, one of the most popular deep learning frameworks, making the training process highly automatic and efficient, in addition, Tensorboard can be used to visualize training procedures.
-* **interfaced with high-performance classical MD and quantum (path-integral) MD packages**, i.e., LAMMPS and i-PI, respectively.
-* **implements the Deep Potential series models**, which have been successfully applied to finite and extended systems including organic molecules, metals, semiconductors, insulators, etc.
-* **implements MPI and GPU supports**, making it highly efficient for high-performance parallel and distributed computing.
-* **highly modularized**, easy to adapt to different descriptors for deep learning-based potential energy models.
+
+- **interfaced with TensorFlow**, one of the most popular deep learning frameworks, making the training process highly automatic and efficient, in addition, Tensorboard can be used to visualize training procedures.
+
+- **interfaced with high-performance classical MD and quantum (path-integral) MD packages**, i.e., LAMMPS and i-PI, respectively.
+- **implements the Deep Potential series models**, which have been successfully applied to finite and extended systems including organic molecules, metals, semiconductors, insulators, etc.
+- **implements MPI and GPU supports**, making it highly efficient for high-performance parallel and distributed computing.
+- **highly modularized**, easy to adapt to different descriptors for deep learning-based potential energy models.
 
 ## License and credits
+
 The project DeePMD-kit is licensed under [GNU LGPLv3.0](./LICENSE).
 If you use this code in any future publications, please cite the following publications for general purpose:
+
 - Han Wang, Linfeng Zhang, Jiequn Han, and Weinan E. "DeePMD-kit: A deep learning package for many-body potential energy representation and molecular dynamics." Computer Physics Communications 228 (2018): 178-184.
 - Jinzhe Zeng, Duo Zhang, Denghui Lu, Pinghui Mo, Zeyu Li, Yixiao Chen, Marián Rynik, Li'ang Huang, Ziyao Li, Shaochen Shi, Yingze Wang, Haotian Ye, Ping Tuo, Jiabin Yang, Ye Ding, Yifan Li, Davide Tisi, Qiyu Zeng, Han Bao, Yu Xia, Jiameng Huang, Koki Muraoka, Yibo Wang, Junhan Chang, Fengbo Yuan, Sigbjørn Løland Bore, Chun Cai, Yinnian Lin, Bo Wang, Jiayan Xu, Jia-Xin Zhu, Chenxing Luo, Yuzhi Zhang, Rhys E. A. Goodall, Wenshuo Liang, Anurag Kumar Singh, Sikai Yao, Jingchao Zhang, Renata Wentzcovitch, Jiequn Han, Jie Liu, Weile Jia, Darrin M. York, Weinan E, Roberto Car, Linfeng Zhang, Han Wang. "DeePMD-kit v2: A software package for Deep Potential models." [arXiv:2304.09409](https://doi.org/10.48550/arXiv.2304.09409).
 
 In addition, please follow [the bib file](CITATIONS.bib) to cite the methods you used.
 
 ## Deep Potential in a nutshell
+
 The goal of Deep Potential is to employ deep learning techniques and realize an inter-atomic potential energy model that is general, accurate, computationally efficient and scalable. The key component is to respect the extensive and symmetry-invariant properties of a potential energy model by assigning a local reference frame and a local environment to each atom. Each environment contains a finite number of atoms, whose local coordinates are arranged in a symmetry-preserving way. These local coordinates are then transformed, through a sub-network, to so-called *atomic energy*. Summing up all the atomic energies gives the potential energy of the system.
 
 The initial proof of concept is in the [Deep Potential][1] paper, which employed an approach that was devised to train the neural network model with the potential energy only. With typical *ab initio* molecular dynamics (AIMD) datasets this is insufficient to reproduce the trajectories. The Deep Potential Molecular Dynamics ([DeePMD][2]) model overcomes this limitation. In addition, the learning process in DeePMD improves significantly over the Deep Potential method thanks to the introduction of a flexible family of loss functions. The NN potential constructed in this way reproduces accurately the AIMD trajectories, both classical and quantum (path integral), in extended and finite systems, at a cost that scales linearly with system size and is always several orders of magnitude lower than that of equivalent AIMD simulations.
@@ -72,7 +270,6 @@ DeePMD-kit offers multiple installation methods. It is recommended to use easy m
 
 One may manually install DeePMD-kit by following the instructions on [installing the Python interface](doc/install/install-from-source.md#install-the-python-interface) and [installing the C++ interface](doc/install/install-from-source.md#install-the-c-interface). The C++ interface is necessary when using DeePMD-kit with LAMMPS, i-PI or GROMACS.
 
-
 # Use DeePMD-kit
 
 A quick start on using DeePMD-kit can be found [here](doc/getting-started/quick_start.ipynb).
@@ -82,74 +279,73 @@ A full [document](doc/train/train-input-auto.rst) on options in the training inp
 # Advanced
 
 - [Installation](doc/install/index.md)
-    - [Easy install](doc/install/easy-install.md)
-    - [Install from source code](doc/install/install-from-source.md)
-    - [Install from pre-compiled C library](doc/install/install-from-c-library.md)
-    - [Install LAMMPS](doc/install/install-lammps.md)
-    - [Install i-PI](doc/install/install-ipi.md)
-    - [Install GROMACS](doc/install/install-gromacs.md)
-    - [Building conda packages](doc/install/build-conda.md)
-    - [Install Node.js interface](doc/install/install-nodejs.md)
+  - [Easy install](doc/install/easy-install.md)
+  - [Install from source code](doc/install/install-from-source.md)
+  - [Install from pre-compiled C library](doc/install/install-from-c-library.md)
+  - [Install LAMMPS](doc/install/install-lammps.md)
+  - [Install i-PI](doc/install/install-ipi.md)
+  - [Install GROMACS](doc/install/install-gromacs.md)
+  - [Building conda packages](doc/install/build-conda.md)
+  - [Install Node.js interface](doc/install/install-nodejs.md)
 - [Data](doc/data/index.md)
-    - [System](doc/data/system.md)
-    - [Formats of a system](doc/data/data-conv.md)
-    - [Prepare data with dpdata](doc/data/dpdata.md)
+  - [System](doc/data/system.md)
+  - [Formats of a system](doc/data/data-conv.md)
+  - [Prepare data with dpdata](doc/data/dpdata.md)
 - [Model](doc/model/index.md)
-    - [Overall](doc/model/overall.md)
-    - [Descriptor `"se_e2_a"`](doc/model/train-se-e2-a.md)
-    - [Descriptor `"se_e2_r"`](doc/model/train-se-e2-r.md)
-    - [Descriptor `"se_e3"`](doc/model/train-se-e3.md)
-    - [Descriptor `"se_atten"`](doc/model/train-se-atten.md)
-    - [Descriptor `"hybrid"`](doc/model/train-hybrid.md)
-    - [Descriptor `sel`](doc/model/sel.md)
-    - [Fit energy](doc/model/train-energy.md)
-    - [Fit spin energy](doc/model/train-energy-spin.md)
-    - [Fit `tensor` like `Dipole` and `Polarizability`](doc/model/train-fitting-tensor.md)
+  - [Overall](doc/model/overall.md)
+  - [Descriptor `"se_e2_a"`](doc/model/train-se-e2-a.md)
+  - [Descriptor `"se_e2_r"`](doc/model/train-se-e2-r.md)
+  - [Descriptor `"se_e3"`](doc/model/train-se-e3.md)
+  - [Descriptor `"se_atten"`](doc/model/train-se-atten.md)
+  - [Descriptor `"hybrid"`](doc/model/train-hybrid.md)
+  - [Descriptor `sel`](doc/model/sel.md)
+  - [Fit energy](doc/model/train-energy.md)
+  - [Fit spin energy](doc/model/train-energy-spin.md)
+  - [Fit `tensor` like `Dipole` and `Polarizability`](doc/model/train-fitting-tensor.md)
 - [Fit electronic density of states (DOS)](doc/model/train-fitting-dos.md)
-    - [Train a Deep Potential model using `type embedding` approach](doc/model/train-se-e2-a-tebd.md)
-    - [Deep potential long-range](doc/model/dplr.md)
-    - [Deep Potential - Range Correction (DPRc)](doc/model/dprc.md)
+  - [Train a Deep Potential model using `type embedding` approach](doc/model/train-se-e2-a-tebd.md)
+  - [Deep potential long-range](doc/model/dplr.md)
+  - [Deep Potential - Range Correction (DPRc)](doc/model/dprc.md)
 - [Training](doc/train/index.md)
-    - [Training a model](doc/train/training.md)
-    - [Advanced options](doc/train/training-advanced.md)
-    - [Parallel training](doc/train/parallel-training.md)
-    - [Multi-task training](doc/train/multi-task-training.md)
-    - [TensorBoard Usage](doc/train/tensorboard.md)
-    - [Known limitations of using GPUs](doc/train/gpu-limitations.md)
-    - [Training Parameters](doc/train-input-auto.rst)
+  - [Training a model](doc/train/training.md)
+  - [Advanced options](doc/train/training-advanced.md)
+  - [Parallel training](doc/train/parallel-training.md)
+  - [Multi-task training](doc/train/multi-task-training.md)
+  - [TensorBoard Usage](doc/train/tensorboard.md)
+  - [Known limitations of using GPUs](doc/train/gpu-limitations.md)
+  - [Training Parameters](doc/train-input-auto.rst)
 - [Freeze and Compress](doc/freeze/index.rst)
-    - [Freeze a model](doc/freeze/freeze.md)
-    - [Compress a model](doc/freeze/compress.md)
+  - [Freeze a model](doc/freeze/freeze.md)
+  - [Compress a model](doc/freeze/compress.md)
 - [Test](doc/test/index.rst)
-    - [Test a model](doc/test/test.md)
-    - [Calculate Model Deviation](doc/test/model-deviation.md)
+  - [Test a model](doc/test/test.md)
+  - [Calculate Model Deviation](doc/test/model-deviation.md)
 - [Inference](doc/inference/index.rst)
-    - [Python interface](doc/inference/python.md)
-    - [C++ interface](doc/inference/cxx.md)
-    - [Node.js interface](doc/inference/nodejs.md)
+  - [Python interface](doc/inference/python.md)
+  - [C++ interface](doc/inference/cxx.md)
+  - [Node.js interface](doc/inference/nodejs.md)
 - [Integrate with third-party packages](doc/third-party/index.rst)
-    - [Use deep potential with ASE](doc/third-party/ase.md)
-    - [Run MD with LAMMPS](doc/third-party/lammps.md)
-    - [LAMMPS commands](doc/third-party/lammps-command.md)
-    - [Run path-integral MD with i-PI](doc/third-party/ipi.md)
-    - [Run MD with GROMACS](doc/third-party/gromacs.md)
-    - [Interfaces out of DeePMD-kit](doc/third-party/out-of-deepmd-kit.md)
+  - [Use deep potential with ASE](doc/third-party/ase.md)
+  - [Run MD with LAMMPS](doc/third-party/lammps.md)
+  - [LAMMPS commands](doc/third-party/lammps-command.md)
+  - [Run path-integral MD with i-PI](doc/third-party/ipi.md)
+  - [Run MD with GROMACS](doc/third-party/gromacs.md)
+  - [Interfaces out of DeePMD-kit](doc/third-party/out-of-deepmd-kit.md)
 - [Use NVNMD](doc/nvnmd/index.md)
 
 # Code structure
 
 The code is organized as follows:
 
-* `data/raw`: tools manipulating the raw data files.
-* `examples`: examples.
-* `deepmd`: DeePMD-kit python modules.
-* `source/api_cc`: source code of DeePMD-kit C++ API.
-* `source/ipi`: source code of i-PI client.
-* `source/lib`: source code of DeePMD-kit library.
-* `source/lmp`: source code of Lammps module.
-* `source/gmx`: source code of Gromacs plugin.
-* `source/op`: TensorFlow op implementation. working with the library.
-
+- `data/raw`: tools manipulating the raw data files.
+- `examples`: examples.
+- `deepmd`: DeePMD-kit python modules.
+- `source/api_cc`: source code of DeePMD-kit C++ API.
+- `source/ipi`: source code of i-PI client.
+- `source/lib`: source code of DeePMD-kit library.
+- `source/lmp`: source code of Lammps module.
+- `source/gmx`: source code of Gromacs plugin.
+- `source/op`: TensorFlow op implementation. working with the library.
 
 # Troubleshooting
 
@@ -167,7 +363,6 @@ The code is organized as follows:
 
 See [DeePMD-kit Contributing Guide](CONTRIBUTING.md) to become a contributor! 🤓
 
-
 [1]: https://arxiv.org/abs/1707.01478
 [2]: https://journals.aps.org/prl/abstract/10.1103/PhysRevLett.120.143001
 [3]: https://arxiv.org/abs/1805.09003
diff --git a/deepmd/common.py b/deepmd/common.py
index d0afbf0784..99e121c990 100644
--- a/deepmd/common.py
+++ b/deepmd/common.py
@@ -28,8 +28,10 @@
 
 from deepmd.env import (
     GLOBAL_NP_FLOAT_PRECISION,
+    GLOBAL_PD_FLOAT_PRECISION,
     GLOBAL_TF_FLOAT_PRECISION,
     op_module,
+    paddle,
     tf,
 )
 from deepmd.utils.path import (
@@ -50,11 +52,11 @@
 
 # define constants
 PRECISION_DICT = {
-    "default": GLOBAL_TF_FLOAT_PRECISION,
-    "float16": tf.float16,
-    "float32": tf.float32,
-    "float64": tf.float64,
-    "bfloat16": tf.bfloat16,
+    "default": GLOBAL_PD_FLOAT_PRECISION,
+    "float16": paddle.float16,
+    "float32": paddle.float32,
+    "float64": paddle.float64,
+    "bfloat16": paddle.bfloat16,
 }
 
 
@@ -119,11 +121,11 @@ def gelu_wrapper(x):
 data_requirement = {}
 
 ACTIVATION_FN_DICT = {
-    "relu": tf.nn.relu,
-    "relu6": tf.nn.relu6,
-    "softplus": tf.nn.softplus,
-    "sigmoid": tf.sigmoid,
-    "tanh": tf.nn.tanh,
+    "relu": paddle.nn.functional.relu,
+    "relu6": paddle.nn.functional.relu6,
+    "softplus": paddle.nn.functional.softplus,
+    "sigmoid": paddle.nn.functional.sigmoid,
+    "tanh": paddle.nn.functional.tanh,
     "gelu": gelu,
     "gelu_tf": gelu_tf,
     "None": None,
diff --git a/deepmd/descriptor/se_a.py b/deepmd/descriptor/se_a.py
index 641210f0d1..58f1139ffb 100644
--- a/deepmd/descriptor/se_a.py
+++ b/deepmd/descriptor/se_a.py
@@ -7,41 +7,26 @@
 import numpy as np
 
 from deepmd.common import (
-    cast_precision,
     get_activation_func,
     get_precision,
 )
 from deepmd.env import (
     GLOBAL_NP_FLOAT_PRECISION,
-    GLOBAL_TF_FLOAT_PRECISION,
-    default_tf_session_config,
+    GLOBAL_PD_FLOAT_PRECISION,
     op_module,
+    paddle,
     tf,
 )
-from deepmd.nvnmd.descriptor.se_a import (
-    build_davg_dstd,
-    build_op_descriptor,
-    check_switch_range,
-    descrpt2r4,
-    filter_GR2D,
-    filter_lower_R42GR,
-)
-from deepmd.nvnmd.utils.config import (
-    nvnmd_cfg,
-)
 from deepmd.utils.errors import (
     GraphWithoutTensorError,
 )
 from deepmd.utils.graph import (
     get_tensor_by_name_from_graph,
 )
+from deepmd.utils.network import EmbeddingNet  # embedding_net,
 from deepmd.utils.network import (
-    embedding_net,
     embedding_net_rand_seed_shift,
 )
-from deepmd.utils.sess import (
-    run_sess,
-)
 from deepmd.utils.spin import (
     Spin,
 )
@@ -52,17 +37,8 @@
     embed_atom_type,
 )
 
-from .descriptor import (
-    Descriptor,
-)
-from .se import (
-    DescrptSe,
-)
-
 
-@Descriptor.register("se_e2_a")
-@Descriptor.register("se_a")
-class DescrptSeA(DescrptSe):
+class DescrptSeA(paddle.nn.Layer):
     r"""DeepPot-SE constructed from all information (both angular and radial) of
     atomic configurations. The embedding takes the distance between atoms as input.
 
@@ -166,12 +142,15 @@ def __init__(
         spin: Optional[Spin] = None,
     ) -> None:
         """Constructor."""
+        super().__init__()
         if rcut < rcut_smth:
             raise RuntimeError(
                 f"rcut_smth ({rcut_smth:f}) should be no more than rcut ({rcut:f})!"
             )
         self.sel_a = sel
         self.rcut_r = rcut
+        # NOTE: register 'rcut' in buffer to be accessed in inference
+        self.register_buffer("buffer_rcut", paddle.to_tensor(rcut, dtype="float64"))
         self.rcut_r_smth = rcut_smth
         self.filter_neuron = neuron
         self.n_axis_neuron = axis_neuron
@@ -189,8 +168,9 @@ def __init__(
             self.exclude_types.add((tt[0], tt[1]))
             self.exclude_types.add((tt[1], tt[0]))
         self.set_davg_zero = set_davg_zero
-        self.type_one_side = type_one_side
-        self.spin = spin
+        # self.type_one_side = type_one_side
+        self.type_one_side = False
+        self.spin = spin  # None
 
         # extend sel_a for spin system
         if self.spin is not None:
@@ -199,10 +179,18 @@ def __init__(
             self.sel_a.extend(self.sel_a_spin)
         else:
             self.ntypes_spin = 0
+        # NOTE: register 'ntypes_spin' in buffer to be accessed in inference
+        self.register_buffer(
+            "buffer_ntypes_spin", paddle.to_tensor(self.ntypes_spin, dtype="int32")
+        )
 
         # descrpt config
         self.sel_r = [0 for ii in range(len(self.sel_a))]
         self.ntypes = len(self.sel_a)
+        # NOTE: register 'ntypes' in buffer to be accessed in inference
+        self.register_buffer(
+            "buffer_ntypes", paddle.to_tensor(self.ntypes, dtype="int32")
+        )
         assert self.ntypes == len(self.sel_r)
         self.rcut_a = -1
         # numb of neighbors and numb of descrptors
@@ -215,49 +203,33 @@ def __init__(
         self.useBN = False
         self.dstd = None
         self.davg = None
+        self.avg_zero = paddle.zeros([self.ntypes, self.ndescrpt], dtype="float32")
+        self.std_ones = paddle.ones([self.ntypes, self.ndescrpt], dtype="float32")
+
+        nets = []
+        for type_input in range(self.ntypes):
+            layer = []
+            for type_i in range(self.ntypes):
+                layer.append(
+                    EmbeddingNet(
+                        self.filter_neuron,
+                        self.filter_precision,
+                        self.filter_activation_fn,
+                        self.filter_resnet_dt,
+                        self.seed,
+                        self.trainable,
+                        name="filter_type_" + str(type_input) + str(type_i),
+                    )
+                )
+            nets.append(paddle.nn.LayerList(layer))
+
+        self.embedding_nets = paddle.nn.LayerList(nets)
+
         self.compress = False
         self.embedding_net_variables = None
         self.mixed_prec = None
-        self.place_holders = {}
         self.nei_type = np.repeat(np.arange(self.ntypes), self.sel_a)  # like a mask
 
-        avg_zero = np.zeros([self.ntypes, self.ndescrpt]).astype(
-            GLOBAL_NP_FLOAT_PRECISION
-        )
-        std_ones = np.ones([self.ntypes, self.ndescrpt]).astype(
-            GLOBAL_NP_FLOAT_PRECISION
-        )
-        sub_graph = tf.Graph()
-        with sub_graph.as_default():
-            name_pfx = "d_sea_"
-            for ii in ["coord", "box"]:
-                self.place_holders[ii] = tf.placeholder(
-                    GLOBAL_NP_FLOAT_PRECISION, [None, None], name=name_pfx + "t_" + ii
-                )
-            self.place_holders["type"] = tf.placeholder(
-                tf.int32, [None, None], name=name_pfx + "t_type"
-            )
-            self.place_holders["natoms_vec"] = tf.placeholder(
-                tf.int32, [self.ntypes + 2], name=name_pfx + "t_natoms"
-            )
-            self.place_holders["default_mesh"] = tf.placeholder(
-                tf.int32, [None], name=name_pfx + "t_mesh"
-            )
-            self.stat_descrpt, descrpt_deriv, rij, nlist = op_module.prod_env_mat_a(
-                self.place_holders["coord"],
-                self.place_holders["type"],
-                self.place_holders["natoms_vec"],
-                self.place_holders["box"],
-                self.place_holders["default_mesh"],
-                tf.constant(avg_zero),
-                tf.constant(std_ones),
-                rcut_a=self.rcut_a,
-                rcut_r=self.rcut_r,
-                rcut_r_smth=self.rcut_r_smth,
-                sel_a=self.sel_a,
-                sel_r=self.sel_r,
-            )
-        self.sub_sess = tf.Session(graph=sub_graph, config=default_tf_session_config)
         self.original_sel = None
         self.multi_task = multi_task
         if multi_task:
@@ -269,6 +241,20 @@ def __init__(
                 "suma2": [],
             }
 
+        self.t_rcut = paddle.to_tensor(
+            np.max([self.rcut_r, self.rcut_a]), dtype="float32"
+        )
+        self.t_ntypes = paddle.to_tensor(self.ntypes, dtype="int32")
+        self.t_ndescrpt = paddle.to_tensor(self.ndescrpt, dtype="int32")
+        self.t_sel = paddle.to_tensor(self.sel_a, dtype="int32")
+
+        t_avg = paddle.to_tensor(
+            np.zeros([self.ntypes, self.ndescrpt]), dtype="float64"
+        )
+        t_std = paddle.to_tensor(np.ones([self.ntypes, self.ndescrpt]), dtype="float64")
+        self.register_buffer("t_avg", t_avg)
+        self.register_buffer("t_std", t_std)
+
     def get_rcut(self) -> float:
         """Returns the cut-off radius."""
         return self.rcut_r
@@ -285,7 +271,7 @@ def get_dim_rot_mat_1(self) -> int:
         """Returns the first dimension of the rotation matrix. The rotation is of shape dim_1 x 3."""
         return self.filter_neuron[-1]
 
-    def get_nlist(self) -> Tuple[tf.Tensor, tf.Tensor, List[int], List[int]]:
+    def get_nlist(self) -> Tuple[paddle.Tensor, paddle.Tensor, List[int], List[int]]:
         """Returns neighbor information.
 
         Returns
@@ -360,6 +346,9 @@ def compute_input_stats(
                 self.stat_dict["sumr2"] += sumr2
                 self.stat_dict["suma2"] += suma2
 
+        self.t_avg = paddle.to_tensor(self.davg, dtype="float64")
+        self.t_std = paddle.to_tensor(self.dstd, dtype="float64")
+
     def merge_input_stats(self, stat_dict):
         """Merge the statisitcs computed from compute_input_stats to obtain the self.davg and self.dstd.
 
@@ -498,17 +487,17 @@ def enable_mixed_precision(self, mixed_prec: Optional[dict] = None) -> None:
         self.mixed_prec = mixed_prec
         self.filter_precision = get_precision(mixed_prec["output_prec"])
 
-    def build(
+    def forward(
         self,
-        coord_: tf.Tensor,
-        atype_: tf.Tensor,
-        natoms: tf.Tensor,
-        box_: tf.Tensor,
-        mesh: tf.Tensor,
+        coord_: paddle.Tensor,
+        atype_: paddle.Tensor,
+        natoms: paddle.Tensor,
+        box_: paddle.Tensor,
+        mesh: paddle.Tensor,
         input_dict: dict,
         reuse: Optional[bool] = None,
         suffix: str = "",
-    ) -> tf.Tensor:
+    ) -> paddle.Tensor:
         """Build the computational graph for the descriptor.
 
         Parameters
@@ -542,59 +531,27 @@ def build(
         """
         davg = self.davg
         dstd = self.dstd
-        if nvnmd_cfg.enable:
-            if nvnmd_cfg.restore_descriptor:
-                davg, dstd = build_davg_dstd()
-            check_switch_range(davg, dstd)
-        with tf.variable_scope("descrpt_attr" + suffix, reuse=reuse):
-            if davg is None:
-                davg = np.zeros([self.ntypes, self.ndescrpt])
-            if dstd is None:
-                dstd = np.ones([self.ntypes, self.ndescrpt])
-            t_rcut = tf.constant(
-                np.max([self.rcut_r, self.rcut_a]),
-                name="rcut",
-                dtype=GLOBAL_TF_FLOAT_PRECISION,
-            )
-            t_ntypes = tf.constant(self.ntypes, name="ntypes", dtype=tf.int32)
-            t_ndescrpt = tf.constant(self.ndescrpt, name="ndescrpt", dtype=tf.int32)
-            t_sel = tf.constant(self.sel_a, name="sel", dtype=tf.int32)
-            t_original_sel = tf.constant(
-                self.original_sel if self.original_sel is not None else self.sel_a,
-                name="original_sel",
-                dtype=tf.int32,
-            )
-            self.t_avg = tf.get_variable(
-                "t_avg",
-                davg.shape,
-                dtype=GLOBAL_TF_FLOAT_PRECISION,
-                trainable=False,
-                initializer=tf.constant_initializer(davg),
-            )
-            self.t_std = tf.get_variable(
-                "t_std",
-                dstd.shape,
-                dtype=GLOBAL_TF_FLOAT_PRECISION,
-                trainable=False,
-                initializer=tf.constant_initializer(dstd),
-            )
-
-        with tf.control_dependencies([t_sel, t_original_sel]):
-            coord = tf.reshape(coord_, [-1, natoms[1] * 3])
-        box = tf.reshape(box_, [-1, 9])
-        atype = tf.reshape(atype_, [-1, natoms[1]])
-
-        op_descriptor = (
-            build_op_descriptor() if nvnmd_cfg.enable else op_module.prod_env_mat_a
-        )
-        self.descrpt, self.descrpt_deriv, self.rij, self.nlist = op_descriptor(
+        if davg is None:
+            davg = np.zeros([self.ntypes, self.ndescrpt])
+        if dstd is None:
+            dstd = np.ones([self.ntypes, self.ndescrpt])
+
+        coord = paddle.reshape(coord_, [-1, natoms[1] * 3])
+        box = paddle.reshape(box_, [-1, 9])
+        atype = paddle.reshape(atype_, [-1, natoms[1]])
+        (
+            self.descrpt,
+            self.descrpt_deriv,
+            self.rij,
+            self.nlist,
+        ) = op_module.prod_env_mat_a(
             coord,
             atype,
-            natoms,
             box,
             mesh,
             self.t_avg,
             self.t_std,
+            natoms,
             rcut_a=self.rcut_a,
             rcut_r=self.rcut_r,
             rcut_r_smth=self.rcut_r_smth,
@@ -602,13 +559,8 @@ def build(
             sel_r=self.sel_r,
         )
         # only used when tensorboard was set as true
-        tf.summary.histogram("descrpt", self.descrpt)
-        tf.summary.histogram("rij", self.rij)
-        tf.summary.histogram("nlist", self.nlist)
-
-        self.descrpt_reshape = tf.reshape(self.descrpt, [-1, self.ndescrpt])
-        self._identity_tensors(suffix=suffix)
-
+        self.descrpt_reshape = paddle.reshape(self.descrpt, [-1, self.ndescrpt])
+        self.descrpt_reshape.stop_gradient = False
         self.dout, self.qmat = self._pass_filter(
             self.descrpt_reshape,
             atype,
@@ -619,17 +571,15 @@ def build(
             trainable=self.trainable,
         )
 
-        # only used when tensorboard was set as true
-        tf.summary.histogram("embedding_net_output", self.dout)
         return self.dout
 
-    def get_rot_mat(self) -> tf.Tensor:
+    def get_rot_mat(self) -> paddle.Tensor:
         """Get rotational matrix."""
         return self.qmat
 
     def prod_force_virial(
-        self, atom_ener: tf.Tensor, natoms: tf.Tensor
-    ) -> Tuple[tf.Tensor, tf.Tensor, tf.Tensor]:
+        self, atom_ener: paddle.Tensor, natoms: paddle.Tensor
+    ) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor]:
         """Compute force and virial.
 
         Parameters
@@ -651,11 +601,11 @@ def prod_force_virial(
         atom_virial
             The atomic virial
         """
-        [net_deriv] = tf.gradients(atom_ener, self.descrpt_reshape)
-        tf.summary.histogram("net_derivative", net_deriv)
-        net_deriv_reshape = tf.reshape(
+        net_deriv = paddle.grad(atom_ener, self.descrpt_reshape, create_graph=True)[0]
+        # tf.summary.histogram("net_derivative", net_deriv)
+        net_deriv_reshape = paddle.reshape(
             net_deriv,
-            [np.cast["int64"](-1), natoms[0] * np.cast["int64"](self.ndescrpt)],
+            [-1, natoms[0] * self.ndescrpt],
         )
         force = op_module.prod_force_se_a(
             net_deriv_reshape,
@@ -674,29 +624,56 @@ def prod_force_virial(
             n_a_sel=self.nnei_a,
             n_r_sel=self.nnei_r,
         )
-        tf.summary.histogram("force", force)
-        tf.summary.histogram("virial", virial)
-        tf.summary.histogram("atom_virial", atom_virial)
 
         return force, virial, atom_virial
 
     def _pass_filter(
         self, inputs, atype, natoms, input_dict, reuse=None, suffix="", trainable=True
     ):
+        """pass_filter.
+
+        Parameters
+        ----------
+        inputs : paddle.Tensor
+            Inputs tensor.
+        atype : paddle.Tensor
+            Atom type Tensor.
+        natoms : paddle.Tensor
+            Number of atoms vector
+        input_dict : Dict[str, paddle.Tensor]
+            Input data dict.
+        reuse : bool, optional
+            Whether to reuse variables. Defaults to None.
+        suffix : str, optional
+            Variable suffix. Defaults to "".
+        trainable : bool, optional
+            Whether to make the subnetwork trainable. Defaults to True.
+
+        Returns
+        -------
+        Tuple[Tensor, Tensor]: output: [1, all_atom, M1*M2], output_qmat: [1, all_atom, M1*3]
+        """
         if input_dict is not None:
             type_embedding = input_dict.get("type_embedding", None)
         else:
             type_embedding = None
         start_index = 0
-        inputs = tf.reshape(inputs, [-1, natoms[0], self.ndescrpt])
+        inputs = paddle.reshape(inputs, [-1, int(natoms[0].item()), int(self.ndescrpt)])
         output = []
         output_qmat = []
         if not self.type_one_side and type_embedding is None:
             for type_i in range(self.ntypes):
-                inputs_i = tf.slice(
-                    inputs, [0, start_index, 0], [-1, natoms[2 + type_i], -1]
+                inputs_i = paddle.slice(
+                    inputs,
+                    [0, 1, 2],
+                    [0, start_index, 0],
+                    [
+                        inputs.shape[0],
+                        start_index + natoms[2 + type_i],
+                        inputs.shape[2],
+                    ],
                 )
-                inputs_i = tf.reshape(inputs_i, [-1, self.ndescrpt])
+                inputs_i = paddle.reshape(inputs_i, [-1, self.ndescrpt])
                 filter_name = "filter_type_" + str(type_i) + suffix
                 layer, qmat = self._filter(
                     inputs_i,
@@ -707,13 +684,13 @@ def _pass_filter(
                     trainable=trainable,
                     activation_fn=self.filter_activation_fn,
                 )
-                layer = tf.reshape(
-                    layer, [tf.shape(inputs)[0], natoms[2 + type_i], self.get_dim_out()]
+                layer = paddle.reshape(
+                    layer, [inputs.shape[0], natoms[2 + type_i], self.get_dim_out()]
                 )
-                qmat = tf.reshape(
+                qmat = paddle.reshape(
                     qmat,
                     [
-                        tf.shape(inputs)[0],
+                        inputs.shape[0],
                         natoms[2 + type_i],
                         self.get_dim_rot_mat_1() * 3,
                     ],
@@ -722,61 +699,78 @@ def _pass_filter(
                 output_qmat.append(qmat)
                 start_index += natoms[2 + type_i]
         else:
-            inputs_i = inputs
-            inputs_i = tf.reshape(inputs_i, [-1, self.ndescrpt])
-            type_i = -1
-            if nvnmd_cfg.enable and nvnmd_cfg.quantize_descriptor:
-                inputs_i = descrpt2r4(inputs_i, natoms)
-            if len(self.exclude_types):
-                atype_nloc = tf.reshape(
-                    tf.slice(atype, [0, 0], [-1, natoms[0]]), [-1]
-                )  # when nloc != nall, pass nloc to mask
-                mask = self.build_type_exclude_mask(
-                    self.exclude_types,
-                    self.ntypes,
-                    self.sel_a,
-                    self.ndescrpt,
-                    atype_nloc,
-                    tf.shape(inputs_i)[0],
-                )
-                inputs_i *= mask
-
-            layer, qmat = self._filter(
-                inputs_i,
-                type_i,
-                name="filter_type_all" + suffix,
-                natoms=natoms,
-                reuse=reuse,
-                trainable=trainable,
-                activation_fn=self.filter_activation_fn,
-                type_embedding=type_embedding,
-            )
-            layer = tf.reshape(
-                layer, [tf.shape(inputs)[0], natoms[0], self.get_dim_out()]
-            )
-            qmat = tf.reshape(
-                qmat, [tf.shape(inputs)[0], natoms[0], self.get_dim_rot_mat_1() * 3]
-            )
-            output.append(layer)
-            output_qmat.append(qmat)
-        output = tf.concat(output, axis=1)
-        output_qmat = tf.concat(output_qmat, axis=1)
+            raise NotImplementedError()
+            # This branch is never executed at present
+            # inputs_i = inputs
+            # inputs_i = paddle.reshape(inputs_i, [-1, self.ndescrpt])
+            # type_i = -1
+            # # if nvnmd_cfg.enable and nvnmd_cfg.quantize_descriptor:
+            # #     inputs_i = descrpt2r4(inputs_i, natoms)
+            # if len(self.exclude_types):
+            #     atype_nloc = paddle.reshape(
+            #         paddle.slice(atype, [0, 1], [0, 0], [atype.shape[0], natoms[0]]),
+            #         [-1],
+            #     )  # when nloc != nall, pass nloc to mask
+            #     mask = self.build_type_exclude_mask(
+            #         self.exclude_types,
+            #         self.ntypes,
+            #         self.sel_a,
+            #         self.ndescrpt,
+            #         atype_nloc,
+            #         paddle.shape(inputs_i)[0],
+            #     )
+            #     inputs_i *= mask
+
+            # layer, qmat = self._filter(
+            #     inputs_i,
+            #     type_i,
+            #     name="filter_type_all" + suffix,
+            #     natoms=natoms,
+            #     reuse=reuse,
+            #     trainable=trainable,
+            #     activation_fn=self.filter_activation_fn,
+            #     type_embedding=type_embedding,
+            # )
+            # layer = paddle.reshape(
+            #     layer, [inputs.shape[0], natoms[0], self.get_dim_out()]
+            # )
+            # qmat = paddle.reshape(
+            #     qmat, [inputs.shape[0], natoms[0], self.get_dim_rot_mat_1() * 3]
+            # )
+            # output.append(layer)
+            # output_qmat.append(qmat)
+        output = paddle.concat(output, axis=1)
+        output_qmat = paddle.concat(output_qmat, axis=1)
         return output, output_qmat
 
     def _compute_dstats_sys_smth(
         self, data_coord, data_box, data_atype, natoms_vec, mesh
     ):
-        dd_all = run_sess(
-            self.sub_sess,
-            self.stat_descrpt,
-            feed_dict={
-                self.place_holders["coord"]: data_coord,
-                self.place_holders["type"]: data_atype,
-                self.place_holders["natoms_vec"]: natoms_vec,
-                self.place_holders["box"]: data_box,
-                self.place_holders["default_mesh"]: mesh,
-            },
+        input_dict = {}
+        input_dict["coord"] = paddle.to_tensor(data_coord, dtype="float32")
+        input_dict["box"] = paddle.to_tensor(data_box, dtype="float32")
+        input_dict["type"] = paddle.to_tensor(data_atype, dtype="int32")
+        input_dict["natoms_vec"] = paddle.to_tensor(
+            natoms_vec, dtype="int32", place="cpu"
         )
+        input_dict["default_mesh"] = paddle.to_tensor(mesh, dtype="int32")
+
+        self.stat_descrpt, descrpt_deriv, rij, nlist = op_module.prod_env_mat_a(
+            input_dict["coord"],
+            input_dict["type"],
+            input_dict["box"],
+            input_dict["default_mesh"],
+            self.avg_zero,
+            self.std_ones,
+            input_dict["natoms_vec"],
+            rcut_a=self.rcut_a,
+            rcut_r=self.rcut_r,
+            rcut_r_smth=self.rcut_r_smth,
+            sel_a=self.sel_a,
+            sel_r=self.sel_r,
+        )
+
+        dd_all = self.stat_descrpt.numpy()
         natoms = natoms_vec
         dd_all = np.reshape(dd_all, [-1, self.ndescrpt * natoms[0]])
         start_index = 0
@@ -840,89 +834,77 @@ def _concat_type_embedding(
         embedding:
             environment of each atom represented by embedding.
         """
-        te_out_dim = type_embedding.get_shape().as_list()[-1]
-        self.t_nei_type = tf.constant(self.nei_type, dtype=tf.int32)
-        nei_embed = tf.nn.embedding_lookup(
-            type_embedding, tf.cast(self.t_nei_type, dtype=tf.int32)
+        te_out_dim = type_embedding.shape[-1]
+        self.t_nei_type = paddle.to_tensor(self.nei_type, dtype=paddle.int32)
+        nei_embed = paddle.nn.functional.embedding(
+            paddle.cast(self.t_nei_type, dtype=paddle.int32),
+            type_embedding,
         )  # shape is [self.nnei, 1+te_out_dim]
-        nei_embed = tf.tile(
+        nei_embed = paddle.tile(
             nei_embed, (nframes * natoms[0], 1)
         )  # shape is [nframes*natoms[0]*self.nnei, te_out_dim]
-        nei_embed = tf.reshape(nei_embed, [-1, te_out_dim])
-        embedding_input = tf.concat(
+        nei_embed = paddle.reshape(nei_embed, [-1, te_out_dim])
+        embedding_input = paddle.concat(
             [xyz_scatter, nei_embed], 1
         )  # shape is [nframes*natoms[0]*self.nnei, 1+te_out_dim]
         if not self.type_one_side:
             atm_embed = embed_atom_type(
                 self.ntypes, natoms, type_embedding
             )  # shape is [natoms[0], te_out_dim]
-            atm_embed = tf.tile(
+            atm_embed = paddle.tile(
                 atm_embed, (nframes, self.nnei)
             )  # shape is [nframes*natoms[0], self.nnei*te_out_dim]
-            atm_embed = tf.reshape(
+            atm_embed = paddle.reshape(
                 atm_embed, [-1, te_out_dim]
             )  # shape is [nframes*natoms[0]*self.nnei, te_out_dim]
-            embedding_input = tf.concat(
+            embedding_input = paddle.concat(
                 [embedding_input, atm_embed], 1
             )  # shape is [nframes*natoms[0]*self.nnei, 1+te_out_dim+te_out_dim]
         return embedding_input
 
     def _filter_lower(
         self,
-        type_i,
-        type_input,
-        start_index,
-        incrs_index,
-        inputs,
-        nframes,
-        natoms,
+        type_i: int,  # inner-loop
+        type_input: int,  # outer-loop
+        start_index: int,
+        incrs_index: int,
+        inputs: paddle.Tensor,
+        nframes: int,
+        natoms: int,
         type_embedding=None,
         is_exclude=False,
-        activation_fn=None,
-        bavg=0.0,
-        stddev=1.0,
-        trainable=True,
-        suffix="",
     ):
         """Input env matrix, returns R.G."""
         outputs_size = [1] + self.filter_neuron
         # cut-out inputs
         # with natom x (nei_type_i x 4)
-        inputs_i = tf.slice(inputs, [0, start_index * 4], [-1, incrs_index * 4])
-        shape_i = inputs_i.get_shape().as_list()
-        natom = tf.shape(inputs_i)[0]
+        inputs_i = paddle.slice(
+            inputs,
+            [0, 1],
+            [0, start_index * 4],
+            [inputs.shape[0], start_index * 4 + incrs_index * 4],
+        )
+
+        shape_i = inputs_i.shape
+        natom = inputs_i.shape[0]
+
         # with (natom x nei_type_i) x 4
-        inputs_reshape = tf.reshape(inputs_i, [-1, 4])
+        inputs_reshape = paddle.reshape(inputs_i, [-1, 4])
         # with (natom x nei_type_i) x 1
-        xyz_scatter = tf.reshape(tf.slice(inputs_reshape, [0, 0], [-1, 1]), [-1, 1])
+        xyz_scatter = paddle.reshape(
+            paddle.slice(inputs_reshape, [0, 1], [0, 0], [inputs_reshape.shape[0], 1]),
+            [-1, 1],
+        )
+
         if type_embedding is not None:
             xyz_scatter = self._concat_type_embedding(
                 xyz_scatter, nframes, natoms, type_embedding
-            )
+            )  #
             if self.compress:
                 raise RuntimeError(
                     "compression of type embedded descriptor is not supported at the moment"
                 )
         # natom x 4 x outputs_size
-        if nvnmd_cfg.enable:
-            return filter_lower_R42GR(
-                type_i,
-                type_input,
-                inputs_i,
-                is_exclude,
-                activation_fn,
-                bavg,
-                stddev,
-                trainable,
-                suffix,
-                self.seed,
-                self.seed_shift,
-                self.uniform_seed,
-                self.filter_neuron,
-                self.filter_precision,
-                self.filter_resnet_dt,
-                self.embedding_net_variables,
-            )
         if self.compress and (not is_exclude):
             if self.type_one_side:
                 net = "filter_-1_net_" + str(type_i)
@@ -937,173 +919,195 @@ def _filter_lower(
                 self.table_config[3],
             ]
             return op_module.tabulate_fusion_se_a(
-                tf.cast(self.table.data[net], self.filter_precision),
+                paddle.cast(self.table.data[net], self.filter_precision),
                 info,
                 xyz_scatter,
-                tf.reshape(inputs_i, [natom, shape_i[1] // 4, 4]),
+                paddle.reshape(inputs_i, [natom, shape_i[1] // 4, 4]),
                 last_layer_size=outputs_size[-1],
             )
         else:
             if not is_exclude:
-                # with (natom x nei_type_i) x out_size
-                xyz_scatter = embedding_net(
-                    xyz_scatter,
-                    self.filter_neuron,
-                    self.filter_precision,
-                    activation_fn=activation_fn,
-                    resnet_dt=self.filter_resnet_dt,
-                    name_suffix=suffix,
-                    stddev=stddev,
-                    bavg=bavg,
-                    seed=self.seed,
-                    trainable=trainable,
-                    uniform_seed=self.uniform_seed,
-                    initial_variables=self.embedding_net_variables,
-                    mixed_prec=self.mixed_prec,
-                )
+                # this branch is executed
+                xyz_scatter_out = self.embedding_nets[type_input][type_i](xyz_scatter)
                 if (not self.uniform_seed) and (self.seed is not None):
                     self.seed += self.seed_shift
             else:
                 # we can safely return the final xyz_scatter filled with zero directly
-                return tf.cast(
-                    tf.fill((natom, 4, outputs_size[-1]), 0.0), self.filter_precision
+                return paddle.cast(
+                    paddle.full((natom, 4, outputs_size[-1]), 0.0),
+                    self.filter_precision,
+                )
             # natom x nei_type_i x out_size
-            xyz_scatter = tf.reshape(
-                xyz_scatter, (-1, shape_i[1] // 4, outputs_size[-1])
-            )
-            # When using tf.reshape(inputs_i, [-1, shape_i[1]//4, 4]) below
+            xyz_scatter_out = paddle.reshape(
+                xyz_scatter_out, (-1, shape_i[1] // 4, outputs_size[-1])
+            )  # (natom x nei_type_i) x 100 ==> natom x nei_type_i x 100
+            # When using paddle.reshape(inputs_i, [-1, shape_i[1]//4, 4]) below
             # [588 24] -> [588 6 4] correct
             # but if sel is zero
             # [588 0] -> [147 0 4] incorrect; the correct one is [588 0 4]
-            # So we need to explicitly assign the shape to tf.shape(inputs_i)[0] instead of -1
+            # So we need to explicitly assign the shape to paddle.shape(inputs_i)[0] instead of -1
             # natom x 4 x outputs_size
-            return tf.matmul(
-                tf.reshape(inputs_i, [natom, shape_i[1] // 4, 4]),
-                xyz_scatter,
-                transpose_a=True,
+
+            return paddle.matmul(
+                paddle.reshape(
+                    inputs_i, [natom, shape_i[1] // 4, 4]
+                ),  # [natom, nei_type_i, 4]
+                xyz_scatter_out,  # [natom, nei_type_i, 100]
+                transpose_x=True,
             )
 
-    @cast_precision
+    # @cast_precision
     def _filter(
         self,
-        inputs,
-        type_input,
+        inputs: paddle.Tensor,
+        type_input: int,
         natoms,
         type_embedding=None,
-        activation_fn=tf.nn.tanh,
+        activation_fn=paddle.nn.functional.tanh,
         stddev=1.0,
         bavg=0.0,
         name="linear",
         reuse=None,
         trainable=True,
     ):
-        nframes = tf.shape(tf.reshape(inputs, [-1, natoms[0], self.ndescrpt]))[0]
+        """_filter.
+
+        Parameters
+        ----------
+        inputs : paddle.Tensor
+            Inputs tensor.
+        type_input : int
+            Type of input.
+        natoms : paddle.Tensor
+            Number of atoms, a vector.
+        type_embedding : paddle.Tensor
+            Type embedding. Defaults to None.
+        activation_fn : Callable
+            Activation function. Defaults to paddle.nn.functional.tanh.
+        stddev : float, optional
+            Stddev for parameters initialization. Defaults to 1.0.
+        bavg : float, optional
+            Bavg for parameters initialization. Defaults to 0.0.
+        name : str, optional
+            Name for subnetwork. Defaults to "linear".
+        reuse : bool, optional
+            Whether to reuse variables. Defaults to None.
+        trainable : bool, optional
+            Whether to make the subnetwork trainable. Defaults to True.
+
+        Returns
+        -------
+        Tuple[Tensor, Tensor]: result: [64/128, M1*M2], qmat: [64/128, M1, 3]
+        """
+        # NOTE: the code below is commented out because its nframes computation is wrong
+        # nframes = paddle.shape(paddle.reshape(inputs, [-1, natoms[0], self.ndescrpt]))[0]
+
+        nframes = 1
         # natom x (nei x 4)
-        shape = inputs.get_shape().as_list()
+        shape = inputs.shape
         outputs_size = [1] + self.filter_neuron
-        outputs_size_2 = self.n_axis_neuron
+        outputs_size_2 = self.n_axis_neuron  # 16
         all_excluded = all(
             [
-                (type_input, type_i) in self.exclude_types
+                (type_input, type_i) in self.exclude_types  #  set()
                 for type_i in range(self.ntypes)
             ]
-        )
+        )  # False
         if all_excluded:
             # all types are excluded so result and qmat should be zeros
             # we can safaly return a zero matrix...
             # See also https://stackoverflow.com/a/34725458/9567349
             # result: natom x outputs_size x outputs_size_2
             # qmat: natom x outputs_size x 3
-            natom = tf.shape(inputs)[0]
-            result = tf.cast(
-                tf.fill((natom, outputs_size_2, outputs_size[-1]), 0.0),
-                GLOBAL_TF_FLOAT_PRECISION,
+            natom = paddle.shape(inputs)[0]
+            result = paddle.cast(
+                paddle.full((natom, outputs_size_2, outputs_size[-1]), 0.0),
+                GLOBAL_PD_FLOAT_PRECISION,
             )
-            qmat = tf.cast(
-                tf.fill((natom, outputs_size[-1], 3), 0.0), GLOBAL_TF_FLOAT_PRECISION
+            qmat = paddle.cast(
+                paddle.full((natom, outputs_size[-1], 3), 0.0),
+                GLOBAL_PD_FLOAT_PRECISION,
             )
             return result, qmat
 
-        with tf.variable_scope(name, reuse=reuse):
-            start_index = 0
-            type_i = 0
-            # natom x 4 x outputs_size
-            if type_embedding is None:
-                rets = []
-                for type_i in range(self.ntypes):
-                    ret = self._filter_lower(
-                        type_i,
-                        type_input,
-                        start_index,
-                        self.sel_a[type_i],
-                        inputs,
-                        nframes,
-                        natoms,
-                        type_embedding=type_embedding,
-                        is_exclude=(type_input, type_i) in self.exclude_types,
-                        activation_fn=activation_fn,
-                        stddev=stddev,
-                        bavg=bavg,
-                        trainable=trainable,
-                        suffix="_" + str(type_i),
-                    )
-                    if (type_input, type_i) not in self.exclude_types:
-                        # add zero is meaningless; skip
-                        rets.append(ret)
-                    start_index += self.sel_a[type_i]
-                # faster to use accumulate_n than multiple add
-                xyz_scatter_1 = tf.accumulate_n(rets)
-            else:
-                xyz_scatter_1 = self._filter_lower(
+        # with tf.variable_scope(name, reuse=reuse):
+        start_index = 0
+        type_i = 0
+        # natom x 4 x outputs_size
+        if type_embedding is None:
+            rets = []
+            # execute this branch
+            for type_i in range(self.ntypes):
+                ret = self._filter_lower(
                     type_i,
                     type_input,
                     start_index,
-                    np.cumsum(self.sel_a)[-1],
+                    self.sel_a[type_i],  # 46(O)/92(H)
                     inputs,
                     nframes,
                     natoms,
                     type_embedding=type_embedding,
-                    is_exclude=False,
-                    activation_fn=activation_fn,
-                    stddev=stddev,
-                    bavg=bavg,
-                    trainable=trainable,
+                    is_exclude=(type_input, type_i) in self.exclude_types,
                 )
-            if nvnmd_cfg.enable:
-                return filter_GR2D(xyz_scatter_1)
-            # natom x nei x outputs_size
-            # xyz_scatter = tf.concat(xyz_scatter_total, axis=1)
-            # natom x nei x 4
-            # inputs_reshape = tf.reshape(inputs, [-1, shape[1]//4, 4])
-            # natom x 4 x outputs_size
-            # xyz_scatter_1 = tf.matmul(inputs_reshape, xyz_scatter, transpose_a = True)
-            if self.original_sel is None:
-                # shape[1] = nnei * 4
-                nnei = shape[1] / 4
-            else:
-                nnei = tf.cast(
-                    tf.Variable(
-                        np.sum(self.original_sel),
-                        dtype=tf.int32,
-                        trainable=False,
-                        name="nnei",
-                    ),
-                    self.filter_precision,
-                )
-            xyz_scatter_1 = xyz_scatter_1 / nnei
-            # natom x 4 x outputs_size_2
-            xyz_scatter_2 = tf.slice(xyz_scatter_1, [0, 0, 0], [-1, -1, outputs_size_2])
-            # # natom x 3 x outputs_size_2
-            # qmat = tf.slice(xyz_scatter_2, [0,1,0], [-1, 3, -1])
-            # natom x 3 x outputs_size_1
-            qmat = tf.slice(xyz_scatter_1, [0, 1, 0], [-1, 3, -1])
-            # natom x outputs_size_1 x 3
-            qmat = tf.transpose(qmat, perm=[0, 2, 1])
-            # natom x outputs_size x outputs_size_2
-            result = tf.matmul(xyz_scatter_1, xyz_scatter_2, transpose_a=True)
-            # natom x (outputs_size x outputs_size_2)
-            result = tf.reshape(result, [-1, outputs_size_2 * outputs_size[-1]])
+                if (type_input, type_i) not in self.exclude_types:
+                    # add zero is meaningless; skip
+                    rets.append(ret)
+                start_index += self.sel_a[type_i]
+            # faster to use add_n than multiple add
+            xyz_scatter_1 = paddle.add_n(rets)
+        else:
+            xyz_scatter_1 = self._filter_lower(
+                type_i,
+                type_input,
+                start_index,
+                np.cumsum(self.sel_a)[-1],
+                inputs,
+                nframes,
+                natoms,
+                type_embedding=type_embedding,
+                is_exclude=False,
+            )
+        # natom x nei x outputs_size
+        # xyz_scatter = tf.concat(xyz_scatter_total, axis=1)
+        # natom x nei x 4
+        # inputs_reshape = tf.reshape(inputs, [-1, shape[1]//4, 4])
+        # natom x 4 x outputs_size
+        # xyz_scatter_1 = tf.matmul(inputs_reshape, xyz_scatter, transpose_a = True)
+        if self.original_sel is None:
+            # shape[1] = nnei * 4
+            nnei = shape[1] / 4
+        else:
+            nnei = paddle.cast(
+                paddle.to_tensor(
+                    np.sum(self.original_sel),
+                    dtype=paddle.int32,
+                    stop_gradient=True,
+                ),
+                self.filter_precision,
+            )
+        xyz_scatter_1 = xyz_scatter_1 / nnei
+        # natom x 4 x outputs_size_2
+        xyz_scatter_2 = paddle.slice(
+            xyz_scatter_1,
+            [0, 1, 2],
+            [0, 0, 0],
+            [xyz_scatter_1.shape[0], xyz_scatter_1.shape[1], outputs_size_2],
+        )
+        # natom x 3 x outputs_size_2
+        # qmat = tf.slice(xyz_scatter_2, [0,1,0], [-1, 3, -1])
+        # natom x 3 x outputs_size_1
+        qmat = paddle.slice(
+            xyz_scatter_1,
+            [0, 1, 2],
+            [0, 1, 0],
+            [xyz_scatter_1.shape[0], 1 + 3, xyz_scatter_1.shape[2]],
+        )
+        # natom x outputs_size_1 x 3
+        qmat = paddle.transpose(qmat, perm=[0, 2, 1])  # [64/128, M1, 3]
+        # natom x outputs_size x outputs_size_2
+        result = paddle.matmul(xyz_scatter_1, xyz_scatter_2, transpose_x=True)
+        # natom x (outputs_size x outputs_size_2)
+        result = paddle.reshape(result, [-1, outputs_size_2 * outputs_size[-1]])
 
         return result, qmat
 
diff --git a/deepmd/entrypoints/freeze.py b/deepmd/entrypoints/freeze.py
index 9f6547998f..57efbc09ce 100755
--- a/deepmd/entrypoints/freeze.py
+++ b/deepmd/entrypoints/freeze.py
@@ -8,17 +8,12 @@
 
 import json
 import logging
-from os.path import (
-    abspath,
-)
 from typing import (
     List,
     Optional,
     Union,
 )
 
-import google.protobuf.message
-
 # load grad of force module
 import deepmd.op  # noqa: F401
 from deepmd.env import (
@@ -26,12 +21,6 @@
     REMOVE_SUFFIX_DICT,
     tf,
 )
-from deepmd.nvnmd.entrypoints.freeze import (
-    save_weight,
-)
-from deepmd.utils.errors import (
-    GraphTooLargeError,
-)
 from deepmd.utils.graph import (
     get_pattern_nodes_from_graph_def,
 )
@@ -320,70 +309,66 @@ def _make_node_names(
 
 
 def freeze_graph(
-    sess,
-    input_graph,
-    input_node,
-    freeze_type,
-    modifier,
-    out_graph_name,
-    node_names=None,
-    out_suffix="",
+    model_file: str,
+    output: str,
+    # sess,
+    # input_graph,
+    # input_node,
+    # freeze_type,
+    # modifier,
+    # out_graph_name,
+    # node_names=None,
+    # out_suffix="",
 ):
     """Freeze the single graph with chosen out_suffix.
 
     Parameters
     ----------
-    sess : tf.Session
-        The default session.
-    input_graph : tf.GraphDef
-        The input graph_def stored from the checkpoint.
-    input_node : List[str]
-        The expected nodes to freeze.
-    freeze_type : str
-        The model type to freeze.
-    modifier : Optional[str], optional
-        Modifier type if any, by default None.
-    out_graph_name : str
-        The output graph.
-    node_names : Optional[str], optional
-        Names of nodes to output, by default None.
-    out_suffix : str
-        The chosen suffix to freeze in the input_graph.
+    model_file : str
+        Location of the *.pdparams file
+    output : str
+        output file name
     """
-    output_node = _make_node_names(
-        freeze_type, modifier, out_suffix=out_suffix, node_names=node_names
-    )
-    different_set = set(output_node) - set(input_node)
-    if different_set:
-        log.warning(
-            "The following nodes are not in the graph: %s. "
-            "Skip freezeing these nodes. You may be freezing "
-            "a checkpoint generated by an old version." % different_set
-        )
-        # use intersection as output list
-        output_node = list(set(output_node) & set(input_node))
-    log.info(f"The following nodes will be frozen: {output_node}")
-    # We use a built-in TF helper to export variables to constants
-    output_graph_def = tf.graph_util.convert_variables_to_constants(
-        sess,  # The session is used to retrieve the weights
-        input_graph,  # The graph_def is used to retrieve the nodes
-        output_node,  # The output node names are used to select the usefull nodes
+    import paddle
+
+    from deepmd.infer import (
+        DeepPot,
     )
-    # if multi-task, change fitting_net suffix and model_type
-    if out_suffix != "":
-        output_graph_def = _modify_model_suffix(
-            output_graph_def, out_suffix, freeze_type
-        )
 
-    # If we need to transfer the fitting net variables
-    output_graph_def = _transfer_fitting_net_trainable_variables(
-        sess, output_graph_def, input_graph
+    dp = DeepPot(
+        model_file,
+        load_prefix="load",
+        default_tf_graph=False,
+    )
+    dp.model.eval()
+    from paddle.static import (
+        InputSpec,
     )
 
-    # Finally we serialize and dump the output graph to the filesystem
-    with tf.gfile.GFile(out_graph_name, "wb") as f:
-        f.write(output_graph_def.SerializeToString())
-    log.info(f"{len(output_graph_def.node):d} ops in the final graph.")
+    st_model = paddle.jit.to_static(
+        dp.model,
+        input_spec=[
+            InputSpec(shape=[None], dtype="float64"),  # coord_
+            InputSpec(shape=[None], dtype="int32"),  # atype_
+            InputSpec(shape=[4], dtype="int32"),  # natoms
+            InputSpec(shape=[None], dtype="float64"),  # box
+            InputSpec(shape=[6], dtype="int32"),  # mesh
+            {
+                "box": InputSpec(shape=[None], dtype="float64"),
+            },
+            "",
+            False,
+        ],
+    )
+    for name, param in st_model.named_buffers():
+        print(
+            f"[{name}, {param.shape}] generated name in static_model is: {param.name}"
+        )
+    # Skip program pruning so that the registered buffers are kept in the saved files.
+    skip_prune_program = True
+    print(f"==>> Set skip_prune_program = {skip_prune_program}")
+    paddle.jit.save(st_model, output, skip_prune_program=skip_prune_program)
+    print(f"Inference model has been saved to: {output}")
 
 
 def freeze_graph_multi(
@@ -464,100 +449,22 @@ def freeze_graph_multi(
 
 def freeze(
     *,
-    checkpoint_folder: str,
+    input_file: str,
     output: str,
-    node_names: Optional[str] = None,
-    nvnmd_weight: Optional[str] = None,
-    united_model: bool = False,
     **kwargs,
 ):
     """Freeze the graph in supplied folder.
 
     Parameters
     ----------
-    checkpoint_folder : str
-        location of the folder with model
+    input_file : str
+        location of the *.pdparams file
     output : str
         output file name
-    node_names : Optional[str], optional
-        names of nodes to output, by default None
-    nvnmd_weight : Optional[str], optional
-        nvnmd weight file
-    united_model : bool
-        when in multi-task mode, freeze all nodes into one unit model
     **kwargs
         other arguments
     """
-    # We retrieve our checkpoint fullpath
-    checkpoint = tf.train.get_checkpoint_state(checkpoint_folder)
-    input_checkpoint = checkpoint.model_checkpoint_path
-
-    # expand the output file to full path
-    output_graph = abspath(output)
-
-    # Before exporting our graph, we need to precise what is our output node
-    # This is how TF decides what part of the Graph he has to keep
-    # and what part it can dump
-    # NOTE: this variable is plural, because you can have multiple output nodes
-    # node_names = "energy_test,force_test,virial_test,t_rcut"
-
-    # We clear devices to allow TensorFlow to control
-    # on which device it will load operations
-    clear_devices = True
-
-    # We import the meta graph and retrieve a Saver
-    try:
-        # In case paralle training
-        import horovod.tensorflow as _  # noqa: F401
-    except ImportError:
-        pass
-    saver = tf.train.import_meta_graph(
-        f"{input_checkpoint}.meta", clear_devices=clear_devices
+    freeze_graph(
+        input_file,
+        output,
     )
-
-    # We retrieve the protobuf graph definition
-    graph = tf.get_default_graph()
-    try:
-        input_graph_def = graph.as_graph_def()
-    except google.protobuf.message.DecodeError as e:
-        raise GraphTooLargeError(
-            "The graph size exceeds 2 GB, the hard limitation of protobuf."
-            " Then a DecodeError was raised by protobuf. You should "
-            "reduce the size of your model."
-        ) from e
-    nodes = [n.name for n in input_graph_def.node]
-
-    # We start a session and restore the graph weights
-    with tf.Session() as sess:
-        saver.restore(sess, input_checkpoint)
-        model_type = run_sess(sess, "model_attr/model_type:0", feed_dict={}).decode(
-            "utf-8"
-        )
-        if "modifier_attr/type" in nodes:
-            modifier_type = run_sess(sess, "modifier_attr/type:0", feed_dict={}).decode(
-                "utf-8"
-            )
-        else:
-            modifier_type = None
-        if nvnmd_weight is not None:
-            save_weight(sess, nvnmd_weight)  # nvnmd
-        if model_type != "multi_task":
-            freeze_graph(
-                sess,
-                input_graph_def,
-                nodes,
-                model_type,
-                modifier_type,
-                output_graph,
-                node_names,
-            )
-        else:
-            freeze_graph_multi(
-                sess,
-                input_graph_def,
-                nodes,
-                modifier_type,
-                output_graph,
-                node_names,
-                united_model=united_model,
-            )
diff --git a/deepmd/entrypoints/main.py b/deepmd/entrypoints/main.py
index 587bdaace7..baa96bd7a1 100644
--- a/deepmd/entrypoints/main.py
+++ b/deepmd/entrypoints/main.py
@@ -202,6 +202,13 @@ def main_parser() -> argparse.ArgumentParser:
         help="Skip calculating neighbor statistics. Sel checking, automatic sel, and model compression will be disabled.",
     )
 
+    parser_train.add_argument(
+        "--cpu",
+        action="store_true",
+        default=False,
+        help="Training on CPU",
+    )
+
     # * freeze script ******************************************************************
     parser_frz = subparsers.add_parser(
         "freeze",
@@ -217,8 +224,8 @@ def main_parser() -> argparse.ArgumentParser:
         ),
     )
     parser_frz.add_argument(
-        "-c",
-        "--checkpoint-folder",
+        "-i",
+        "--input_file",
         type=str,
         default=".",
         help="path to checkpoint folder",
@@ -230,26 +237,26 @@ def main_parser() -> argparse.ArgumentParser:
         default="frozen_model.pb",
         help="name of graph, will output to the checkpoint folder",
     )
-    parser_frz.add_argument(
-        "-n",
-        "--node-names",
-        type=str,
-        default=None,
-        help="the frozen nodes, if not set, determined from the model type",
-    )
-    parser_frz.add_argument(
-        "-w",
-        "--nvnmd-weight",
-        type=str,
-        default=None,
-        help="the name of weight file (.npy), if set, save the model's weight into the file",
-    )
-    parser_frz.add_argument(
-        "--united-model",
-        action="store_true",
-        default=False,
-        help="When in multi-task mode, freeze all nodes into one united model",
-    )
+    # parser_frz.add_argument(
+    #     "-n",
+    #     "--node-names",
+    #     type=str,
+    #     default=None,
+    #     help="the frozen nodes, if not set, determined from the model type",
+    # )
+    # parser_frz.add_argument(
+    #     "-w",
+    #     "--nvnmd-weight",
+    #     type=str,
+    #     default=None,
+    #     help="the name of weight file (.npy), if set, save the model's weight into the file",
+    # )
+    # parser_frz.add_argument(
+    #     "--united-model",
+    #     action="store_true",
+    #     default=False,
+    #     help="When in multi-task mode, freeze all nodes into one united model",
+    # )
 
     # * test script ********************************************************************
     parser_tst = subparsers.add_parser(
diff --git a/deepmd/entrypoints/train.py b/deepmd/entrypoints/train.py
index c806fb3804..1d295e091d 100755
--- a/deepmd/entrypoints/train.py
+++ b/deepmd/entrypoints/train.py
@@ -19,7 +19,6 @@
     j_must_have,
 )
 from deepmd.env import (
-    GLOBAL_ENER_FLOAT_PRECISION,
     reset_default_tf_session_config,
     tf,
 )
@@ -112,6 +111,12 @@ def train(
     RuntimeError
         if distributed training job name is wrong
     """
+    if kwargs.get("cpu", False):
+        import paddle
+
+        paddle.set_device("cpu")
+        print("[NOTE]", "=" * 10, "Running paddle code on CPU", "=" * 10)
+
     run_opt = RunOptions(
         init_model=init_model,
         restart=restart,
@@ -275,7 +280,7 @@ def _do_work(jdata: Dict[str, Any], run_opt: RunOptions, is_compress: bool = Fal
     if not is_compress:
         # train the model with the provided systems in a cyclic way
         start_time = time.time()
-        model.train(train_data, valid_data)
+        model.train(train_data, valid_data, stop_batch)
         end_time = time.time()
         log.info("finished training")
         log.info(f"wall time: {(end_time - start_time):.3f} s")
@@ -413,15 +418,6 @@ def get_nbor_stat(jdata, rcut, one_type: bool = False):
 
     min_nbor_dist, max_nbor_size = neistat.get_stat(train_data)
 
-    # moved from traier.py as duplicated
-    # TODO: this is a simple fix but we should have a clear
-    #       architecture to call neighbor stat
-    tf.constant(
-        min_nbor_dist,
-        name="train_attr/min_nbor_dist",
-        dtype=GLOBAL_ENER_FLOAT_PRECISION,
-    )
-    tf.constant(max_nbor_size, name="train_attr/max_nbor_size", dtype=tf.int32)
     return min_nbor_dist, max_nbor_size
 
 
@@ -468,7 +464,7 @@ def update_one_sel(jdata, descriptor):
         return descriptor
     rcut = descriptor["rcut"]
     tmp_sel = get_sel(jdata, rcut, one_type=descriptor["type"] in ("se_atten",))
-    sel = descriptor["sel"]
+    sel = descriptor["sel"]  # [46, 92]
     if isinstance(sel, int):
         # convert to list and finnally convert back to int
         sel = [sel]
diff --git a/deepmd/env.py b/deepmd/env.py
index 2917fff1e8..044301c628 100644
--- a/deepmd/env.py
+++ b/deepmd/env.py
@@ -67,6 +67,7 @@ def dlopen_library(module: str, filename: str):
 
 # import tensorflow v1 compatability
 try:
+    import paddle
     import tensorflow.compat.v1 as tf
 
     tf.disable_v2_behavior()
@@ -105,6 +106,7 @@ def dlopen_library(module: str, filename: str):
 # Python library version
+pd_py_version = getattr(paddle.version, "commit", paddle.__version__)
 try:
     tf_py_version = tf.version.VERSION
 except AttributeError:
     tf_py_version = tf.__version__
 
@@ -370,7 +372,8 @@ def get_module(module_name: str) -> "ModuleType":
         raise FileNotFoundError(f"module {module_name} does not exist")
     else:
         try:
-            module = tf.load_op_library(str(module_file))
+            import paddle_deepmd_lib as module
+
         except tf.errors.NotFoundError as e:
             # check CXX11_ABI_FLAG is compatiblity
             # see https://gcc.gnu.org/onlinedocs/libstdc++/manual/using_dual_abi.html
@@ -452,9 +455,9 @@ def _get_package_constants(
 
 
 GLOBAL_CONFIG = _get_package_constants()
-MODEL_VERSION = GLOBAL_CONFIG["model_version"]
-TF_VERSION = GLOBAL_CONFIG["tf_version"]
-TF_CXX11_ABI_FLAG = int(GLOBAL_CONFIG["tf_cxx11_abi_flag"])
+MODEL_VERSION = 0
+TF_VERSION = 0
+TF_CXX11_ABI_FLAG = 0
 
 op_module = get_module("deepmd_op")
 op_grads_module = get_module("op_grads")
@@ -464,13 +467,15 @@ def _get_package_constants(
 if dp_float_prec in ("high", ""):
     # default is high
     GLOBAL_TF_FLOAT_PRECISION = tf.float64
+    GLOBAL_PD_FLOAT_PRECISION = paddle.float64
     GLOBAL_NP_FLOAT_PRECISION = np.float64
     GLOBAL_ENER_FLOAT_PRECISION = np.float64
     global_float_prec = "double"
 elif dp_float_prec == "low":
     GLOBAL_TF_FLOAT_PRECISION = tf.float32
+    GLOBAL_PD_FLOAT_PRECISION = paddle.float32
     GLOBAL_NP_FLOAT_PRECISION = np.float32
-    GLOBAL_ENER_FLOAT_PRECISION = np.float64
+    GLOBAL_ENER_FLOAT_PRECISION = np.float32
     global_float_prec = "float"
 else:
     raise RuntimeError(
@@ -496,17 +501,33 @@ def global_cvt_2_tf_float(xx: tf.Tensor) -> tf.Tensor:
     return tf.cast(xx, GLOBAL_TF_FLOAT_PRECISION)
 
 
-def global_cvt_2_ener_float(xx: tf.Tensor) -> tf.Tensor:
+def global_cvt_2_pd_float(xx: paddle.Tensor) -> paddle.Tensor:
+    """Cast tensor to globally set Paddle precision.
+
+    Parameters
+    ----------
+    xx : paddle.Tensor
+        input tensor
+
+    Returns
+    -------
+    paddle.Tensor
+        output tensor cast to `GLOBAL_PD_FLOAT_PRECISION`
+    """
+    return paddle.cast(xx, GLOBAL_PD_FLOAT_PRECISION)
+
+
+def global_cvt_2_ener_float(xx: paddle.Tensor) -> paddle.Tensor:
     """Cast tensor to globally set energy precision.
 
     Parameters
     ----------
-    xx : tf.Tensor
+    xx : paddle.Tensor
         input tensor
 
     Returns
     -------
-    tf.Tensor
+    paddle.Tensor
         output tensor cast to `GLOBAL_ENER_FLOAT_PRECISION`
     """
-    return tf.cast(xx, GLOBAL_ENER_FLOAT_PRECISION)
+    return paddle.cast(xx, GLOBAL_ENER_FLOAT_PRECISION)
diff --git a/deepmd/fit/ener.py b/deepmd/fit/ener.py
index f482173495..27bf6a2105 100644
--- a/deepmd/fit/ener.py
+++ b/deepmd/fit/ener.py
@@ -5,30 +5,24 @@
 )
 
 import numpy as np
+from paddle import (
+    nn,
+)
 
 from deepmd.common import (
     add_data_requirement,
-    cast_precision,
     get_activation_func,
     get_precision,
 )
 from deepmd.env import (
-    GLOBAL_TF_FLOAT_PRECISION,
-    global_cvt_2_tf_float,
+    GLOBAL_PD_FLOAT_PRECISION,
+    global_cvt_2_pd_float,
+    paddle,
     tf,
 )
-from deepmd.fit.fitting import (
-    Fitting,
-)
 from deepmd.infer import (
     DeepPotential,
 )
-from deepmd.nvnmd.fit.ener import (
-    one_layer_nvnmd,
-)
-from deepmd.nvnmd.utils.config import (
-    nvnmd_cfg,
-)
 from deepmd.utils.errors import (
     GraphWithoutTensorError,
 )
@@ -36,7 +30,7 @@
     get_fitting_net_variables_from_graph_def,
     get_tensor_by_name_from_graph,
 )
-from deepmd.utils.network import one_layer as one_layer_deepmd
+from deepmd.utils.network import OneLayer as OneLayer_deepmd
 from deepmd.utils.network import (
     one_layer_rand_seed_shift,
 )
@@ -47,8 +41,8 @@
 log = logging.getLogger(__name__)
 
 
-@Fitting.register("ener")
-class EnerFitting(Fitting):
+# @Fitting.register("ener")
+class EnerFitting(nn.Layer):
     r"""Fitting the energy of the system. The force and the virial can also be trained.
 
     The potential energy :math:`E` is a fitting network function of the descriptor :math:`\mathcal{D}`:
@@ -121,7 +115,7 @@ class EnerFitting(Fitting):
 
     def __init__(
         self,
-        descrpt: tf.Tensor,
+        descrpt: paddle.Tensor,
         neuron: List[int] = [120, 120, 120],
         resnet_dt: bool = True,
         numb_fparam: int = 0,
@@ -138,10 +132,11 @@ def __init__(
         use_aparam_as_mask: bool = False,
         spin: Optional[Spin] = None,
     ) -> None:
+        super().__init__(name_scope="EnerFitting")
         """Constructor."""
         # model param
         self.ntypes = descrpt.get_ntypes()
-        self.dim_descrpt = descrpt.get_dim_out()
+        self.dim_descrpt = descrpt.get_dim_out()  # M1*M2
         self.use_aparam_as_mask = use_aparam_as_mask
         # args = ()\
         #        .add('numb_fparam',      int,    default = 0)\
@@ -156,7 +151,9 @@ def __init__(
         #        .add("precision",           str, default = "default")\
         #        .add("trainable",        [list, bool], default = True)
         self.numb_fparam = numb_fparam
+        self.register_buffer("buffer_dfparam", paddle.to_tensor(self.numb_fparam))
         self.numb_aparam = numb_aparam
+        self.register_buffer("buffer_daparam", paddle.to_tensor(self.numb_aparam))
         self.n_neuron = neuron
         self.resnet_dt = resnet_dt
         self.rcond = rcond
@@ -180,13 +177,15 @@ def __init__(
         self.atom_ener_v = atom_ener
         for at, ae in enumerate(atom_ener):
             if ae is not None:
-                self.atom_ener.append(
-                    tf.constant(ae, GLOBAL_TF_FLOAT_PRECISION, name="atom_%d_ener" % at)
-                )
+                self.atom_ener.append(paddle.to_tensor(ae, GLOBAL_PD_FLOAT_PRECISION))
             else:
                 self.atom_ener.append(None)
         self.useBN = False
         self.bias_atom_e = np.zeros(self.ntypes, dtype=np.float64)
+        self.register_buffer(
+            "t_bias_atom_e",
+            paddle.to_tensor(self.bias_atom_e),
+        )
         # data requirement
         if self.numb_fparam > 0:
             add_data_requirement(
@@ -212,6 +211,91 @@ def __init__(
                 len(self.layer_name) == len(self.n_neuron) + 1
             ), "length of layer_name should be that of n_neuron + 1"
 
+        type_suffix = ""
+        suffix = ""
+        self.one_layers = nn.LayerList()
+        self.final_layers = nn.LayerList()
+        ntypes_atom = self.ntypes - self.ntypes_spin
+        for type_i in range(0, ntypes_atom):
+            type_i_layers = nn.LayerList()
+            for ii in range(0, len(self.n_neuron)):
+                if self.layer_name is not None and self.layer_name[ii] is not None:
+                    layer_suffix = "share_" + self.layer_name[ii] + type_suffix
+                else:
+                    layer_suffix = "layer_" + str(ii) + type_suffix + suffix
+
+                if ii >= 1 and self.n_neuron[ii] == self.n_neuron[ii - 1]:
+                    type_i_layers.append(
+                        OneLayer_deepmd(
+                            self.n_neuron[ii - 1],
+                            self.n_neuron[ii],
+                            activation_fn=self.fitting_activation_fn,
+                            precision=self.fitting_precision,
+                            name=layer_suffix,
+                            seed=self.seed,
+                            use_timestep=self.resnet_dt,
+                            trainable=self.trainable[ii],
+                        )
+                    )
+                else:
+                    type_i_layers.append(
+                        OneLayer_deepmd(
+                            self.dim_descrpt + self.numb_fparam + self.numb_aparam,
+                            self.n_neuron[ii],
+                            activation_fn=self.fitting_activation_fn,
+                            precision=self.fitting_precision,
+                            name=layer_suffix,
+                            seed=self.seed,
+                            trainable=self.trainable[ii],
+                        )
+                    )
+                if (not self.uniform_seed) and (self.seed is not None):
+                    self.seed += self.seed_shift
+
+            self.one_layers.append(type_i_layers)
+            self.final_layers.append(
+                OneLayer_deepmd(
+                    self.n_neuron[-1],
+                    1,
+                    activation_fn=None,
+                    precision=self.fitting_precision,
+                    bavg=self.bias_atom_e,  # NOTE(review): array of length ntypes here; the removed TF code passed _build_lower's bias_atom_e argument - confirm
+                    name=layer_suffix,  # NOTE(review): still the last hidden layer's suffix; the removed TF code used "final_layer" + type_suffix + suffix - confirm
+                    seed=self.seed,
+                    trainable=self.trainable[-1],
+                )
+            )
+
+        if self.numb_fparam > 0:
+            if self.fparam_avg is None:
+                self.fparam_avg = 0.0
+            if self.fparam_inv_std is None:
+                self.fparam_inv_std = 1.0
+        if self.numb_aparam > 0:
+            if self.aparam_avg is None:
+                self.aparam_avg = 0.0
+            if self.aparam_inv_std is None:
+                self.aparam_inv_std = 1.0
+
+        if self.numb_fparam > 0:
+            self.register_buffer(
+                "t_fparam_avg",
+                paddle.to_tensor(self.fparam_avg),
+            )
+            self.register_buffer(
+                "t_fparam_istd",
+                paddle.to_tensor(self.fparam_inv_std),
+            )
+        if self.numb_aparam > 0:
+            self.register_buffer(
+                "t_aparam_avg",
+                paddle.to_tensor(self.aparam_avg),
+            )
+            self.register_buffer(
+                "t_aparam_istd",
+                paddle.to_tensor(self.aparam_inv_std),
+            )
+
     def get_numb_fparam(self) -> int:
         """Get the number of frame parameters."""
         return self.numb_fparam
@@ -237,6 +321,7 @@ def compute_output_stats(self, all_stat: dict, mixed_type: bool = False) -> None
         self.bias_atom_e = self._compute_output_stats(
             all_stat, rcond=self.rcond, mixed_type=mixed_type
         )
+        paddle.assign(self.bias_atom_e, self.t_bias_atom_e)
 
     def _compute_output_stats(self, all_stat, rcond=1e-3, mixed_type=False):
         data = all_stat["energy"]
@@ -335,7 +420,7 @@ def compute_input_stats(self, all_stat: dict, protection: float = 1e-2) -> None:
     def _compute_std(self, sumv2, sumv, sumn):
         return np.sqrt(sumv2 / sumn - np.multiply(sumv / sumn, sumv / sumn))
 
-    @cast_precision
+    # @cast_precision
     def _build_lower(
         self,
         start_index,
@@ -346,109 +431,64 @@ def _build_lower(
         bias_atom_e=0.0,
         type_suffix="",
         suffix="",
-        reuse=None,
+        type_i=None,
     ):
         # cut-out inputs
-        inputs_i = tf.slice(inputs, [0, start_index, 0], [-1, natoms, -1])
-        inputs_i = tf.reshape(inputs_i, [-1, self.dim_descrpt])
+        inputs_i = paddle.slice(
+            inputs,
+            [0, 1, 2],
+            [0, start_index, 0],
+            [inputs.shape[0], start_index + natoms, inputs.shape[2]],
+        )
+        inputs_i = paddle.reshape(inputs_i, [-1, self.dim_descrpt])  # [natoms, M1*M2]
         layer = inputs_i
         if fparam is not None:
-            ext_fparam = tf.tile(fparam, [1, natoms])
-            ext_fparam = tf.reshape(ext_fparam, [-1, self.numb_fparam])
-            ext_fparam = tf.cast(ext_fparam, self.fitting_precision)
-            layer = tf.concat([layer, ext_fparam], axis=1)
+            ext_fparam = paddle.tile(fparam, [1, natoms])
+            ext_fparam = paddle.reshape(ext_fparam, [-1, self.numb_fparam])
+            ext_fparam = paddle.cast(ext_fparam, self.fitting_precision)
+            layer = paddle.concat([layer, ext_fparam], axis=1)
         if aparam is not None:
-            ext_aparam = tf.slice(
+            ext_aparam = paddle.slice(
                 aparam,
+                [0, 1],
                 [0, start_index * self.numb_aparam],
-                [-1, natoms * self.numb_aparam],
+                [
+                    aparam.shape[0],
+                    start_index * self.numb_aparam + natoms * self.numb_aparam,
+                ],
             )
-            ext_aparam = tf.reshape(ext_aparam, [-1, self.numb_aparam])
-            ext_aparam = tf.cast(ext_aparam, self.fitting_precision)
-            layer = tf.concat([layer, ext_aparam], axis=1)
+            ext_aparam = paddle.reshape(ext_aparam, [-1, self.numb_aparam])
+            ext_aparam = paddle.cast(ext_aparam, self.fitting_precision)
+            layer = paddle.concat([layer, ext_aparam], axis=1)
 
-        if nvnmd_cfg.enable:
-            one_layer = one_layer_nvnmd
-        else:
-            one_layer = one_layer_deepmd
         for ii in range(0, len(self.n_neuron)):
-            if self.layer_name is not None and self.layer_name[ii] is not None:
-                layer_suffix = "share_" + self.layer_name[ii] + type_suffix
-                layer_reuse = tf.AUTO_REUSE
-            else:
-                layer_suffix = "layer_" + str(ii) + type_suffix + suffix
-                layer_reuse = reuse
             if ii >= 1 and self.n_neuron[ii] == self.n_neuron[ii - 1]:
-                layer += one_layer(
-                    layer,
-                    self.n_neuron[ii],
-                    name=layer_suffix,
-                    reuse=layer_reuse,
-                    seed=self.seed,
-                    use_timestep=self.resnet_dt,
-                    activation_fn=self.fitting_activation_fn,
-                    precision=self.fitting_precision,
-                    trainable=self.trainable[ii],
-                    uniform_seed=self.uniform_seed,
-                    initial_variables=self.fitting_net_variables,
-                    mixed_prec=self.mixed_prec,
-                )
+                layer += self.one_layers[type_i][ii](layer)
             else:
-                layer = one_layer(
-                    layer,
-                    self.n_neuron[ii],
-                    name=layer_suffix,
-                    reuse=layer_reuse,
-                    seed=self.seed,
-                    activation_fn=self.fitting_activation_fn,
-                    precision=self.fitting_precision,
-                    trainable=self.trainable[ii],
-                    uniform_seed=self.uniform_seed,
-                    initial_variables=self.fitting_net_variables,
-                    mixed_prec=self.mixed_prec,
-                )
+                layer = self.one_layers[type_i][ii](layer)
             if (not self.uniform_seed) and (self.seed is not None):
                 self.seed += self.seed_shift
-        if self.layer_name is not None and self.layer_name[-1] is not None:
-            layer_suffix = "share_" + self.layer_name[-1] + type_suffix
-            layer_reuse = tf.AUTO_REUSE
-        else:
-            layer_suffix = "final_layer" + type_suffix + suffix
-            layer_reuse = reuse
-        final_layer = one_layer(
-            layer,
-            1,
-            activation_fn=None,
-            bavg=bias_atom_e,
-            name=layer_suffix,
-            reuse=layer_reuse,
-            seed=self.seed,
-            precision=self.fitting_precision,
-            trainable=self.trainable[-1],
-            uniform_seed=self.uniform_seed,
-            initial_variables=self.fitting_net_variables,
-            mixed_prec=self.mixed_prec,
-            final_layer=True,
-        )
+
+        final_layer = self.final_layers[type_i](layer)
         if (not self.uniform_seed) and (self.seed is not None):
             self.seed += self.seed_shift
 
         return final_layer
 
-    def build(
+    def forward(
         self,
-        inputs: tf.Tensor,
-        natoms: tf.Tensor,
+        inputs: paddle.Tensor,
+        natoms: paddle.Tensor,
         input_dict: Optional[dict] = None,
         reuse: Optional[bool] = None,
         suffix: str = "",
-    ) -> tf.Tensor:
+    ) -> paddle.Tensor:
         """Build the computational graph for fitting net.
 
         Parameters
         ----------
         inputs
-            The input descriptor
+            The input descriptor, [1, all_atoms, M1*M2]
         input_dict
             Additional dict for inputs.
             if numb_fparam > 0, should have input_dict['fparam']
@@ -504,59 +544,18 @@ def build(
                     self.bias_atom_e[type_i] = self.bias_atom_e[type_i]
             self.bias_atom_e = self.bias_atom_e[:ntypes_atom]
 
-        with tf.variable_scope("fitting_attr" + suffix, reuse=reuse):
-            t_dfparam = tf.constant(self.numb_fparam, name="dfparam", dtype=tf.int32)
-            t_daparam = tf.constant(self.numb_aparam, name="daparam", dtype=tf.int32)
-            self.t_bias_atom_e = tf.get_variable(
-                "t_bias_atom_e",
-                self.bias_atom_e.shape,
-                dtype=GLOBAL_TF_FLOAT_PRECISION,
-                trainable=False,
-                initializer=tf.constant_initializer(self.bias_atom_e),
-            )
-            if self.numb_fparam > 0:
-                t_fparam_avg = tf.get_variable(
-                    "t_fparam_avg",
-                    self.numb_fparam,
-                    dtype=GLOBAL_TF_FLOAT_PRECISION,
-                    trainable=False,
-                    initializer=tf.constant_initializer(self.fparam_avg),
-                )
-                t_fparam_istd = tf.get_variable(
-                    "t_fparam_istd",
-                    self.numb_fparam,
-                    dtype=GLOBAL_TF_FLOAT_PRECISION,
-                    trainable=False,
-                    initializer=tf.constant_initializer(self.fparam_inv_std),
-                )
-            if self.numb_aparam > 0:
-                t_aparam_avg = tf.get_variable(
-                    "t_aparam_avg",
-                    self.numb_aparam,
-                    dtype=GLOBAL_TF_FLOAT_PRECISION,
-                    trainable=False,
-                    initializer=tf.constant_initializer(self.aparam_avg),
-                )
-                t_aparam_istd = tf.get_variable(
-                    "t_aparam_istd",
-                    self.numb_aparam,
-                    dtype=GLOBAL_TF_FLOAT_PRECISION,
-                    trainable=False,
-                    initializer=tf.constant_initializer(self.aparam_inv_std),
-                )
-
-        inputs = tf.reshape(inputs, [-1, natoms[0], self.dim_descrpt])
+        inputs = paddle.reshape(inputs, [-1, natoms[0], self.dim_descrpt])
         if len(self.atom_ener):
             # only for atom_ener
             nframes = input_dict.get("nframes")
             if nframes is not None:
                 # like inputs, but we don't want to add a dependency on inputs
-                inputs_zero = tf.zeros(
+                inputs_zero = paddle.zeros(
                     (nframes, natoms[0], self.dim_descrpt),
-                    dtype=GLOBAL_TF_FLOAT_PRECISION,
+                    dtype=GLOBAL_PD_FLOAT_PRECISION,
                 )
             else:
-                inputs_zero = tf.zeros_like(inputs, dtype=GLOBAL_TF_FLOAT_PRECISION)
+                inputs_zero = paddle.zeros_like(inputs, dtype=GLOBAL_PD_FLOAT_PRECISION)
 
         if bias_atom_e is not None:
             assert len(bias_atom_e) == self.ntypes
@@ -564,37 +563,42 @@ def build(
         fparam = None
         if self.numb_fparam > 0:
             fparam = input_dict["fparam"]
-            fparam = tf.reshape(fparam, [-1, self.numb_fparam])
-            fparam = (fparam - t_fparam_avg) * t_fparam_istd
+            fparam = paddle.reshape(fparam, [-1, self.numb_fparam])
+            fparam = (fparam - self.t_fparam_avg) * self.t_fparam_istd
 
         aparam = None
         if not self.use_aparam_as_mask:
             if self.numb_aparam > 0:
                 aparam = input_dict["aparam"]
-                aparam = tf.reshape(aparam, [-1, self.numb_aparam])
-                aparam = (aparam - t_aparam_avg) * t_aparam_istd
-                aparam = tf.reshape(aparam, [-1, self.numb_aparam * natoms[0]])
+                aparam = paddle.reshape(aparam, [-1, self.numb_aparam])
+                aparam = (aparam - self.t_aparam_avg) * self.t_aparam_istd
+                aparam = paddle.reshape(aparam, [-1, self.numb_aparam * natoms[0]])
 
-        atype_nall = tf.reshape(atype, [-1, natoms[1]])
-        self.atype_nloc = tf.slice(
-            atype_nall, [0, 0], [-1, natoms[0]]
+        atype_nall = paddle.reshape(atype, [-1, natoms[1]])
+        self.atype_nloc = paddle.slice(
+            atype_nall, [0, 1], [0, 0], [atype_nall.shape[0], natoms[0]]
         )  ## lammps will make error
-        atype_filter = tf.cast(self.atype_nloc >= 0, GLOBAL_TF_FLOAT_PRECISION)
-        self.atype_nloc = tf.reshape(self.atype_nloc, [-1])
+        atype_filter = paddle.cast(self.atype_nloc >= 0, GLOBAL_PD_FLOAT_PRECISION)
+        self.atype_nloc = paddle.reshape(self.atype_nloc, [-1])
         # prevent embedding_lookup error,
         # but the filter will be applied anyway
-        self.atype_nloc = tf.clip_by_value(self.atype_nloc, 0, self.ntypes - 1)
+        self.atype_nloc = paddle.clip(self.atype_nloc, 0, self.ntypes - 1)
 
         ## if spin is used
         if self.spin is not None:
-            self.atype_nloc = tf.slice(
-                atype_nall, [0, 0], [-1, tf.reduce_sum(natoms[2 : 2 + ntypes_atom])]
+            # paddle.slice takes end indices, not sizes: an end of -1 would drop
+            # the last row, so slice axis 0 up to its full extent instead.
+            n_atom_real = paddle.sum(natoms[2 : 2 + ntypes_atom]).item()
+            self.atype_nloc = paddle.slice(
+                atype_nall, [0, 1], [0, 0], [atype_nall.shape[0], n_atom_real]
+            )
-            atype_filter = tf.cast(self.atype_nloc >= 0, GLOBAL_TF_FLOAT_PRECISION)
-            self.atype_nloc = tf.reshape(self.atype_nloc, [-1])
+            atype_filter = paddle.cast(self.atype_nloc >= 0, GLOBAL_PD_FLOAT_PRECISION)
+            self.atype_nloc = paddle.reshape(self.atype_nloc, [-1])
 
         if type_embedding is not None:
-            atype_embed = tf.nn.embedding_lookup(type_embedding, self.atype_nloc)
+            atype_embed = paddle.nn.functional.embedding(
+                self.atype_nloc, type_embedding
+            )
         else:
             atype_embed = None
 
@@ -613,7 +617,7 @@ def build(
                     bias_atom_e=0.0,
                     type_suffix="_type_" + str(type_i),
                     suffix=suffix,
-                    reuse=reuse,
+                    type_i=type_i,
                 )
                 # concat the results
                 if type_i < len(self.atom_ener) and self.atom_ener[type_i] is not None:
@@ -626,62 +630,57 @@ def build(
                         bias_atom_e=0.0,
                         type_suffix="_type_" + str(type_i),
                         suffix=suffix,
-                        reuse=True,
+                        type_i=type_i,
                     )
                     final_layer -= zero_layer
-                final_layer = tf.reshape(
-                    final_layer, [tf.shape(inputs)[0], natoms[2 + type_i]]
+                final_layer = paddle.reshape(
+                    final_layer, [paddle.shape(inputs)[0], natoms[2 + type_i]]
                 )
                 outs_list.append(final_layer)
                 start_index += natoms[2 + type_i]
             # concat the results
             # concat once may be faster than multiple concat
-            outs = tf.concat(outs_list, axis=1)
+            outs = paddle.concat(outs_list, axis=1)
         # with type embedding
         else:
-            atype_embed = tf.cast(atype_embed, GLOBAL_TF_FLOAT_PRECISION)
-            type_shape = atype_embed.get_shape().as_list()
-            inputs = tf.concat(
-                [tf.reshape(inputs, [-1, self.dim_descrpt]), atype_embed], axis=1
+            atype_embed = paddle.cast(atype_embed, GLOBAL_PD_FLOAT_PRECISION)
+            type_shape = atype_embed.shape
+            inputs = paddle.concat(
+                [paddle.reshape(inputs, [-1, self.dim_descrpt]), atype_embed], axis=1
             )
             original_dim_descrpt = self.dim_descrpt
             self.dim_descrpt = self.dim_descrpt + type_shape[1]
-            inputs = tf.reshape(inputs, [-1, natoms[0], self.dim_descrpt])
-            final_layer = self._build_lower(
-                0,
-                natoms[0],
-                inputs,
-                fparam,
-                aparam,
-                bias_atom_e=0.0,
-                suffix=suffix,
-                reuse=reuse,
-            )
+            inputs = paddle.reshape(inputs, [-1, natoms[0], self.dim_descrpt])
+            final_layer = inputs
+            for layer_j in range(0 * ntypes_atom, (0 + 1) * ntypes_atom):
+                final_layer = self.one_layers[layer_j](final_layer)
+            final_layer = self.final_layers[0](final_layer)
             if len(self.atom_ener):
                 # remove contribution in vacuum
-                inputs_zero = tf.concat(
-                    [tf.reshape(inputs_zero, [-1, original_dim_descrpt]), atype_embed],
+                inputs_zero = paddle.concat(
+                    [
+                        paddle.reshape(inputs_zero, [-1, original_dim_descrpt]),
+                        atype_embed,
+                    ],
                     axis=1,
                 )
-                inputs_zero = tf.reshape(inputs_zero, [-1, natoms[0], self.dim_descrpt])
-                zero_layer = self._build_lower(
-                    0,
-                    natoms[0],
-                    inputs_zero,
-                    fparam,
-                    aparam,
-                    bias_atom_e=0.0,
-                    suffix=suffix,
-                    reuse=True,
+                inputs_zero = paddle.reshape(
+                    inputs_zero, [-1, natoms[0], self.dim_descrpt]
                 )
-                # atomic energy will be stored in `self.t_bias_atom_e` which is not trainable
+                zero_layer = inputs_zero
+                for layer_j in range(0 * ntypes_atom, (0 + 1) * ntypes_atom):
+                    zero_layer = self.one_layers[layer_j](zero_layer)
+                zero_layer = self.final_layers[0](zero_layer)
+
                 final_layer -= zero_layer
-            outs = tf.reshape(final_layer, [tf.shape(inputs)[0], natoms[0]])
+            outs = paddle.reshape(final_layer, [paddle.shape(inputs)[0], natoms[0]])
         # add bias
         self.atom_ener_before = outs * atype_filter
-        self.add_type = tf.reshape(
-            tf.nn.embedding_lookup(self.t_bias_atom_e, self.atype_nloc),
-            [tf.shape(inputs)[0], tf.reduce_sum(natoms[2 : 2 + ntypes_atom])],
+        self.add_type = paddle.reshape(
+            paddle.nn.functional.embedding(
+                self.atype_nloc, self.t_bias_atom_e.reshape([2, -1])
+            ),
+            [paddle.shape(inputs)[0], paddle.sum(natoms[2 : 2 + ntypes_atom]).item()],
         )
         outs = outs + self.add_type
         outs *= atype_filter
@@ -689,19 +688,19 @@ def build(
 
         if self.tot_ener_zero:
             force_tot_ener = 0.0
-            outs = tf.reshape(outs, [-1, tf.reduce_sum(natoms[2 : 2 + ntypes_atom])])
-            outs_mean = tf.reshape(tf.reduce_mean(outs, axis=1), [-1, 1])
-            outs_mean = outs_mean - tf.ones_like(
-                outs_mean, dtype=GLOBAL_TF_FLOAT_PRECISION
+            outs = paddle.reshape(
+                outs, [-1, paddle.sum(natoms[2 : 2 + ntypes_atom]).item()]
+            )
+            outs_mean = paddle.reshape(paddle.mean(outs, axis=1), [-1, 1])
+            outs_mean = outs_mean - paddle.ones_like(
+                outs_mean, dtype=GLOBAL_PD_FLOAT_PRECISION
             ) * (
                 force_tot_ener
-                / global_cvt_2_tf_float(tf.reduce_sum(natoms[2 : 2 + ntypes_atom]))
+                / global_cvt_2_pd_float(paddle.sum(natoms[2 : 2 + ntypes_atom]))
             )
             outs = outs - outs_mean
-            outs = tf.reshape(outs, [-1])
-
-        tf.summary.histogram("fitting_net_output", outs)
-        return tf.reshape(outs, [-1])
+            outs = paddle.reshape(outs, [-1])
+        return paddle.reshape(outs, [-1])  # [all_atoms]
 
     def init_variables(
         self,
diff --git a/deepmd/infer/deep_eval.py b/deepmd/infer/deep_eval.py
index 799cd6fd3b..27c6af754f 100644
--- a/deepmd/infer/deep_eval.py
+++ b/deepmd/infer/deep_eval.py
@@ -10,11 +10,22 @@
 
 import numpy as np
 
+from deepmd.common import (
+    j_loader,
+    j_must_have,
+)
+from deepmd.descriptor import (
+    DescrptSeA,
+)
 from deepmd.env import (
     MODEL_VERSION,
     default_tf_session_config,
+    paddle,
     tf,
 )
+from deepmd.model import (
+    EnerModel,
+)
 from deepmd.utils.batch_size import (
     AutoBatchSize,
 )
@@ -53,18 +64,95 @@ def __init__(
         default_tf_graph: bool = False,
         auto_batch_size: Union[bool, int, AutoBatchSize] = False,
     ):
-        self.graph = self._load_graph(
-            model_file, prefix=load_prefix, default_tf_graph=default_tf_graph
+        jdata = j_loader("input.json")
+        model_param = j_must_have(jdata, "model")
+
+        descrpt_param = j_must_have(model_param, "descriptor")
+        explicit_ntypes_descrpt = ["se_atten"]
+        # hybrid_with_tebd = False
+        if descrpt_param["type"] in explicit_ntypes_descrpt:
+            descrpt_param["ntypes"] = len(model_param["type_map"])
+        elif descrpt_param["type"] == "hybrid":
+            for descrpt_item in descrpt_param["list"]:
+                if descrpt_item["type"] in explicit_ntypes_descrpt:
+                    descrpt_item["ntypes"] = len(model_param["type_map"])
+                    # hybrid_with_tebd = True
+
+        # if descrpt_param["type"] in ["se_e2_a", "se_a", "se_e2_r", "se_r", "hybrid"]:
+        descrpt_param["spin"] = None
+        descrpt_param["type_one_side"] = False
+
+        descrpt_param.pop("type", None)
+        descrpt_param.pop("_comment", None)
+        self.spin = None
+        # descrpt_param["spin"] = self.spin
+        self.descrpt = DescrptSeA(**descrpt_param)
+
+        self.multi_task_mode = "fitting_net_dict" in model_param
+        fitting_param = (
+            j_must_have(model_param, "fitting_net")
+            if not self.multi_task_mode
+            else j_must_have(model_param, "fitting_net_dict")
+        )
+        from deepmd.fit import (
+            EnerFitting,
+        )
+
+        # fitting_param.pop("type", None)
+        fitting_param.pop("_comment", None)
+        fitting_param["descrpt"] = self.descrpt
+        self.fitting = EnerFitting(**fitting_param)
+
+        self.typeebd = None
+
+        self.model = EnerModel(
+            self.descrpt,
+            self.fitting,
+            self.typeebd,
+            model_param.get("type_map"),
+            model_param.get("data_stat_nbatch", 10),
+            model_param.get("data_stat_protect", 1e-2),
+            model_param.get("use_srtab"),
+            model_param.get("smin_alpha"),
+            model_param.get("sw_rmin"),
+            model_param.get("sw_rmax"),
+            self.spin,
         )
+        # Two on-disk formats are accepted, told apart by the file extension.
+        model_file_str = str(model_file)
+        if model_file_str.endswith((".pdmodel", ".pdiparams")):
+            # Static-graph export: strip the extension and hand the remaining
+            # path prefix to paddle.jit.load.
+            st_model_prefix = model_file_str.rsplit(".", 1)[0]
+            self.st_model = paddle.jit.load(st_model_prefix)
+        else:
+            # Dynamic-graph checkpoint: cast/reshape every loaded tensor so it
+            # matches the entry of the same name in the freshly built model.
+            load_state_dict = paddle.load(model_file_str)
+            # Query the model's state dict once instead of on every comparison.
+            model_state_dict = self.model.state_dict()
+            # Only existing keys are reassigned, so the dict is never resized.
+            for k in load_state_dict:
+                if k not in model_state_dict:
+                    # keys unknown to the model are left untouched
+                    continue
+                target = model_state_dict[k]
+                if load_state_dict[k].dtype != target.dtype:
+                    load_state_dict[k] = load_state_dict[k].astype(target.dtype)
+                if list(load_state_dict[k].shape) != list(target.shape):
+                    load_state_dict[k] = load_state_dict[k].reshape(target.shape)
+            self.model.set_state_dict(load_state_dict)
+        # NOTE: this message is printed for both branches.
+        print(f"==>> Load pretrained model successfully from: {model_file_str}")
         self.load_prefix = load_prefix
 
         # graph_compatable should be called after graph and prefix are set
-        if not self._graph_compatable():
-            raise RuntimeError(
-                f"model in graph (version {self.model_version}) is incompatible"
-                f"with the model (version {MODEL_VERSION}) supported by the current code."
-                "See https://deepmd.rtfd.io/compatability/ for details."
-            )
+        # if not self._graph_compatable():
+        #     raise RuntimeError(
+        #         f"model in graph (version {self.model_version}) is incompatible"
+        #         f"with the model (version {MODEL_VERSION}) supported by the current code."
+        #         "See https://deepmd.rtfd.io/compatability/ for details."
+        #     )
 
         # set default to False, as subclasses may not support
         if isinstance(auto_batch_size, bool):
@@ -82,13 +170,15 @@ def __init__(
     @property
     @lru_cache(maxsize=None)
     def model_type(self) -> str:
         """Get type of model.
 
         :type:str
         """
-        t_mt = self._get_tensor("model_attr/model_type:0")
-        [mt] = run_sess(self.sess, [t_mt], feed_dict={})
-        return mt.decode("utf-8")
+        # t_mt = self._get_tensor("model_attr/model_type:0")
+        # [mt] = run_sess(self.sess, [t_mt], feed_dict={})
+        # return mt.decode("utf-8")
+        # The type is fixed: DeepEval.__init__ always builds an EnerModel.
+        return "ener"
 
     @property
     @lru_cache(maxsize=None)
@@ -100,6 +190,7 @@ def model_version(self) -> str:
         str
             version of model
         """
+        return "0.1.0"
         try:
             t_mt = self._get_tensor("model_attr/model_version:0")
         except KeyError:
@@ -117,6 +208,7 @@ def sess(self) -> tf.Session:
         return tf.Session(graph=self.graph, config=default_tf_session_config)
 
     def _graph_compatable(self) -> bool:
+        return True
         """Check the model compatability.
 
         Returns
@@ -135,7 +227,7 @@ def _graph_compatable(self) -> bool:
         else:
             return True
 
-    def _get_tensor(
+    def _get_value(
         self, tensor_name: str, attr_name: Optional[str] = None
     ) -> tf.Tensor:
         """Get TF graph tensor and assign it to class namespace.
@@ -147,15 +239,12 @@ def _get_tensor(
         attr_name : Optional[str], optional
             if specified, class attribute with this name will be created and tensor will
             be assigned to it, by default None
-
-        Returns
-        -------
-        tf.Tensor
-            loaded tensor
         """
         # do not use os.path.join as it doesn't work on Windows
-        tensor_path = "/".join((self.load_prefix, tensor_name))
-        tensor = self.graph.get_tensor_by_name(tensor_path)
+        value = None
+        for name, tensor in self.model.named_buffers():
+            if tensor_name in name:
+                value = tensor.numpy()[0] if tensor.shape == [1] else tensor.numpy()
         if attr_name:
-            setattr(self, attr_name, tensor)
-            return tensor
+            setattr(self, attr_name, value)  # converted value, not the loop variable
+            return value
@@ -194,7 +283,15 @@ def _load_graph(
                         name=prefix,
                         producer_op_list=None,
                     )
-
+                #     with tf.Session() as sess:
+                #         constant_ops = [op for op in graph.get_operations() if op.type == "Const"]
+                #         for constant_op in constant_ops:
+                #             param = sess.run(constant_op.outputs[0])
+                #             # print(type(param))
+                #             if hasattr(param, 'shape'):
+                #                 # print(param.shape)
+                #                 if param.shape == (2,):
+                #                     print(constant_op.outputs[0], param)
             return graph
 
     @staticmethod
diff --git a/deepmd/infer/deep_pot.py b/deepmd/infer/deep_pot.py
index 10fed52497..377c776320 100644
--- a/deepmd/infer/deep_pot.py
+++ b/deepmd/infer/deep_pot.py
@@ -13,8 +13,8 @@
 from deepmd.common import (
     make_default_mesh,
 )
-from deepmd.infer.data_modifier import (
-    DipoleChargeModifier,
+from deepmd.env import (
+    paddle,
 )
 from deepmd.infer.deep_eval import (
     DeepEval,
@@ -81,125 +81,51 @@ def __init__(
         self.tensors = dict(
             {
                 # descrpt attrs
-                "t_ntypes": "descrpt_attr/ntypes:0",
-                "t_rcut": "descrpt_attr/rcut:0",
+                "ntypes": "descrpt.ntypes",
+                "rcut": "descrpt.rcut",
                 # fitting attrs
-                "t_dfparam": "fitting_attr/dfparam:0",
-                "t_daparam": "fitting_attr/daparam:0",
-                # model attrs
-                "t_tmap": "model_attr/tmap:0",
-                # inputs
-                "t_coord": "t_coord:0",
-                "t_type": "t_type:0",
-                "t_natoms": "t_natoms:0",
-                "t_box": "t_box:0",
-                "t_mesh": "t_mesh:0",
-                # add output tensors
-                "t_energy": "o_energy:0",
-                "t_force": "o_force:0",
-                "t_virial": "o_virial:0",
-                "t_ae": "o_atom_energy:0",
-                "t_av": "o_atom_virial:0",
-                "t_descriptor": "o_descriptor:0",
+                "dfparam": "fitting.t_dfparam",
+                "daparam": "fitting.t_daparam",
             },
         )
         DeepEval.__init__(
             self,
             model_file,
             load_prefix=load_prefix,
-            default_tf_graph=default_tf_graph,
+            # default_tf_graph=default_tf_graph,
             auto_batch_size=auto_batch_size,
         )
 
-        # load optional tensors
-        operations = [op.name for op in self.graph.get_operations()]
-        # check if the graph has these operations:
-        # if yes add them
-        if "t_efield" in operations:
-            self._get_tensor("t_efield:0", "t_efield")
-            self.has_efield = True
-        else:
-            log.debug("Could not get tensor 't_efield:0'")
-            self.t_efield = None
-            self.has_efield = False
-
-        if "load/t_fparam" in operations:
-            self.tensors.update({"t_fparam": "t_fparam:0"})
-            self.has_fparam = True
-        else:
-            log.debug("Could not get tensor 't_fparam:0'")
-            self.t_fparam = None
-            self.has_fparam = False
+        # # load optional tensors
+        self.has_efield = False
 
-        if "load/t_aparam" in operations:
-            self.tensors.update({"t_aparam": "t_aparam:0"})
-            self.has_aparam = True
-        else:
-            log.debug("Could not get tensor 't_aparam:0'")
-            self.t_aparam = None
-            self.has_aparam = False
+        self.has_fparam = False
 
-        if "load/spin_attr/ntypes_spin" in operations:
-            self.tensors.update({"t_ntypes_spin": "spin_attr/ntypes_spin:0"})
-            self.has_spin = True
-        else:
-            self.ntypes_spin = 0
-            self.has_spin = False
+        self.has_aparam = False
+        self.ntypes_spin = 0
+        self.has_spin = False
 
         # now load tensors to object attributes
         for attr_name, tensor_name in self.tensors.items():
             try:
-                self._get_tensor(tensor_name, attr_name)
+                self._get_value(tensor_name, attr_name)
             except KeyError:
                 if attr_name != "t_descriptor":
                     raise
 
-        self._run_default_sess()
-        self.tmap = self.tmap.decode("UTF-8").split()
+        self.ntypes = 2
+        self.rcut = 6.0
+        self.dfparam = 0
+        self.daparam = 0
+        self.t_tmap = ["O", "H"]
 
         # setup modifier
         try:
-            t_modifier_type = self._get_tensor("modifier_attr/type:0")
-            self.modifier_type = run_sess(self.sess, t_modifier_type).decode("UTF-8")
+            self.modifier_type = self._get_value("modifier_attr.type")
         except (ValueError, KeyError):
             self.modifier_type = None
-
-        try:
-            t_jdata = self._get_tensor("train_attr/training_script:0")
-            jdata = run_sess(self.sess, t_jdata).decode("UTF-8")
-            import json
-
-            jdata = json.loads(jdata)
-            self.descriptor_type = jdata["model"]["descriptor"]["type"]
-        except (ValueError, KeyError):
-            self.descriptor_type = None
-
-        if self.modifier_type == "dipole_charge":
-            t_mdl_name = self._get_tensor("modifier_attr/mdl_name:0")
-            t_mdl_charge_map = self._get_tensor("modifier_attr/mdl_charge_map:0")
-            t_sys_charge_map = self._get_tensor("modifier_attr/sys_charge_map:0")
-            t_ewald_h = self._get_tensor("modifier_attr/ewald_h:0")
-            t_ewald_beta = self._get_tensor("modifier_attr/ewald_beta:0")
-            [mdl_name, mdl_charge_map, sys_charge_map, ewald_h, ewald_beta] = run_sess(
-                self.sess,
-                [
-                    t_mdl_name,
-                    t_mdl_charge_map,
-                    t_sys_charge_map,
-                    t_ewald_h,
-                    t_ewald_beta,
-                ],
-            )
-            mdl_name = mdl_name.decode("UTF-8")
-            mdl_charge_map = [int(ii) for ii in mdl_charge_map.decode("UTF-8").split()]
-            sys_charge_map = [int(ii) for ii in sys_charge_map.decode("UTF-8").split()]
-            self.dm = DipoleChargeModifier(
-                mdl_name,
-                mdl_charge_map,
-                sys_charge_map,
-                ewald_h=ewald_h,
-                ewald_beta=ewald_beta,
-            )
+        self.modifier_type = None
+        self.descriptor_type = "se_e2_a"
 
     def _run_default_sess(self):
         if self.has_spin is True:
@@ -247,7 +173,7 @@ def get_rcut(self) -> float:
 
     def get_type_map(self) -> List[str]:
         """Get the type map (element name of the atom types) of this model."""
-        return self.tmap
+        return self.t_tmap
 
     def get_sel_type(self) -> List[int]:
         """Unsupported in this model."""
@@ -259,11 +185,11 @@ def get_descriptor_type(self) -> List[int]:
 
     def get_dim_fparam(self) -> int:
         """Get the number (dimension) of frame parameters of this DP."""
-        return self.dfparam
+        return self.model.fitting.numb_fparam
 
     def get_dim_aparam(self) -> int:
         """Get the number (dimension) of atomic parameters of this DP."""
-        return self.daparam
+        return self.model.fitting.numb_aparam
 
     def _eval_func(self, inner_func: Callable, numb_test: int, natoms: int) -> Callable:
         """Wrapper method with auto batch size.
@@ -369,7 +295,7 @@ def eval(
         # reshape coords before getting shape
         natoms, numb_test = self._get_natoms_and_nframes(
             coords, atom_types, mixed_type=mixed_type
-        )
+        )  # 192, 30
         output = self._eval_func(self._eval_inner, numb_test, natoms)(
             coords,
             cells,
@@ -460,46 +386,20 @@ def _prepare_feed_dict(
                 )
 
         # sort inputs
-        coords, atom_types, imap = self.sort_input(
-            coords, atom_types, mixed_type=mixed_type
-        )
-        if self.has_efield:
-            efield = np.reshape(efield, [nframes, natoms, 3])
-            efield = efield[:, imap, :]
-            efield = np.reshape(efield, [nframes, natoms * 3])
+        # coords, atom_types, imap = self.sort_input(
+        #     coords, atom_types, mixed_type=mixed_type
+        # )
+        # if self.has_efield:
+        #     efield = np.reshape(efield, [nframes, natoms, 3])
+        #     efield = efield[:, imap, :]
+        #     efield = np.reshape(efield, [nframes, natoms * 3])
 
         # make natoms_vec and default_mesh
         natoms_vec = self.make_natoms_vec(atom_types, mixed_type=mixed_type)
         assert natoms_vec[0] == natoms
 
         # evaluate
-        feed_dict_test = {}
-        feed_dict_test[self.t_natoms] = natoms_vec
-        if mixed_type:
-            feed_dict_test[self.t_type] = atom_types.reshape([-1])
-        else:
-            feed_dict_test[self.t_type] = np.tile(atom_types, [nframes, 1]).reshape(
-                [-1]
-            )
-        feed_dict_test[self.t_coord] = np.reshape(coords, [-1])
-
-        if len(self.t_box.shape) == 1:
-            feed_dict_test[self.t_box] = np.reshape(cells, [-1])
-        elif len(self.t_box.shape) == 2:
-            feed_dict_test[self.t_box] = cells
-        else:
-            raise RuntimeError
-        if self.has_efield:
-            feed_dict_test[self.t_efield] = np.reshape(efield, [-1])
-        if pbc:
-            feed_dict_test[self.t_mesh] = make_default_mesh(cells)
-        else:
-            feed_dict_test[self.t_mesh] = np.array([], dtype=np.int32)
-        if self.has_fparam:
-            feed_dict_test[self.t_fparam] = np.reshape(fparam, [-1])
-        if self.has_aparam:
-            feed_dict_test[self.t_aparam] = np.reshape(aparam, [-1])
-        return feed_dict_test, imap, natoms_vec
+        return None, None, natoms_vec
 
     def _eval_inner(
         self,
@@ -519,41 +419,66 @@ def _eval_inner(
             coords, cells, atom_types, fparam, aparam, efield, mixed_type=mixed_type
         )
 
-        t_out = [self.t_energy, self.t_force, self.t_virial]
-        if atomic:
-            t_out += [self.t_ae, self.t_av]
+        eval_inputs = {}
+        eval_inputs["coord"] = paddle.to_tensor(
+            np.reshape(coords, [-1]), dtype="float64"
+        )
+        eval_inputs["type"] = paddle.to_tensor(
+            np.tile(atom_types, [nframes, 1]).reshape([-1]), dtype="int32"
+        )
+        eval_inputs["natoms_vec"] = paddle.to_tensor(
+            natoms_vec, dtype="int32", place="cpu"
+        )
+        eval_inputs["box"] = paddle.to_tensor(np.reshape(cells, [-1]), dtype="float64")
 
-        v_out = run_sess(self.sess, t_out, feed_dict=feed_dict_test)
-        energy = v_out[0]
-        force = v_out[1]
-        virial = v_out[2]
-        if atomic:
-            ae = v_out[3]
-            av = v_out[4]
+        if self.has_fparam:
+            eval_inputs["fparam"] = paddle.to_tensor(
+                np.reshape(fparam, [-1]), dtype="float64"
+            )
+        if self.has_aparam:
+            eval_inputs["aparam"] = paddle.to_tensor(
+                np.reshape(aparam, [-1]), dtype="float64"
+            )
+        eval_inputs["default_mesh"] = paddle.to_tensor(
+            make_default_mesh(cells), dtype="int32"
+        )
 
-        if self.has_spin:
-            ntypes_real = self.ntypes - self.ntypes_spin
-            natoms_real = sum(
-                [
-                    np.count_nonzero(np.array(atom_types) == ii)
-                    for ii in range(ntypes_real)
-                ]
+        if hasattr(self, "st_model"):
+            # NOTE: run inference with the static-graph model
+            eval_outputs = self.st_model(
+                eval_inputs["coord"],
+                eval_inputs["type"],
+                eval_inputs["natoms_vec"],
+                eval_inputs["box"],
+                eval_inputs["default_mesh"],
             )
+            eval_outputs = {
+                "atom_ener": eval_outputs[0],
+                "atom_virial": eval_outputs[1],
+                "atype": eval_outputs[2],
+                "coord": eval_outputs[3],
+                "energy": eval_outputs[4],
+                "force": eval_outputs[5],
+                "virial": eval_outputs[6],
+            }
         else:
-            natoms_real = natoms
-
-        # reverse map of the outputs
-        force = self.reverse_map(np.reshape(force, [nframes, -1, 3]), imap)
-        if atomic:
-            ae = self.reverse_map(np.reshape(ae, [nframes, -1, 1]), imap[:natoms_real])
-            av = self.reverse_map(np.reshape(av, [nframes, -1, 9]), imap)
-
-        energy = np.reshape(energy, [nframes, 1])
-        force = np.reshape(force, [nframes, natoms, 3])
-        virial = np.reshape(virial, [nframes, 9])
+            # NOTE: run inference with the dynamic-graph model
+            eval_outputs = self.model(
+                eval_inputs["coord"],
+                eval_inputs["type"],
+                eval_inputs["natoms_vec"],
+                eval_inputs["box"],
+                eval_inputs["default_mesh"],
+                eval_inputs,
+                suffix="",
+                reuse=False,
+            )
+        energy = eval_outputs["energy"].numpy()
+        force = eval_outputs["force"].numpy()
+        virial = eval_outputs["virial"].numpy()
         if atomic:
-            ae = np.reshape(ae, [nframes, natoms_real, 1])
-            av = np.reshape(av, [nframes, natoms, 9])
+            ae = eval_outputs["atom_ener"].numpy()
+            av = eval_outputs["atom_virial"].numpy()
             return energy, force, virial, ae, av
         else:
             return energy, force, virial
diff --git a/deepmd/loss/ener.py b/deepmd/loss/ener.py
index 07c97b09bc..d11177ee3a 100644
--- a/deepmd/loss/ener.py
+++ b/deepmd/loss/ener.py
@@ -10,6 +10,7 @@
 from deepmd.env import (
     global_cvt_2_ener_float,
     global_cvt_2_tf_float,
+    paddle,
     tf,
 )
 from deepmd.utils.sess import (
@@ -82,11 +83,12 @@ def __init__(
                 default=1.0,
             )
 
-    def build(self, learning_rate, natoms, model_dict, label_dict, suffix):
+    def compute_loss(self, learning_rate, natoms, model_dict, label_dict, suffix):
         energy = model_dict["energy"]
         force = model_dict["force"]
         virial = model_dict["virial"]
         atom_ener = model_dict["atom_ener"]
+
         energy_hat = label_dict["energy"]
         force_hat = label_dict["force"]
         virial_hat = label_dict["virial"]
@@ -108,152 +110,163 @@ def build(self, learning_rate, natoms, model_dict, label_dict, suffix):
             # E = - E(A) - E(B) + E(C) + E(D)
             # A, B, C, D could be put far away from each other
             atom_ener_coeff = label_dict["atom_ener_coeff"]
-            atom_ener_coeff = tf.reshape(atom_ener_coeff, tf.shape(atom_ener))
-            energy = tf.reduce_sum(atom_ener_coeff * atom_ener, 1)
+            atom_ener_coeff = paddle.reshape(atom_ener_coeff, paddle.shape(atom_ener))
+            energy = paddle.sum(atom_ener_coeff * atom_ener, axis=1)
         if self.has_e:
-            l2_ener_loss = tf.reduce_mean(
-                tf.square(energy - energy_hat), name="l2_" + suffix
+            l2_ener_loss = paddle.mean(
+                paddle.square(energy - energy_hat), name="l2_" + suffix
             )
 
         if self.has_f or self.has_pf or self.relative_f:
-            force_reshape = tf.reshape(force, [-1])
-            force_hat_reshape = tf.reshape(force_hat, [-1])
+            force_reshape = paddle.reshape(force, [-1])
+            force_hat_reshape = paddle.reshape(force_hat, [-1])
             diff_f = force_hat_reshape - force_reshape
 
         if self.relative_f is not None:
-            force_hat_3 = tf.reshape(force_hat, [-1, 3])
-            norm_f = tf.reshape(tf.norm(force_hat_3, axis=1), [-1, 1]) + self.relative_f
-            diff_f_3 = tf.reshape(diff_f, [-1, 3])
+            force_hat_3 = paddle.reshape(force_hat, [-1, 3])
+            norm_f = (
+                paddle.reshape(paddle.linalg.norm(force_hat_3, axis=1), [-1, 1])
+                + self.relative_f
+            )
+            diff_f_3 = paddle.reshape(diff_f, [-1, 3])
             diff_f_3 = diff_f_3 / norm_f
-            diff_f = tf.reshape(diff_f_3, [-1])
+            diff_f = paddle.reshape(diff_f_3, [-1])
 
         if self.has_f:
-            l2_force_loss = tf.reduce_mean(tf.square(diff_f), name="l2_force_" + suffix)
+            l2_force_loss = paddle.mean(
+                paddle.square(diff_f), name="l2_force_" + suffix
+            )
 
         if self.has_pf:
-            atom_pref_reshape = tf.reshape(atom_pref, [-1])
-            l2_pref_force_loss = tf.reduce_mean(
-                tf.multiply(tf.square(diff_f), atom_pref_reshape),
+            atom_pref_reshape = paddle.reshape(atom_pref, [-1])
+            l2_pref_force_loss = paddle.mean(
+                paddle.multiply(paddle.square(diff_f), atom_pref_reshape),
                 name="l2_pref_force_" + suffix,
             )
 
         if self.has_v:
-            virial_reshape = tf.reshape(virial, [-1])
-            virial_hat_reshape = tf.reshape(virial_hat, [-1])
-            l2_virial_loss = tf.reduce_mean(
-                tf.square(virial_hat_reshape - virial_reshape),
+            virial_reshape = paddle.reshape(virial, [-1])
+            virial_hat_reshape = paddle.reshape(virial_hat, [-1])
+            l2_virial_loss = paddle.mean(
+                paddle.square(virial_hat_reshape - virial_reshape),
                 name="l2_virial_" + suffix,
             )
 
         if self.has_ae:
-            atom_ener_reshape = tf.reshape(atom_ener, [-1])
-            atom_ener_hat_reshape = tf.reshape(atom_ener_hat, [-1])
-            l2_atom_ener_loss = tf.reduce_mean(
-                tf.square(atom_ener_hat_reshape - atom_ener_reshape),
+            atom_ener_reshape = paddle.reshape(atom_ener, [-1])
+            atom_ener_hat_reshape = paddle.reshape(atom_ener_hat, [-1])
+            l2_atom_ener_loss = paddle.mean(
+                paddle.square(atom_ener_hat_reshape - atom_ener_reshape),
                 name="l2_atom_ener_" + suffix,
             )
 
-        atom_norm = 1.0 / global_cvt_2_tf_float(natoms[0])
-        atom_norm_ener = 1.0 / global_cvt_2_ener_float(natoms[0])
-        pref_e = global_cvt_2_ener_float(
-            find_energy
-            * (
-                self.limit_pref_e
-                + (self.start_pref_e - self.limit_pref_e)
-                * learning_rate
-                / self.starter_learning_rate
-            )
-        )
-        pref_f = global_cvt_2_tf_float(
-            find_force
-            * (
-                self.limit_pref_f
-                + (self.start_pref_f - self.limit_pref_f)
-                * learning_rate
-                / self.starter_learning_rate
-            )
-        )
-        pref_v = global_cvt_2_tf_float(
-            find_virial
-            * (
-                self.limit_pref_v
-                + (self.start_pref_v - self.limit_pref_v)
-                * learning_rate
-                / self.starter_learning_rate
-            )
-        )
-        pref_ae = global_cvt_2_tf_float(
-            find_atom_ener
-            * (
-                self.limit_pref_ae
-                + (self.start_pref_ae - self.limit_pref_ae)
-                * learning_rate
-                / self.starter_learning_rate
-            )
-        )
-        pref_pf = global_cvt_2_tf_float(
-            find_atom_pref
-            * (
-                self.limit_pref_pf
-                + (self.start_pref_pf - self.limit_pref_pf)
-                * learning_rate
-                / self.starter_learning_rate
-            )
+        atom_norm = 1.0 / (natoms[0])
+        atom_norm_ener = 1.0 / (natoms[0])
+        pref_e = find_energy * (
+            self.limit_pref_e
+            + (self.start_pref_e - self.limit_pref_e)
+            * learning_rate
+            / self.starter_learning_rate
+        )
+        pref_f = find_force * (
+            self.limit_pref_f
+            + (self.start_pref_f - self.limit_pref_f)
+            * learning_rate
+            / self.starter_learning_rate
+        )
+        pref_v = find_virial * (
+            self.limit_pref_v
+            + (self.start_pref_v - self.limit_pref_v)
+            * learning_rate
+            / self.starter_learning_rate
+        )
+        pref_ae = find_atom_ener * (
+            self.limit_pref_ae
+            + (self.start_pref_ae - self.limit_pref_ae)
+            * learning_rate
+            / self.starter_learning_rate
+        )
+        pref_pf = find_atom_pref * (
+            self.limit_pref_pf
+            + (self.start_pref_pf - self.limit_pref_pf)
+            * learning_rate
+            / self.starter_learning_rate
         )
 
         l2_loss = 0
         more_loss = {}
-        if self.has_e:
+        if self.has_e:  # true
             l2_loss += atom_norm_ener * (pref_e * l2_ener_loss)
             more_loss["l2_ener_loss"] = l2_ener_loss
-        if self.has_f:
-            l2_loss += global_cvt_2_ener_float(pref_f * l2_force_loss)
+        if self.has_f:  # true
+            l2_loss += pref_f * l2_force_loss
             more_loss["l2_force_loss"] = l2_force_loss
-        if self.has_v:
-            l2_loss += global_cvt_2_ener_float(atom_norm * (pref_v * l2_virial_loss))
+        if self.has_v:  # false
+            l2_loss += atom_norm * (pref_v * l2_virial_loss)
             more_loss["l2_virial_loss"] = l2_virial_loss
-        if self.has_ae:
-            l2_loss += global_cvt_2_ener_float(pref_ae * l2_atom_ener_loss)
+        if self.has_ae:  # false
+            l2_loss += pref_ae * l2_atom_ener_loss
             more_loss["l2_atom_ener_loss"] = l2_atom_ener_loss
-        if self.has_pf:
-            l2_loss += global_cvt_2_ener_float(pref_pf * l2_pref_force_loss)
+        if self.has_pf:  # false
+            l2_loss += pref_pf * l2_pref_force_loss
             more_loss["l2_pref_force_loss"] = l2_pref_force_loss
 
-        # only used when tensorboard was set as true
-        self.l2_loss_summary = tf.summary.scalar("l2_loss_" + suffix, tf.sqrt(l2_loss))
-        if self.has_e:
-            self.l2_loss_ener_summary = tf.summary.scalar(
-                "l2_ener_loss_" + suffix,
-                global_cvt_2_tf_float(tf.sqrt(l2_ener_loss))
-                / global_cvt_2_tf_float(natoms[0]),
-            )
-        if self.has_f:
-            self.l2_loss_force_summary = tf.summary.scalar(
-                "l2_force_loss_" + suffix, tf.sqrt(l2_force_loss)
-            )
-        if self.has_v:
-            self.l2_loss_virial_summary = tf.summary.scalar(
-                "l2_virial_loss_" + suffix,
-                tf.sqrt(l2_virial_loss) / global_cvt_2_tf_float(natoms[0]),
-            )
-
         self.l2_l = l2_loss
         self.l2_more = more_loss
         return l2_loss, more_loss
 
-    def eval(self, sess, feed_dict, natoms):
-        placeholder = self.l2_l
+    def eval(self, model, batch_data, natoms):
+        # placeholder = self.l2_l
+
+        model_inputs = {}
+        for kk in batch_data.keys():
+            if kk == "find_type" or kk == "type":
+                continue
+            prec = "float64"
+            if "find_" in kk:
+                model_inputs[kk] = paddle.to_tensor(batch_data[kk], dtype="float64")
+            else:
+                model_inputs[kk] = paddle.to_tensor(
+                    np.reshape(batch_data[kk], [-1]), dtype=prec
+                )
+
+        for ii in ["type"]:
+            model_inputs[ii] = paddle.to_tensor(
+                np.reshape(batch_data[ii], [-1]), dtype="int32"
+            )
+        for ii in ["natoms_vec", "default_mesh"]:
+            model_inputs[ii] = paddle.to_tensor(batch_data[ii], dtype="int32")
+        model_inputs["is_training"] = paddle.to_tensor(False)
+        model_inputs["natoms_vec"] = paddle.to_tensor(
+            model_inputs["natoms_vec"], place="cpu"
+        )
+
+        model_pred = model(
+            model_inputs["coord"],
+            model_inputs["type"],
+            model_inputs["natoms_vec"],
+            model_inputs["box"],
+            model_inputs["default_mesh"],
+            model_inputs,
+            suffix="",
+            reuse=False,
+        )
+        l2_l, l2_more = self.compute_loss(
+            0.0,
+            model_inputs["natoms_vec"],
+            model_pred,
+            model_inputs,
+            suffix="test",
+        )
         run_data = [
-            self.l2_l,
-            self.l2_more["l2_ener_loss"] if self.has_e else placeholder,
-            self.l2_more["l2_force_loss"] if self.has_f else placeholder,
-            self.l2_more["l2_virial_loss"] if self.has_v else placeholder,
-            self.l2_more["l2_atom_ener_loss"] if self.has_ae else placeholder,
-            self.l2_more["l2_pref_force_loss"] if self.has_pf else placeholder,
+            (float(l2_l)),
+            (float(l2_more["l2_ener_loss"]) if self.has_e else 0.0),
+            (float(l2_more["l2_force_loss"]) if self.has_f else 0.0),
+            (float(l2_more["l2_virial_loss"]) if self.has_v else 0.0),
+            (float(l2_more["l2_atom_ener_loss"]) if self.has_ae else 0.0),
+            (float(l2_more["l2_pref_force_loss"]) if self.has_pf else 0.0),
         ]
-        error, error_e, error_f, error_v, error_ae, error_pf = run_sess(
-            sess, run_data, feed_dict=feed_dict
-        )
+        error, error_e, error_f, error_v, error_ae, error_pf = run_data
         results = {"natoms": natoms[0], "rmse": np.sqrt(error)}
         if self.has_e:
             results["rmse_e"] = np.sqrt(error_e) / natoms[0]
diff --git a/deepmd/loss/loss.py b/deepmd/loss/loss.py
index f666445e6e..423d9e118d 100644
--- a/deepmd/loss/loss.py
+++ b/deepmd/loss/loss.py
@@ -15,7 +15,7 @@
 class Loss(metaclass=ABCMeta):
     """The abstract class for the loss function."""
 
-    @abstractmethod
+    # @abstractmethod
     def build(
         self,
         learning_rate: tf.Tensor,
@@ -46,6 +46,7 @@ def build(
         dict[str, tf.Tensor]
             A dictionary that maps loss keys to more loss tensors
         """
+        pass
 
     @abstractmethod
     def eval(
diff --git a/deepmd/model/ener.py b/deepmd/model/ener.py
index f9387c67fc..21c5ec6ee0 100644
--- a/deepmd/model/ener.py
+++ b/deepmd/model/ener.py
@@ -1,14 +1,12 @@
 from typing import (
+    TYPE_CHECKING,
     List,
     Optional,
 )
 
-import numpy as np
-
 from deepmd.env import (
     MODEL_VERSION,
-    global_cvt_2_ener_float,
-    op_module,
+    paddle,
     tf,
 )
 from deepmd.utils.pair_tab import (
@@ -26,8 +24,13 @@
     merge_sys_stat,
 )
 
+if TYPE_CHECKING:
+    from deepmd.fit import (
+        ener,
+    )
+
 
-class EnerModel(Model):
+class EnerModel(Model, paddle.nn.Layer):
     """Energy model.
 
     Parameters
@@ -58,7 +61,7 @@ class EnerModel(Model):
     def __init__(
         self,
         descrpt,
-        fitting,
+        fitting: "ener.EnerFitting",
         typeebd=None,
         type_map: Optional[List[str]] = None,
         data_stat_nbatch: int = 10,
@@ -70,6 +73,7 @@ def __init__(
         spin: Optional[Spin] = None,
     ) -> None:
         """Constructor."""
+        super().__init__()
         # descriptor
         self.descrpt = descrpt
         self.rcut = self.descrpt.get_rcut()
@@ -97,6 +101,22 @@ def __init__(
         else:
             self.srtab = None
 
+        self.t_tmap = " ".join(self.type_map)
+        self.t_mt = self.model_type
+        self.t_ver = str(MODEL_VERSION)
+        # NOTE: workaround for string type is not supported in Paddle
+        self.register_buffer(
+            "buffer_t_type",
+            paddle.to_tensor([ord(c) for c in self.t_tmap], dtype="int32"),
+        )
+        self.register_buffer(
+            "buffer_t_mt", paddle.to_tensor([ord(c) for c in self.t_mt], dtype="int32")
+        )
+        self.register_buffer(
+            "buffer_t_ver",
+            paddle.to_tensor([ord(c) for c in self.t_ver], dtype="int32"),
+        )
+
     def get_rcut(self):
         return self.rcut
 
@@ -113,7 +133,6 @@ def data_stat(self, data):
             m_all_stat, protection=self.data_stat_protect, mixed_type=data.mixed_type
         )
         self._compute_output_stat(all_stat, mixed_type=data.mixed_type)
-        # self.bias_atom_e = data.compute_energy_shift(self.rcond)
 
     def _compute_input_stat(self, all_stat, protection=1e-2, mixed_type=False):
         if mixed_type:
@@ -144,7 +163,7 @@ def _compute_output_stat(self, all_stat, mixed_type=False):
         else:
             self.fitting.compute_output_stats(all_stat)
 
-    def build(
+    def forward(
         self,
         coord_,
         atype_,
@@ -159,158 +178,56 @@ def build(
     ):
         if input_dict is None:
             input_dict = {}
-        with tf.variable_scope("model_attr" + suffix, reuse=reuse):
-            t_tmap = tf.constant(" ".join(self.type_map), name="tmap", dtype=tf.string)
-            t_mt = tf.constant(self.model_type, name="model_type", dtype=tf.string)
-            t_ver = tf.constant(MODEL_VERSION, name="model_version", dtype=tf.string)
-
-            if self.srtab is not None:
-                tab_info, tab_data = self.srtab.get()
-                self.tab_info = tf.get_variable(
-                    "t_tab_info",
-                    tab_info.shape,
-                    dtype=tf.float64,
-                    trainable=False,
-                    initializer=tf.constant_initializer(tab_info, dtype=tf.float64),
-                )
-                self.tab_data = tf.get_variable(
-                    "t_tab_data",
-                    tab_data.shape,
-                    dtype=tf.float64,
-                    trainable=False,
-                    initializer=tf.constant_initializer(tab_data, dtype=tf.float64),
-                )
-
-        coord = tf.reshape(coord_, [-1, natoms[1] * 3])
-        atype = tf.reshape(atype_, [-1, natoms[1]])
-        input_dict["nframes"] = tf.shape(coord)[0]
 
-        # type embedding if any
-        if self.typeebd is not None:
-            type_embedding = self.typeebd.build(
-                self.ntypes,
-                reuse=reuse,
-                suffix=suffix,
-            )
-            input_dict["type_embedding"] = type_embedding
-        # spin if any
-        if self.spin is not None:
-            type_spin = self.spin.build(
-                reuse=reuse,
-                suffix=suffix,
-            )
+        coord = paddle.reshape(coord_, [-1, natoms[1] * 3])
+        atype = paddle.reshape(atype_, [-1, natoms[1]])
         input_dict["atype"] = atype_
 
-        dout = self.build_descrpt(
+        dout = self.descrpt(
             coord,
             atype,
             natoms,
             box,
             mesh,
             input_dict,
-            frz_model=frz_model,
-            ckpt_meta=ckpt_meta,
             suffix=suffix,
             reuse=reuse,
         )
 
-        if self.srtab is not None:
-            nlist, rij, sel_a, sel_r = self.descrpt.get_nlist()
-            nnei_a = np.cumsum(sel_a)[-1]
-            nnei_r = np.cumsum(sel_r)[-1]
-
-        atom_ener = self.fitting.build(
-            dout, natoms, input_dict, reuse=reuse, suffix=suffix
-        )
+        atom_ener = self.fitting(dout, natoms, input_dict, reuse=reuse, suffix=suffix)
         self.atom_ener = atom_ener
 
-        if self.srtab is not None:
-            sw_lambda, sw_deriv = op_module.soft_min_switch(
-                atype,
-                rij,
-                nlist,
-                natoms,
-                sel_a=sel_a,
-                sel_r=sel_r,
-                alpha=self.smin_alpha,
-                rmin=self.sw_rmin,
-                rmax=self.sw_rmax,
-            )
-            inv_sw_lambda = 1.0 - sw_lambda
-            # NOTICE:
-            # atom energy is not scaled,
-            # force and virial are scaled
-            tab_atom_ener, tab_force, tab_atom_virial = op_module.pair_tab(
-                self.tab_info,
-                self.tab_data,
-                atype,
-                rij,
-                nlist,
-                natoms,
-                sw_lambda,
-                sel_a=sel_a,
-                sel_r=sel_r,
-            )
-            energy_diff = tab_atom_ener - tf.reshape(atom_ener, [-1, natoms[0]])
-            tab_atom_ener = tf.reshape(sw_lambda, [-1]) * tf.reshape(
-                tab_atom_ener, [-1]
-            )
-            atom_ener = tf.reshape(inv_sw_lambda, [-1]) * atom_ener
-            energy_raw = tab_atom_ener + atom_ener
-        else:
-            energy_raw = atom_ener
+        energy_raw = atom_ener
 
         nloc_atom = (
             natoms[0]
             if self.spin is None
-            else tf.reduce_sum(natoms[2 : 2 + len(self.spin.use_spin)])
+            else paddle.sum(natoms[2 : 2 + len(self.spin.use_spin)]).item()
         )
-        energy_raw = tf.reshape(
+        energy_raw = paddle.reshape(
             energy_raw, [-1, nloc_atom], name="o_atom_energy" + suffix
         )
-        energy = tf.reduce_sum(
-            global_cvt_2_ener_float(energy_raw), axis=1, name="o_energy" + suffix
-        )
+        energy = paddle.sum(energy_raw, axis=1, name="o_energy" + suffix)
 
         force, virial, atom_virial = self.descrpt.prod_force_virial(atom_ener, natoms)
+        # force: [1, all_atoms*3]
+        # virial: [1, 9]
+        # atom_virial: [1, all_atoms*9]
 
-        if self.srtab is not None:
-            sw_force = op_module.soft_min_force(
-                energy_diff, sw_deriv, nlist, natoms, n_a_sel=nnei_a, n_r_sel=nnei_r
-            )
-            force = force + sw_force + tab_force
-
-        force = tf.reshape(force, [-1, 3 * natoms[1]])
+        force = paddle.reshape(force, [-1, 3 * natoms[1]])
         if self.spin is not None:
             # split and concatenate force to compute local atom force and magnetic force
-            judge = tf.equal(natoms[0], natoms[1])
-            force = tf.cond(
+            judge = paddle.equal(natoms[0], natoms[1])
+            force = paddle.where(
                 judge,
-                lambda: self.natoms_match(force, natoms),
-                lambda: self.natoms_not_match(force, natoms, atype),
+                self.natoms_match(force, natoms),
+                self.natoms_not_match(force, natoms, atype),
             )
 
-        force = tf.reshape(force, [-1, 3 * natoms[1]], name="o_force" + suffix)
-
-        if self.srtab is not None:
-            sw_virial, sw_atom_virial = op_module.soft_min_virial(
-                energy_diff,
-                sw_deriv,
-                rij,
-                nlist,
-                natoms,
-                n_a_sel=nnei_a,
-                n_r_sel=nnei_r,
-            )
-            atom_virial = atom_virial + sw_atom_virial + tab_atom_virial
-            virial = (
-                virial
-                + sw_virial
-                + tf.reduce_sum(tf.reshape(tab_atom_virial, [-1, natoms[1], 9]), axis=1)
-            )
+        force = paddle.reshape(force, [-1, 3 * natoms[1]], name="o_force" + suffix)
 
-        virial = tf.reshape(virial, [-1, 9], name="o_virial" + suffix)
-        atom_virial = tf.reshape(
+        virial = paddle.reshape(virial, [-1, 9], name="o_virial" + suffix)
+        atom_virial = paddle.reshape(
             atom_virial, [-1, 9 * natoms[1]], name="o_atom_virial" + suffix
         )
 
@@ -322,7 +239,6 @@ def build(
         model_dict["atom_virial"] = atom_virial
         model_dict["coord"] = coord
         model_dict["atype"] = atype
-
         return model_dict
 
     def init_variables(
diff --git a/deepmd/model/model.py b/deepmd/model/model.py
index 8e6ffad910..7c07e1d987 100644
--- a/deepmd/model/model.py
+++ b/deepmd/model/model.py
@@ -1,6 +1,5 @@
 from abc import (
     ABC,
-    abstractmethod,
 )
 from enum import (
     Enum,
@@ -21,7 +20,7 @@
 
 
 class Model(ABC):
-    @abstractmethod
+    # @abstractmethod
     def build(
         self,
         coord_: tf.Tensor,
diff --git a/deepmd/train/trainer.py b/deepmd/train/trainer.py
index 580d434533..cc46dc9801 100644
--- a/deepmd/train/trainer.py
+++ b/deepmd/train/trainer.py
@@ -1,23 +1,16 @@
 #!/usr/bin/env python3
-import glob
 import logging
 import os
-import platform
-import shutil
 import time
 from typing import (
     Dict,
     List,
 )
 
-import google.protobuf.message
 import numpy as np
 from packaging.version import (
     Version,
 )
-from tensorflow.python.client import (
-    timeline,
-)
 
 # load grad of force module
 import deepmd.op  # noqa: F401
@@ -26,19 +19,17 @@
     get_precision,
     j_must_have,
 )
-from deepmd.descriptor.descriptor import (
-    Descriptor,
-)
 from deepmd.env import (
     GLOBAL_ENER_FLOAT_PRECISION,
     GLOBAL_TF_FLOAT_PRECISION,
     TF_VERSION,
     get_tf_session_config,
+    paddle,
     tf,
-    tfv2,
 )
 from deepmd.fit import (
     Fitting,
+    ener,
 )
 from deepmd.loss import (
     DOSLoss,
@@ -54,7 +45,6 @@
     MultiModel,
     PolarModel,
 )
-from deepmd.utils import random as dp_random
 from deepmd.utils.argcheck import (
     type_embedding_args,
 )
@@ -62,7 +52,6 @@
     DeepmdDataSystem,
 )
 from deepmd.utils.errors import (
-    GraphTooLargeError,
     GraphWithoutTensorError,
 )
 from deepmd.utils.graph import (
@@ -158,7 +147,8 @@ def _init_param(self, jdata):
             descrpt_param["multi_task"] = True
         if descrpt_param["type"] in ["se_e2_a", "se_a", "se_e2_r", "se_r", "hybrid"]:
             descrpt_param["spin"] = self.spin
-        self.descrpt = Descriptor(**descrpt_param)
+        descrpt_param.pop("type")
+        self.descrpt = deepmd.descriptor.se_a.DescrptSeA(**descrpt_param)
 
         # fitting net
         if not self.multi_task_mode:
@@ -167,7 +157,8 @@ def _init_param(self, jdata):
             fitting_param["descrpt"] = self.descrpt
             if fitting_type == "ener":
                 fitting_param["spin"] = self.spin
-            self.fitting = Fitting(**fitting_param)
+                fitting_param.pop("type")
+            self.fitting = ener.EnerFitting(**fitting_param)
         else:
             self.fitting_dict = {}
             self.fitting_type_dict = {}
@@ -316,7 +307,7 @@ def get_lr_and_coef(lr_param):
 
         # loss
         # infer loss type by fitting_type
-        def loss_init(_loss_param, _fitting_type, _fitting, _lr):
+        def loss_init(_loss_param, _fitting_type, _fitting, _lr) -> EnerStdLoss:
             _loss_type = _loss_param.get("type", "ener")
             if _fitting_type == "ener":
                 _loss_param.pop("type", None)
@@ -576,10 +567,10 @@ def build(self, data=None, stop_batch=0, origin_type_map=None, suffix=""):
             # for fparam or aparam settings in 'ener' type fitting net
             self.fitting.init_variables(graph, graph_def)
 
-        if self.is_compress or self.model_type == "compressed_model":
-            tf.constant("compressed_model", name="model_type", dtype=tf.string)
-        else:
-            tf.constant("original_model", name="model_type", dtype=tf.string)
+        # if self.is_compress or self.model_type == "compressed_model":
+        #     tf.constant("compressed_model", name="model_type", dtype=tf.string)
+        # else:
+        #     tf.constant("original_model", name="model_type", dtype=tf.string)
 
         if self.mixed_prec is not None:
             self.descrpt.enable_mixed_precision(self.mixed_prec)
@@ -593,17 +584,17 @@ def build(self, data=None, stop_batch=0, origin_type_map=None, suffix=""):
 
         self._build_lr()
         self._build_network(data, suffix)
-        self._build_training()
+        # self._build_training()
 
     def _build_lr(self):
-        self._extra_train_ops = []
-        self.global_step = tf.train.get_or_create_global_step()
+        # Paddle port: the step counter is a plain int, not a TF variable.
+        self.global_step = 0
         if not self.multi_task_mode:
             self.learning_rate = self.lr.build(self.global_step, self.stop_batch)
         else:
             self.learning_rate_dict = {}
             for fitting_key in self.fitting_type_dict:
                 self.learning_rate_dict[fitting_key] = self.lr_dict[fitting_key].build(
                     self.global_step, self.stop_batch
                 )
 
@@ -678,7 +669,7 @@ def _build_network(self, data, suffix=""):
             reuse=False,
         )
 
-        self.l2_l, self.l2_more = self._build_loss()
+        # self.l2_l, self.l2_more = self._build_loss()
 
         log.info("built network")
 
@@ -813,12 +804,12 @@ def _init_session(self):
                 log.info("receive global variables from task#0")
             run_sess(self.sess, bcast_op)
 
-    def train(self, train_data=None, valid_data=None):
+    def train(self, train_data=None, valid_data=None, stop_batch: int = 10):
         # if valid_data is None:  # no validation set specified.
         #     valid_data = train_data  # using training set as validation set.
 
-        stop_batch = self.stop_batch
-        self._init_session()
+        # stop_batch = self.stop_batch
+        # self._init_session()
 
         # Before data shard is enabled, only cheif do evaluation and record it
         # self.print_head()
@@ -826,15 +817,18 @@ def train(self, train_data=None, valid_data=None):
         if self.run_opt.is_chief:
             fp = open(self.disp_file, "a")
 
-        cur_batch = run_sess(self.sess, self.global_step)
+        cur_batch = self.global_step
         is_first_step = True
         self.cur_batch = cur_batch
+        self.optimizer = paddle.optimizer.Adam(
+            learning_rate=self.learning_rate, parameters=self.model.parameters()
+        )
         if not self.multi_task_mode:
             log.info(
                 "start training at lr %.2e (== %.2e), decay_step %d, decay_rate %f, final lr will be %.2e"
                 % (
-                    run_sess(self.sess, self.learning_rate),
-                    self.lr.value(cur_batch),
+                    self.learning_rate.get_lr(),
+                    self.learning_rate.get_lr(),
                     self.lr.decay_steps_,
                     self.lr.decay_rate_,
                     self.lr.value(stop_batch),
@@ -846,56 +840,18 @@ def train(self, train_data=None, valid_data=None):
                     "%s: start training at lr %.2e (== %.2e), decay_step %d, decay_rate %f, final lr will be %.2e"
                     % (
                         fitting_key,
-                        run_sess(self.sess, self.learning_rate_dict[fitting_key]),
-                        self.lr_dict[fitting_key].value(cur_batch),
+                        self.learning_rate[fitting_key].base_lr,
+                        self.lr_dict[fitting_key].get_lr(),
                         self.lr_dict[fitting_key].decay_steps_,
                         self.lr_dict[fitting_key].decay_rate_,
                         self.lr_dict[fitting_key].value(stop_batch),
                     )
                 )
 
-        prf_options = None
-        prf_run_metadata = None
-        if self.profiling:
-            prf_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
-            prf_run_metadata = tf.RunMetadata()
-
-        # set tensorboard execution environment
-        if self.tensorboard:
-            summary_merged_op = tf.summary.merge_all()
-            # Remove TB old logging directory from previous run
-            try:
-                shutil.rmtree(self.tensorboard_log_dir)
-            except FileNotFoundError:
-                pass  # directory does not exist, this is OK
-            except Exception as e:
-                # general error when removing directory, warn user
-                log.exception(
-                    f"Could not remove old tensorboard logging directory: "
-                    f"{self.tensorboard_log_dir}. Error: {e}"
-                )
-            else:
-                log.debug("Removing old tensorboard log directory.")
-            tb_train_writer = tf.summary.FileWriter(
-                self.tensorboard_log_dir + "/train", self.sess.graph
-            )
-            tb_valid_writer = tf.summary.FileWriter(self.tensorboard_log_dir + "/test")
-        else:
-            tb_train_writer = None
-            tb_valid_writer = None
-        if self.enable_profiler:
-            # https://www.tensorflow.org/guide/profiler
-            tfv2.profiler.experimental.start(self.tensorboard_log_dir)
-
         train_time = 0
         total_train_time = 0.0
         wall_time_tic = time.time()
 
-        next_batch_train_op = None
-        next_fitting_key = None
-        next_train_batch_list = None
-        next_datasetloader = None
-
         # dataset loader op
         if not self.multi_task_mode:
             datasetloader = DatasetLoader(train_data)
@@ -908,35 +864,7 @@ def train(self, train_data=None, valid_data=None):
                 data_op[fitting_key] = datasetloader[fitting_key].build()
 
         while cur_batch < stop_batch:
-            # first round validation:
-            if is_first_step:
-                if not self.multi_task_mode:
-                    train_batch = train_data.get_batch()
-                    batch_train_op = self.train_op
-                else:
-                    fitting_idx = dp_random.choice(
-                        np.arange(self.nfitting), p=np.array(self.fitting_prob)
-                    )
-                    fitting_key = self.fitting_key_list[fitting_idx]
-                    train_batch = train_data[fitting_key].get_batch()
-                    batch_train_op = self.train_op[fitting_key]
-            else:
-                train_batch = next_datasetloader.get_data_dict(next_train_batch_list)
-                batch_train_op = next_batch_train_op
-                fitting_key = next_fitting_key
-            # for next round
-            if not self.multi_task_mode:
-                next_datasetloader = datasetloader
-                next_batch_train_op = self.train_op
-                next_train_batch_op = data_op
-            else:
-                fitting_idx = dp_random.choice(
-                    np.arange(self.nfitting), p=np.array(self.fitting_prob)
-                )
-                next_fitting_key = self.fitting_key_list[fitting_idx]
-                next_datasetloader = datasetloader[next_fitting_key]
-                next_batch_train_op = self.train_op[fitting_key]
-                next_train_batch_op = data_op[fitting_key]
+            train_batch = datasetloader.get_data_dict()
 
             if self.display_in_training and is_first_step:
                 if self.run_opt.is_chief:
@@ -982,32 +910,77 @@ def train(self, train_data=None, valid_data=None):
 
             if self.timing_in_training:
                 tic = time.time()
-            train_feed_dict = self.get_feed_dict(train_batch, is_training=True)
             # use tensorboard to visualize the training of deepmd-kit
             # it will takes some extra execution time to generate the tensorboard data
             if self.tensorboard and (cur_batch % self.tensorboard_freq == 0):
-                summary, _, next_train_batch_list = run_sess(
-                    self.sess,
-                    [summary_merged_op, batch_train_op, next_train_batch_op],
-                    feed_dict=train_feed_dict,
-                    options=prf_options,
-                    run_metadata=prf_run_metadata,
+                model_pred = self.model(
+                    paddle.to_tensor(train_batch["coord"], "float32"),
+                    paddle.to_tensor(train_batch["type"], "int32"),
+                    paddle.to_tensor(train_batch["natoms_vec"], "int32", "cpu"),
+                    paddle.to_tensor(train_batch["box"], "float32"),
+                    paddle.to_tensor(train_batch["default_mesh"], "int32"),
+                    train_batch,
+                    suffix="",
+                    reuse=False,
                 )
-                tb_train_writer.add_summary(summary, cur_batch)
             else:
-                _, next_train_batch_list = run_sess(
-                    self.sess,
-                    [batch_train_op, next_train_batch_op],
-                    feed_dict=train_feed_dict,
-                    options=prf_options,
-                    run_metadata=prf_run_metadata,
+                model_inputs = {}
+                for kk in train_batch.keys():
+                    if kk == "find_type" or kk == "type":
+                        continue
+                    prec = "float64"
+                    if "find_" in kk:
+                        model_inputs[kk] = paddle.to_tensor(
+                            train_batch[kk], dtype="float64"
+                        )
+                    else:
+                        model_inputs[kk] = paddle.to_tensor(
+                            np.reshape(train_batch[kk], [-1]), dtype=prec
+                        )
+
+                for ii in ["type"]:
+                    model_inputs[ii] = paddle.to_tensor(
+                        np.reshape(train_batch[ii], [-1]), dtype="int32"
+                    )
+                for ii in ["natoms_vec", "default_mesh"]:
+                    model_inputs[ii] = paddle.to_tensor(train_batch[ii], dtype="int32")
+                model_inputs["is_training"] = paddle.to_tensor(True)
+                model_inputs["natoms_vec"] = paddle.to_tensor(
+                    model_inputs["natoms_vec"], place="cpu"
                 )
+                model_pred = self.model(
+                    model_inputs["coord"],
+                    model_inputs["type"],
+                    model_inputs["natoms_vec"],
+                    model_inputs["box"],
+                    model_inputs["default_mesh"],
+                    model_inputs,
+                    suffix="",
+                    reuse=False,
+                )
+
+                # print(f"{self.cur_batch} {self.learning_rate.get_lr():.10f}")
+                l2_l, l2_more = self.loss.compute_loss(
+                    self.learning_rate.get_lr(),
+                    model_inputs["natoms_vec"],
+                    model_pred,
+                    model_inputs,
+                    suffix="train",
+                )
+
+                self.optimizer.clear_grad()
+                l2_l.backward()
+                self.optimizer.step()
+                self.global_step += 1
+
             if self.timing_in_training:
                 toc = time.time()
             if self.timing_in_training:
                 train_time += toc - tic
-            cur_batch = run_sess(self.sess, self.global_step)
+            cur_batch = self.global_step
             self.cur_batch = cur_batch
+            if (cur_batch % self.lr.decay_steps_) == 0:
+                self.learning_rate.step()
 
             # on-the-fly validation
             if self.display_in_training and (cur_batch % self.disp_freq == 0):
@@ -1060,12 +1033,10 @@ def train(self, train_data=None, valid_data=None):
                 if (
                     self.save_freq > 0
                     and cur_batch % self.save_freq == 0
-                    and self.saver is not None
+                    # and self.saver is not None
                 ):
                     self.save_checkpoint(cur_batch)
-        if (
-            self.save_freq == 0 or cur_batch == 0 or cur_batch % self.save_freq != 0
-        ) and self.saver is not None:
+        if self.save_freq == 0 or cur_batch == 0 or cur_batch % self.save_freq != 0:
             self.save_checkpoint(cur_batch)
         if self.run_opt.is_chief:
             fp.close()
@@ -1083,42 +1054,9 @@ def train(self, train_data=None, valid_data=None):
                     total_train_time / (stop_batch // self.disp_freq * self.disp_freq),
                 )
 
-        if self.profiling and self.run_opt.is_chief:
-            fetched_timeline = timeline.Timeline(prf_run_metadata.step_stats)
-            chrome_trace = fetched_timeline.generate_chrome_trace_format()
-            with open(self.profiling_file, "w") as f:
-                f.write(chrome_trace)
-        if self.enable_profiler and self.run_opt.is_chief:
-            tfv2.profiler.experimental.stop()
-
     def save_checkpoint(self, cur_batch: int):
-        try:
-            ckpt_prefix = self.saver.save(
-                self.sess,
-                os.path.join(os.getcwd(), self.save_ckpt),
-                global_step=cur_batch,
-            )
-        except google.protobuf.message.DecodeError as e:
-            raise GraphTooLargeError(
-                "The graph size exceeds 2 GB, the hard limitation of protobuf."
-                " Then a DecodeError was raised by protobuf. You should "
-                "reduce the size of your model."
-            ) from e
-        # make symlinks from prefix with step to that without step to break nothing
-        # get all checkpoint files
-        original_files = glob.glob(ckpt_prefix + ".*")
-        for ori_ff in original_files:
-            new_ff = self.save_ckpt + ori_ff[len(ckpt_prefix) :]
-            try:
-                # remove old one
-                os.remove(new_ff)
-            except OSError:
-                pass
-            if platform.system() != "Windows":
-                # by default one does not have access to create symlink on Windows
-                os.symlink(ori_ff, new_ff)
-            else:
-                shutil.copyfile(ori_ff, new_ff)
+        paddle.save(self.model.state_dict(), f"Model_{cur_batch}.pdparams")
+        paddle.save(self.optimizer.state_dict(), f"Optimier_{cur_batch}.pdopt")
         log.info("saved checkpoint %s" % self.save_ckpt)
 
     def get_feed_dict(self, batch, is_training):
@@ -1127,18 +1065,18 @@ def get_feed_dict(self, batch, is_training):
             if kk == "find_type" or kk == "type" or kk == "real_natoms_vec":
                 continue
             if "find_" in kk:
-                feed_dict[self.place_holders[kk]] = batch[kk]
+                feed_dict[kk] = batch[kk]
             else:
-                feed_dict[self.place_holders[kk]] = np.reshape(batch[kk], [-1])
+                feed_dict[kk] = np.reshape(batch[kk], [-1])
         for ii in ["type"]:
-            feed_dict[self.place_holders[ii]] = np.reshape(batch[ii], [-1])
+            feed_dict[ii] = np.reshape(batch[ii], [-1])
         for ii in ["natoms_vec", "default_mesh"]:
-            feed_dict[self.place_holders[ii]] = batch[ii]
-        feed_dict[self.place_holders["is_training"]] = is_training
+            feed_dict[ii] = batch[ii]
+        feed_dict["is_training"] = is_training
         return feed_dict
 
     def get_global_step(self):
-        return run_sess(self.sess, self.global_step)
+        return self.global_step
 
     # def print_head (self) :  # depreciated
     #     if self.run_opt.is_chief:
@@ -1157,7 +1095,7 @@ def valid_on_the_fly(
 
         cur_batch = self.cur_batch
         if not self.multi_task_mode:
-            current_lr = run_sess(self.sess, self.learning_rate)
+            current_lr = self.learning_rate.get_lr()
         else:
             assert (
                 fitting_key is not None
@@ -1263,8 +1201,8 @@ def print_on_training(
         fp.write(print_str)
         fp.flush()
 
-    @staticmethod
-    def eval_single_list(single_batch_list, loss, sess, get_feed_dict_func, prefix=""):
+    # @staticmethod
+    def eval_single_list(self, single_batch_list, loss, prefix=""):
         if single_batch_list is None:
             return None
         numb_batch = len(single_batch_list)
@@ -1273,8 +1211,8 @@ def eval_single_list(single_batch_list, loss, sess, get_feed_dict_func, prefix="
         for i in range(numb_batch):
             batch = single_batch_list[i]
             natoms = batch["natoms_vec"]
-            feed_dict = get_feed_dict_func(batch, is_training=False)
-            results = loss.eval(sess, feed_dict, natoms)
+            # feed_dict = get_feed_dict_func(batch, is_training=False)
+            results = loss.eval(self.model, batch, natoms)
 
             for k, v in results.items():
                 if k == "natoms":
@@ -1290,9 +1228,7 @@ def eval_single_list(single_batch_list, loss, sess, get_feed_dict_func, prefix="
 
     def get_evaluation_results(self, batch_list):
         if not self.multi_task_mode:
-            avg_results = self.eval_single_list(
-                batch_list, self.loss, self.sess, self.get_feed_dict
-            )
+            avg_results = self.eval_single_list(batch_list, self.loss)
         else:
             avg_results = {}
             for fitting_key in batch_list:
@@ -1474,9 +1410,11 @@ def get_train_batch() -> List[np.ndarray]:
             batch_data = tuple([batch_data[kk] for kk in self.data_keys])
             return batch_data
 
-        return tf.py_func(get_train_batch, [], self.data_types, name="train_data")
+        return get_train_batch
 
-    def get_data_dict(self, batch_list: List[np.ndarray]) -> Dict[str, np.ndarray]:
+    def get_data_dict(
+        self, batch_list: List[np.ndarray] = None
+    ) -> Dict[str, np.ndarray]:
         """Generate a dict of the loaded data.
 
         Parameters
@@ -1489,4 +1427,8 @@ def get_data_dict(self, batch_list: List[np.ndarray]) -> Dict[str, np.ndarray]:
         Dict[str, np.ndarray]
             The dict of the loaded data.
         """
-        return {kk: vv for kk, vv in zip(self.data_keys, batch_list)}
+        batch_data = self.train_data.get_batch()
+        # convert dict to list of arrays
+        batch_data = tuple([batch_data[kk] for kk in self.data_keys])
+        return {kk: vv for kk, vv in zip(self.data_keys, batch_data)}
+        # return {kk: vv for kk, vv in zip(self.data_keys, batch_list)}
diff --git a/deepmd/utils/data.py b/deepmd/utils/data.py
index 16fcbfc7c5..2749a45594 100644
--- a/deepmd/utils/data.py
+++ b/deepmd/utils/data.py
@@ -458,7 +458,7 @@ def _load_set(self, set_name: DPPath):
                     self.data_dict[kk]["ndof"],
                     atomic=self.data_dict[kk]["atomic"],
                     high_prec=self.data_dict[kk]["high_prec"],
-                    must=self.data_dict[kk]["must"],
+                    must=False,
                     type_sel=self.data_dict[kk]["type_sel"],
                     repeat=self.data_dict[kk]["repeat"],
                     default=self.data_dict[kk]["default"],
diff --git a/deepmd/utils/learning_rate.py b/deepmd/utils/learning_rate.py
index 324f4f7fff..0f1ccdf5cf 100644
--- a/deepmd/utils/learning_rate.py
+++ b/deepmd/utils/learning_rate.py
@@ -3,6 +3,9 @@
 )
 
 import numpy as np
+from paddle.optimizer import (
+    lr,
+)
 
 from deepmd.env import (
     tf,
@@ -89,12 +92,9 @@ def build(
                 np.log(self.stop_lr_ / self.start_lr_) / (stop_step / self.decay_steps_)
             )
 
-        return tf.train.exponential_decay(
+        return lr.ExponentialDecay(
             self.start_lr_,
-            global_step,
-            self.decay_steps_,
-            self.decay_rate_,
-            staircase=True,
+            gamma=self.decay_rate_,
         )
 
     def start_lr(self) -> float:
diff --git a/deepmd/utils/neighbor_stat.py b/deepmd/utils/neighbor_stat.py
index 9b23bc9d76..966645996a 100644
--- a/deepmd/utils/neighbor_stat.py
+++ b/deepmd/utils/neighbor_stat.py
@@ -6,19 +6,14 @@
 )
 
 import numpy as np
+import paddle
 
 from deepmd.env import (
-    GLOBAL_NP_FLOAT_PRECISION,
-    default_tf_session_config,
     op_module,
-    tf,
 )
 from deepmd.utils.data_system import (
     DeepmdDataSystem,
 )
-from deepmd.utils.parallel_op import (
-    ParallelOp,
-)
 
 log = logging.getLogger(__name__)
 
@@ -48,45 +43,6 @@ def __init__(
         self.rcut = rcut
         self.ntypes = ntypes
         self.one_type = one_type
-        sub_graph = tf.Graph()
-
-        def builder():
-            place_holders = {}
-            for ii in ["coord", "box"]:
-                place_holders[ii] = tf.placeholder(
-                    GLOBAL_NP_FLOAT_PRECISION, [None, None], name="t_" + ii
-                )
-            place_holders["type"] = tf.placeholder(
-                tf.int32, [None, None], name="t_type"
-            )
-            place_holders["natoms_vec"] = tf.placeholder(
-                tf.int32, [self.ntypes + 2], name="t_natoms"
-            )
-            place_holders["default_mesh"] = tf.placeholder(
-                tf.int32, [None], name="t_mesh"
-            )
-            t_type = place_holders["type"]
-            t_natoms = place_holders["natoms_vec"]
-            if self.one_type:
-                # all types = 0, natoms_vec = [natoms, natoms, natoms]
-                t_type = tf.clip_by_value(t_type, -1, 0)
-                t_natoms = tf.tile(t_natoms[0:1], [3])
-
-            _max_nbor_size, _min_nbor_dist = op_module.neighbor_stat(
-                place_holders["coord"],
-                t_type,
-                t_natoms,
-                place_holders["box"],
-                place_holders["default_mesh"],
-                rcut=self.rcut,
-            )
-            place_holders["dir"] = tf.placeholder(tf.string)
-            return place_holders, (_max_nbor_size, _min_nbor_dist, place_holders["dir"])
-
-        with sub_graph.as_default():
-            self.p = ParallelOp(builder, config=default_tf_session_config)
-
-        self.sub_sess = tf.Session(graph=sub_graph, config=default_tf_session_config)
 
     def get_stat(self, data: DeepmdDataSystem) -> Tuple[float, List[int]]:
         """Get the data statistics of the training data, including nearest nbor distance between atoms, max nbor size of atoms.
@@ -108,44 +64,61 @@ def get_stat(self, data: DeepmdDataSystem) -> Tuple[float, List[int]]:
         if not self.one_type:
             self.max_nbor_size *= self.ntypes
 
-        def feed():
-            for ii in range(len(data.system_dirs)):
-                for jj in data.data_systems[ii].dirs:
-                    data_set = data.data_systems[ii]._load_set(jj)
-                    for kk in range(np.array(data_set["type"]).shape[0]):
-                        yield {
-                            "coord": np.array(data_set["coord"])[kk].reshape(
-                                [-1, data.natoms[ii] * 3]
-                            ),
-                            "type": np.array(data_set["type"])[kk].reshape(
-                                [-1, data.natoms[ii]]
-                            ),
-                            "natoms_vec": np.array(data.natoms_vec[ii]),
-                            "box": np.array(data_set["box"])[kk].reshape([-1, 9]),
-                            "default_mesh": np.array(data.default_mesh[ii]),
-                            "dir": str(jj),
-                        }
-
-        for mn, dt, jj in self.p.generate(self.sub_sess, feed()):
-            if dt.size != 0:
-                dt = np.min(dt)
-            else:
-                dt = self.rcut
-                log.warning(
-                    "Atoms with no neighbors found in %s. Please make sure it's what you expected."
-                    % jj
-                )
-            if dt < self.min_nbor_dist:
-                if math.isclose(dt, 0.0, rel_tol=1e-6):
-                    # it's unexpected that the distance between two atoms is zero
-                    # zero distance will cause nan (#874)
-                    raise RuntimeError(
-                        "Some atoms are overlapping in %s. Please check your"
-                        " training data to remove duplicated atoms." % jj
+        for ii in range(len(data.system_dirs)):
+            for jj in data.data_systems[ii].dirs:
+                data_set = data.data_systems[ii]._load_set(jj)
+                for kk in range(np.array(data_set["type"]).shape[0]):
+                    coord = np.array(data_set["coord"])[kk].reshape(
+                        [-1, data.natoms[ii] * 3]
+                    )
+                    coord = paddle.to_tensor(coord, dtype="float32", place="cpu")
+
+                    _type = np.array(data_set["type"])[kk].reshape(
+                        [-1, data.natoms[ii]]
+                    )
+                    _type = paddle.to_tensor(_type, dtype="int32", place="cpu")
+
+                    natoms_vec = np.array(data.natoms_vec[ii])
+                    natoms_vec = paddle.to_tensor(
+                        natoms_vec, dtype="int64", place="cpu"
+                    )
+
+                    box = np.array(data_set["box"])[kk].reshape([-1, 9])
+                    box = paddle.to_tensor(box, dtype="float32", place="cpu")
+
+                    default_mesh = np.array(data.default_mesh[ii])
+                    default_mesh = paddle.to_tensor(
+                        default_mesh, dtype="int32", place="cpu"
+                    )
+
+                    rcut = self.rcut
+                    mn, dt = op_module.neighbor_stat(
+                        coord,
+                        _type,
+                        natoms_vec,
+                        box,
+                        default_mesh,
+                        rcut,
                     )
-                self.min_nbor_dist = dt
-            var = np.max(mn, axis=0)
-            self.max_nbor_size = np.maximum(var, self.max_nbor_size)
+                    if dt.size != 0:
+                        dt = paddle.min(dt).item()
+                    else:
+                        dt = self.rcut
+                        log.warning(
+                            "Atoms with no neighbors found in %s. Please make sure it's what you expected."
+                            % jj
+                        )
+                    if dt < self.min_nbor_dist:
+                        if math.isclose(dt, 0.0, rel_tol=1e-6):
+                            # it's unexpected that the distance between two atoms is zero
+                            # zero distance will cause nan (#874)
+                            raise RuntimeError(
+                                "Some atoms are overlapping in %s. Please check your"
+                                " training data to remove duplicated atoms." % jj
+                            )
+                        self.min_nbor_dist = dt
+                    var = paddle.max(mn, axis=0).numpy()
+                    self.max_nbor_size = np.maximum(var, self.max_nbor_size)
 
         log.info("training data with min nbor dist: " + str(self.min_nbor_dist))
         log.info("training data with max nbor size: " + str(self.max_nbor_size))
diff --git a/deepmd/utils/network.py b/deepmd/utils/network.py
index a718da0b26..58e6378215 100644
--- a/deepmd/utils/network.py
+++ b/deepmd/utils/network.py
@@ -4,7 +4,9 @@
     get_precision,
 )
 from deepmd.env import (
+    GLOBAL_PD_FLOAT_PRECISION,
     GLOBAL_TF_FLOAT_PRECISION,
+    paddle,
     tf,
 )
 
@@ -296,3 +298,194 @@ def variable_summaries(var: tf.Variable, name: str):
         tf.summary.scalar("max", tf.reduce_max(var))
         tf.summary.scalar("min", tf.reduce_min(var))
         tf.summary.histogram("histogram", var)
+
+
+class OneLayer(paddle.nn.Layer):
+    """Fully connected layer computing ``activation_fn(input @ weight + bias)``.
+
+    With ``use_timestep`` (and an activation) the activated output is scaled
+    element-wise by the learnable ``idt`` vector. An integer ``seed`` reseeds
+    Paddle before the parameters are created; ``None`` skips reseeding.
+    """
+
+    def __init__(
+        self,
+        in_features,
+        out_features,
+        activation_fn=paddle.nn.functional.tanh,
+        precision=GLOBAL_PD_FLOAT_PRECISION,
+        stddev=1.0,
+        bavg=0.0,
+        name="linear",
+        seed=None,
+        use_timestep=False,
+        trainable=True,
+        useBN=False,
+    ):
+        super().__init__(name)
+        self.out_features = out_features
+        self.activation_fn = activation_fn
+        self.use_timestep = use_timestep
+        self.useBN = useBN
+        self.seed = seed
+        # paddle.seed() needs an integer, so only reseed on an explicit seed
+        if seed is not None:
+            paddle.seed(seed)
+
+        self.weight = self.create_parameter(
+            shape=[in_features, out_features],
+            dtype=precision,
+            is_bias=False,
+            attr=paddle.ParamAttr(trainable=trainable),
+            default_initializer=paddle.nn.initializer.Normal(
+                std=stddev / np.sqrt(in_features + out_features)
+            ),
+        )
+        self.bias = self.create_parameter(
+            shape=[out_features],
+            dtype=precision,
+            is_bias=True,
+            attr=paddle.ParamAttr(trainable=trainable),
+            default_initializer=paddle.nn.initializer.Normal(
+                mean=bavg if isinstance(bavg, float) else bavg[0], std=stddev
+            ),
+        )
+        if self.activation_fn is not None and self.use_timestep:
+            self.idt = self.create_parameter(
+                shape=[out_features],
+                dtype=precision,
+                attr=paddle.ParamAttr(trainable=trainable),
+                default_initializer=paddle.nn.initializer.Normal(mean=0.1, std=0.001),
+            )
+
+    def forward(self, input):
+        """Apply the layer to ``input`` (last axis of size ``in_features``)."""
+        hidden = paddle.matmul(input, self.weight) + self.bias
+        if self.activation_fn is None or self.useBN:
+            # linear output; batch normalization (useBN) is not implemented, so
+            # that path also returns the pre-activation values unchanged
+            return hidden
+        hidden = paddle.reshape(self.activation_fn(hidden), [-1, self.out_features])
+        if self.use_timestep:
+            hidden = hidden * self.idt
+        return hidden
+
+
+class EmbeddingNet(paddle.nn.Layer):
+    """Embedding network: a stack of dense layers with optional skip connections.
+
+    Parameters
+    ----------
+    network_size : list of int
+        Size of the embedding network. For example [16,32,64]
+    precision
+        Precision (dtype) of the network parameters
+    activation_fn
+        Activation function; ``None`` makes every layer linear
+    resnet_dt : bool
+        Using time-step in the ResNet construction
+    stddev : float
+        Standard deviation of initializing network parameters
+    bavg : float
+        Mean of network initial bias
+    seed : int
+        Random seed for initializing network parameters; ``None`` skips seeding
+    trainable : bool
+        If the network is trainable
+    name : str
+        Name scope passed to the ``paddle.nn.Layer`` constructor
+    """
+
+    def __init__(
+        self,
+        network_size,
+        precision,
+        activation_fn=paddle.nn.functional.tanh,
+        resnet_dt=False,
+        stddev=1.0,
+        bavg=0.0,
+        seed=42,
+        trainable=True,
+        name="",
+    ):
+        super().__init__(name)
+        self.name = name
+        self.outputs_size = [1] + network_size
+        self.activation_fn = activation_fn
+        self.resnet_dt = resnet_dt
+        self.seed = seed
+        if seed is not None:  # paddle.seed() needs an integer
+            paddle.seed(seed)
+
+        outputs_size = self.outputs_size
+        weight = []
+        bias = []
+        idt = []
+        for ii in range(1, len(outputs_size)):
+            weight.append(
+                self.create_parameter(
+                    shape=[outputs_size[ii - 1], outputs_size[ii]],
+                    dtype=precision,
+                    is_bias=False,
+                    attr=paddle.ParamAttr(trainable=trainable),
+                    default_initializer=paddle.nn.initializer.Normal(
+                        std=stddev / np.sqrt(outputs_size[ii] + outputs_size[ii - 1])
+                    ),
+                )
+            )
+            bias.append(
+                self.create_parameter(
+                    shape=[1, outputs_size[ii]],
+                    dtype=precision,
+                    is_bias=True,
+                    attr=paddle.ParamAttr(trainable=trainable),
+                    default_initializer=paddle.nn.initializer.Normal(
+                        mean=bavg, std=stddev
+                    ),
+                )
+            )
+            if resnet_dt:
+                idt.append(
+                    self.create_parameter(
+                        shape=[1, outputs_size[ii]],
+                        dtype=precision,
+                        attr=paddle.ParamAttr(trainable=trainable),
+                        default_initializer=paddle.nn.initializer.Normal(
+                            mean=1.0, std=0.001
+                        ),
+                    )
+                )
+
+        self.weight = paddle.nn.ParameterList(weight)
+        self.bias = paddle.nn.ParameterList(bias)
+        self.idt = paddle.nn.ParameterList(idt)
+
+    def forward(self, xx):
+        """Apply all layers to ``xx`` (shape [-1, 1]) and return the embedding.
+
+        A layer that keeps (doubles) the width is added to its input (its
+        input concatenated with itself); with ``resnet_dt`` that layer output
+        is first scaled by its ``idt``. Any other layer replaces the input.
+        """
+        outputs_size = self.outputs_size
+        for ii in range(1, len(outputs_size)):
+            width_in, width_out = outputs_size[ii - 1], outputs_size[ii]
+            hidden = paddle.matmul(xx, self.weight[ii - 1]) + self.bias[ii - 1]
+            if self.activation_fn is not None:
+                hidden = self.activation_fn(hidden)
+            hidden = paddle.reshape(hidden, [-1, width_out])
+
+            if width_out not in (width_in, width_in * 2):
+                # no skip connection possible for this change of width
+                xx = hidden
+                continue
+            if self.resnet_dt:
+                # parameter lists are 0-based while ``ii`` starts at 1
+                hidden = hidden * self.idt[ii - 1]
+            if width_out == width_in:
+                # out-of-place add, so the caller's tensor is never modified
+                xx = xx + hidden
+            else:
+                xx = paddle.concat([xx, xx], axis=1) + hidden
+
+        return xx
diff --git a/deepmd/utils/type_embed.py b/deepmd/utils/type_embed.py
index 7a3e0925b8..132c7d2db3 100644
--- a/deepmd/utils/type_embed.py
+++ b/deepmd/utils/type_embed.py
@@ -9,6 +9,7 @@
     get_precision,
 )
 from deepmd.env import (
+    paddle,
     tf,
 )
 from deepmd.utils.graph import (
@@ -47,15 +48,15 @@ def embed_atom_type(
         The embedded type of each atom.
         It has the shape of [numb_atoms, embedding_dim]
     """
-    te_out_dim = type_embedding.get_shape().as_list()[-1]
+    te_out_dim = type_embedding.shape[-1]
     atype = []
     for ii in range(ntypes):
-        atype.append(tf.tile([ii], [natoms[2 + ii]]))
-    atype = tf.concat(atype, axis=0)
-    atm_embed = tf.nn.embedding_lookup(
-        type_embedding, tf.cast(atype, dtype=tf.int32)
+        atype.append(paddle.tile([ii], [natoms[2 + ii]]))
+    atype = paddle.concat(atype, axis=0)
+    atm_embed = paddle.nn.functional.embedding(
+        paddle.cast(atype, dtype=paddle.int32), type_embedding
     )  # (nf*natom)*nchnl
-    atm_embed = tf.reshape(atm_embed, [-1, te_out_dim])
+    atm_embed = paddle.reshape(atm_embed, [-1, te_out_dim])
     return atm_embed
 
 
diff --git a/doc/getting-started/quick_start.ipynb b/doc/getting-started/quick_start.ipynb
index e743b5cf5c..31209ae381 100644
--- a/doc/getting-started/quick_start.ipynb
+++ b/doc/getting-started/quick_start.ipynb
@@ -1,5 +1,5 @@
 {
- "cells": [
+  "cells": [
   {
    "attachments": {},
    "cell_type": "markdown",
diff --git a/doc/install/install-tf.1.14-gpu.md b/doc/install/install-tf.1.14-gpu.md
index 4e9fcaf7fc..46d676d1b9 100644
--- a/doc/install/install-tf.1.14-gpu.md
+++ b/doc/install/install-tf.1.14-gpu.md
@@ -72,21 +72,21 @@ Would you like to interactively configure ./WORKSPACE for Android builds? [y/N]:
 Not configuring the WORKSPACE for Android builds.
 
 Preconfigured Bazel build configs. You can use any of the below by adding "--config=<>" to your build command. See .bazelrc for more details.
-	--config=mkl         	# Build with MKL support.
-	--config=monolithic  	# Config for mostly static monolithic build.
-	--config=gdr         	# Build with GDR support.
-	--config=verbs       	# Build with libverbs support.
-	--config=ngraph      	# Build with Intel nGraph support.
-	--config=numa        	# Build with NUMA support.
-	--config=dynamic_kernels	# (Experimental) Build kernels into separate shared objects.
-	--config=v2          	# Build TensorFlow 2.x instead of 1.x.
+    --config=mkl             # Build with MKL support.
+    --config=monolithic      # Config for mostly static monolithic build.
+    --config=gdr             # Build with GDR support.
+    --config=verbs           # Build with libverbs support.
+    --config=ngraph          # Build with Intel nGraph support.
+    --config=numa            # Build with NUMA support.
+    --config=dynamic_kernels    # (Experimental) Build kernels into separate shared objects.
+    --config=v2              # Build TensorFlow 2.x instead of 1.x.
 Preconfigured Bazel build configs to DISABLE default on features:
-	--config=noaws       	# Disable AWS S3 filesystem support.
-	--config=nogcp       	# Disable GCP support.
-	--config=nohdfs      	# Disable HDFS support.
-	--config=noignite    	# Disable Apache Ignite support.
-	--config=nokafka     	# Disable Apache Kafka support.
-	--config=nonccl      	# Disable NVIDIA NCCL support.
+    --config=noaws           # Disable AWS S3 filesystem support.
+    --config=nogcp           # Disable GCP support.
+    --config=nohdfs          # Disable HDFS support.
+    --config=noignite        # Disable Apache Ignite support.
+    --config=nokafka         # Disable Apache Kafka support.
+    --config=nonccl          # Disable NVIDIA NCCL support.
 Configuration finished
 ```
 
diff --git a/doc/logo.md b/doc/logo.md
index 420f378336..67c303f651 100644
--- a/doc/logo.md
+++ b/doc/logo.md
@@ -1,5 +1,5 @@
-# Logo
-
-<picture><source media="(prefers-color-scheme: dark)" srcset="./_static/logo-dark.svg"><source media="(prefers-color-scheme: light)" srcset="./_static/logo.svg"><img alt="DeePMD-kit logo" src="./_static/logo.svg"></picture>
-
-The logo of DeePMD-kit is a beaver. Beavers were widely distributed in Europe and Asia but became nearly extinct due to hunting. Listed as a first-class state-protected animal in China, the population of beavers in China is less than the giant pandas. We hope that users of DeePMD-kit can enhance the awareness to protect beavers.
+# Logo
+
+<picture><source media="(prefers-color-scheme: dark)" srcset="./_static/logo-dark.svg"><source media="(prefers-color-scheme: light)" srcset="./_static/logo.svg"><img alt="DeePMD-kit logo" src="./_static/logo.svg"></picture>
+
+The logo of DeePMD-kit is a beaver. Beavers were widely distributed in Europe and Asia but became nearly extinct due to hunting. Beavers are listed as a first-class state-protected animal in China, where their population is smaller than that of giant pandas. We hope that users of DeePMD-kit can help raise awareness of the need to protect beavers.
diff --git a/doc/model/dplr.md b/doc/model/dplr.md
index 27bfadcf00..46ff1ac25c 100644
--- a/doc/model/dplr.md
+++ b/doc/model/dplr.md
@@ -15,20 +15,20 @@ $deepmd_source_dir/examples/water/dplr/train/
 It is noted that **the tutorial dataset is not enough for training a productive model**.
 Two settings make the training input script different from an energy training input:
 ```json
-	"fitting_net": {
-	    "type":		"dipole",
-	    "dipole_type":	[0],
-	    "neuron":		[128, 128, 128],
-	    "seed":		1
-	},
+    "fitting_net": {
+        "type":        "dipole",
+        "dipole_type":    [0],
+        "neuron":        [128, 128, 128],
+        "seed":        1
+    },
 ```
 The type of fitting is set to {ref}`dipole <model/fitting_net[dipole]>`. The dipole is associated with type 0 atoms (oxygens), by the setting `"dipole_type": [0]`. What we trained is the displacement of the WC from the corresponding oxygen atom. It shares the same training input as the atomic dipole because both are 3-dimensional vectors defined on atoms.
 The loss section is provided as follows
 ```json
     "loss": {
-	"type":		"tensor",
-	"pref":		0.0,
-	"pref_atomic":	1.0
+    "type":        "tensor",
+    "pref":        0.0,
+    "pref_atomic":    1.0
     },
 ```
 so that the atomic dipole is trained as labels. Note that the NumPy compressed file `atomic_dipole.npy` should be provided in each dataset.
@@ -128,8 +128,8 @@ Type 1 and 2 (O and H) are `real_atom`s, while type 3 (WCs) are `virtual_atom`s.
 # kspace_style "pppm/dplr" should be used. in addition the
 # gewald(1/distance) should be set the same as that used in
 # training. Currently only ik differentiation is supported.
-kspace_style	pppm/dplr 1e-5
-kspace_modify	gewald ${BETA} diff ik mesh ${KMESH} ${KMESH} ${KMESH}
+kspace_style    pppm/dplr 1e-5
+kspace_modify    gewald ${BETA} diff ik mesh ${KMESH} ${KMESH} ${KMESH}
 ```
 The long-range part is calculated by the `kspace` support of LAMMPS. The `kspace_style` `pppm/dplr` is required. The spread parameter set by variable `BETA` should be set the same as that used in training. The `KMESH` should be set dense enough so the long-range calculation is converged.
 
@@ -139,18 +139,18 @@ The long-range part is calculated by the `kspace` support of LAMMPS. The `kspace
 # atoms. "type_associate" associates the real atom type its
 # corresponding virtual atom type. "bond_type" gives the type of the
 # bond between the real and virtual atoms.
-fix		0 all dplr model ener.pb type_associate 1 3 bond_type 1
-fix_modify	0 virial yes
+fix        0 all dplr model ener.pb type_associate 1 3 bond_type 1
+fix_modify    0 virial yes
 ```
 The fix command `dplr` calculates the position of WCs by the DW model and back-propagates the long-range interaction on virtual atoms to real toms.
 At this time, the training parameter {ref}`type_map <model/type_map>` will be mapped to LAMMPS atom types.
 
 ```lammps
 # compute the temperature of real atoms, excluding virtual atom contribution
-compute		real_temp real_atom temp
-compute		real_press all pressure real_temp
-fix		1 real_atom nvt temp ${TEMP} ${TEMP} ${TAU_T}
-fix_modify	1 temp real_temp
+compute        real_temp real_atom temp
+compute        real_press all pressure real_temp
+fix        1 real_atom nvt temp ${TEMP} ${TEMP} ${TAU_T}
+fix_modify    1 temp real_temp
 ```
 The temperature of the system should be computed from the real atoms. The kinetic contribution in the pressure tensor is also computed from the real atoms. The thermostat is applied to only real atoms. The computed temperature and pressure of real atoms can be accessed by, e.g.
 ```lammps
diff --git a/doc/model/overall.md b/doc/model/overall.md
index 3d4052e464..e13971afc2 100644
--- a/doc/model/overall.md
+++ b/doc/model/overall.md
@@ -3,7 +3,7 @@
 A model has two parts, a descriptor that maps atomic configuration to a set of symmetry invariant features, and a fitting net that takes descriptor as input and predicts the atomic contribution to the target physical property. It's defined in the {ref}`model <model>` section of the `input.json`, for example,
 ```json
     "model": {
-        "type_map":	["O", "H"],
+        "type_map":    ["O", "H"],
         "descriptor" :{
             "...": "..."
         },
diff --git a/doc/model/train-energy-spin.md b/doc/model/train-energy-spin.md
index d155ec977d..19af54d3d0 100644
--- a/doc/model/train-energy-spin.md
+++ b/doc/model/train-energy-spin.md
@@ -36,15 +36,15 @@ pref_fr(t) = start_pref_fr * ( lr(t) / start_lr ) + limit_pref_fr * ( 1 - lr(t)
 The {ref}`loss <loss>` section in the `input.json` is
 ```json
     "loss" :{
-	"type":		        "ener_spin",
-	"start_pref_e":	    0.02,
-	"limit_pref_e":	    1,
-	"start_pref_fr":	1000,
-    "limit_pref_fr":	1.0,
-	"start_pref_fm":	10000,
-	"limit_pref_fm":	10.0,
-	"start_pref_v":	    0,
-	"limit_pref_v":	    0,
+    "type":                "ener_spin",
+    "start_pref_e":        0.02,
+    "limit_pref_e":        1,
+    "start_pref_fr":    1000,
+    "limit_pref_fr":    1.0,
+    "start_pref_fm":    10000,
+    "limit_pref_fm":    10.0,
+    "start_pref_v":        0,
+    "limit_pref_v":        0,
     },
 ```
 The options {ref}`start_pref_e <loss[ener_spin]/start_pref_e>`, {ref}`limit_pref_e <loss[ener_spin]/limit_pref_e>`, {ref}`start_pref_fr <loss[ener_spin]/start_pref_fr>`, {ref}`limit_pref_fm <loss[ener_spin]/limit_pref_fm>`, {ref}`start_pref_v <loss[ener_spin]/start_pref_v>` and {ref}`limit_pref_v <loss[ener_spin]/limit_pref_v>` determine the start and limit prefactors of energy, atomic force, magnatic force and virial, respectively.
diff --git a/doc/model/train-energy.md b/doc/model/train-energy.md
index af3e4969b3..5d57e5f631 100644
--- a/doc/model/train-energy.md
+++ b/doc/model/train-energy.md
@@ -6,11 +6,11 @@ In this section, we will take `$deepmd_source_dir/examples/water/se_e2_a/input.j
 
 The construction of the fitting net is given by section {ref}`fitting_net <model/fitting_net>`
 ```json
-	"fitting_net" : {
-	    "neuron":		[240, 240, 240],
-	    "resnet_dt":	true,
-	    "seed":		1
-	},
+    "fitting_net" : {
+        "neuron":        [240, 240, 240],
+        "resnet_dt":    true,
+        "seed":        1
+    },
 ```
 * {ref}`neuron <model/fitting_net[ener]/neuron>` specifies the size of the fitting net. If two neighboring layers are of the same size, then a [ResNet architecture](https://arxiv.org/abs/1512.03385) is built between them.
 * If the option {ref}`resnet_dt <model/fitting_net[ener]/resnet_dt>` is set to `true`, then a timestep is used in the ResNet.
@@ -34,12 +34,12 @@ pref_f(t) = start_pref_f * ( lr(t) / start_lr ) + limit_pref_f * ( 1 - lr(t) / s
 The {ref}`loss <loss>` section in the `input.json` is
 ```json
     "loss" : {
-	"start_pref_e":	0.02,
-	"limit_pref_e":	1,
-	"start_pref_f":	1000,
-	"limit_pref_f":	1,
-	"start_pref_v":	0,
-	"limit_pref_v":	0
+    "start_pref_e":    0.02,
+    "limit_pref_e":    1,
+    "start_pref_f":    1000,
+    "limit_pref_f":    1,
+    "start_pref_v":    0,
+    "limit_pref_v":    0
     }
 ```
 The options {ref}`start_pref_e <loss[ener]/start_pref_e>`, {ref}`limit_pref_e <loss[ener]/limit_pref_e>`, {ref}`start_pref_f <loss[ener]/start_pref_f>`, {ref}`limit_pref_f <loss[ener]/limit_pref_f>`, {ref}`start_pref_v <loss[ener]/start_pref_v>` and {ref}`limit_pref_v <loss[ener]/limit_pref_v>` determine the start and limit prefactors of energy, force and virial, respectively.
diff --git a/doc/model/train-fitting-dos.md b/doc/model/train-fitting-dos.md
index bbe5b50690..1a9800dfed 100644
--- a/doc/model/train-fitting-dos.md
+++ b/doc/model/train-fitting-dos.md
@@ -21,15 +21,15 @@ The {ref}`fitting_net <model/fitting_net>` section tells DP which fitting net to
 The JSON of `dos` type should be provided like
 
 ```json
-	"fitting_net" : {
-		"type": "dos",
-		"numb_dos": 250,
-		"sel_type": [0],
-		"neuron": [120,120,120],
-		"resnet_dt": true,
-		"fparam": 0,
-		"seed": 1,
-	},
+    "fitting_net" : {
+        "type": "dos",
+        "numb_dos": 250,
+        "sel_type": [0],
+        "neuron": [120,120,120],
+        "resnet_dt": true,
+        "fparam": 0,
+        "seed": 1,
+    },
 ```
 
 -   `type` specifies which type of fitting net should be used. It should be `dos`.
@@ -49,17 +49,17 @@ loss = pref * global_loss + pref_atomic * atomic_loss
 The loss section should be provided like
 
 ```json
-	"loss" : {
-		"type": "dos",
-		"start_pref_dos": 0.0,
-		"limit_pref_dos": 0.0,
-		"start_pref_cdf": 0.0,
-		"limit_pref_cdf": 0.0,
-		"start_pref_ados": 1.0,
-		"limit_pref_ados": 1.0,
-		"start_pref_acdf": 0.0,
-		"limit_pref_acdf": 0.0
-	},
+    "loss" : {
+        "type": "dos",
+        "start_pref_dos": 0.0,
+        "limit_pref_dos": 0.0,
+        "start_pref_cdf": 0.0,
+        "limit_pref_cdf": 0.0,
+        "start_pref_ados": 1.0,
+        "limit_pref_ados": 1.0,
+        "start_pref_acdf": 0.0,
+        "limit_pref_acdf": 0.0
+    },
 ```
 
 -   {ref}`type <loss/type>` should be written as `dos` as a distinction from `ener` mode.
diff --git a/doc/model/train-fitting-tensor.md b/doc/model/train-fitting-tensor.md
index d7c06a25ed..c941ef93ee 100644
--- a/doc/model/train-fitting-tensor.md
+++ b/doc/model/train-fitting-tensor.md
@@ -1,123 +1,123 @@
-# Fit `tensor` like `Dipole` and `Polarizability`
-
-Unlike `energy`, which is a scalar, one may want to fit some high dimensional physical quantity, like `dipole` (vector) and `polarizability` (matrix, shorted as `polar`). Deep Potential has provided different APIs to do this. In this example, we will show you how to train a model to fit a water system. A complete training input script of the examples can be found in
-
-```bash
-$deepmd_source_dir/examples/water_tensor/dipole/dipole_input.json
-$deepmd_source_dir/examples/water_tensor/polar/polar_input.json
-```
-
-The training and validation data are also provided our examples. But note that **the data provided along with the examples are of limited amount, and should not be used to train a production model.**
-
-Similar to the `input.json` used in `ener` mode, training JSON is also divided into {ref}`model <model>`, {ref}`learning_rate <learning_rate>`, {ref}`loss <loss>` and {ref}`training <training>`. Most keywords remain the same as `ener` mode, and their meaning can be found [here](train-se-e2-a.md). To fit a tensor, one needs to modify {ref}`model/fitting_net <model/fitting_net>` and {ref}`loss <loss>`.
-
-## The fitting Network
-
-The {ref}`fitting_net <model/fitting_net>` section tells DP which fitting net to use.
-
-The JSON of `dipole` type should be provided like
-
-```json
-	"fitting_net" : {
-		"type": "dipole",
-		"sel_type": [0],
-		"neuron": [100,100,100],
-		"resnet_dt": true,
-		"seed": 1,
-	},
-```
-
-The JSON of `polar` type should be provided like
-
-```json
-	"fitting_net" : {
-	   	"type": "polar",
-		"sel_type": [0],
-		"neuron": [100,100,100],
-		"resnet_dt": true,
-		"seed": 1,
-	},
-```
-
--   `type` specifies which type of fitting net should be used. It should be either `dipole` or `polar`. Note that `global_polar` mode in version 1.x is already **deprecated** and is merged into `polar`. To specify whether a system is global or atomic, please see [here](train-se-e2-a.md).
--   `sel_type` is a list specifying which type of atoms have the quantity you want to fit. For example, in the water system, `sel_type` is `[0]` since `0` represents atom `O`. If left unset, all types of atoms will be fitted.
--   The rest arguments have the same meaning as they do in `ener` mode.
-
-## Loss
-
-DP supports a combinational training of the global system (only a global `tensor` label, i.e. dipole or polar, is provided in a frame) and atomic system (labels for **each** atom included in `sel_type` are provided). In a global system, each frame has just **one** `tensor` label. For example, when fitting `polar`, each frame will just provide a `1 x 9` vector which gives the elements of the polarizability tensor of that frame in order XX, XY, XZ, YX, YY, YZ, XZ, ZY, ZZ. By contrast, in an atomic system, each atom in `sel_type` has a `tensor` label. For example, when fitting a dipole, each frame will provide a `#sel_atom x 3` matrices, where `#sel_atom` is the number of atoms whose type are in `sel_type`.
-
-The {ref}`loss <loss>` section tells DP the weight of these two kinds of loss, i.e.
-
-```python
-loss = pref * global_loss + pref_atomic * atomic_loss
-```
-
-The loss section should be provided like
-
-```json
-	"loss" : {
-		"type":		"tensor",
-		"pref":		1.0,
-		"pref_atomic":	1.0
-	},
-```
-
--   {ref}`type <loss/type>` should be written as `tensor` as a distinction from `ener` mode.
--   {ref}`pref <loss[tensor]/pref>` and {ref}`pref_atomic <loss[tensor]/pref_atomic>` respectively specify the weight of global loss and atomic loss. It can not be left unset. If set to 0, the corresponding label will NOT be included in the training process.
-
-## Training Data Preparation
-
-In tensor mode, the identification of the label's type (global or atomic) is derived from the file name. The global label should be named `dipole.npy/raw` or `polarizability.npy/raw`, while the atomic label should be named `atomic_dipole.npy/raw` or `atomic_polarizability.npy/raw`. If wrongly named, DP will report an error
-
-```bash
-ValueError: cannot reshape array of size xxx into shape (xx,xx). This error may occur when your label mismatch it's name, i.e. you might store global tensor in `atomic_tensor.npy` or atomic tensor in `tensor.npy`.
-```
-
-In this case, please check the file name of the label.
-
-## Train the Model
-
-The training command is the same as `ener` mode, i.e.
-
-```bash
-dp train input.json
-```
-
-The detailed loss can be found in `lcurve.out`:
-
-```
-#  step    rmse_val   rmse_trn  rmse_lc_val rmse_lc_trn rmse_gl_val rmse_gl_trn  lr
-     0     8.34e+00   8.26e+00   8.34e+00   8.26e+00    0.00e+00    0.00e+00   1.0e-02
-   100     3.51e-02   8.55e-02   0.00e+00   8.55e-02    4.38e-03    0.00e+00   5.0e-03
-   200     4.77e-02   5.61e-02   0.00e+00   5.61e-02    5.96e-03    0.00e+00   2.5e-03
-   300     5.68e-02   1.47e-02   0.00e+00   0.00e+00    7.10e-03    1.84e-03   1.3e-03
-   400     3.73e-02   3.48e-02   1.99e-02   0.00e+00    2.18e-03    4.35e-03   6.3e-04
-   500     2.77e-02   5.82e-02   1.08e-02   5.82e-02    2.11e-03    0.00e+00   3.2e-04
-   600     2.81e-02   5.43e-02   2.01e-02   0.00e+00    1.01e-03    6.79e-03   1.6e-04
-   700     2.97e-02   3.28e-02   2.03e-02   0.00e+00    1.17e-03    4.10e-03   7.9e-05
-   800     2.25e-02   6.19e-02   9.05e-03   0.00e+00    1.68e-03    7.74e-03   4.0e-05
-   900     3.18e-02   5.54e-02   9.93e-03   5.54e-02    2.74e-03    0.00e+00   2.0e-05
-  1000     2.63e-02   5.02e-02   1.02e-02   5.02e-02    2.01e-03    0.00e+00   1.0e-05
-  1100     3.27e-02   5.89e-02   2.13e-02   5.89e-02    1.43e-03    0.00e+00   5.0e-06
-  1200     2.85e-02   2.42e-02   2.85e-02   0.00e+00    0.00e+00    3.02e-03   2.5e-06
-  1300     3.47e-02   5.71e-02   1.07e-02   5.71e-02    3.00e-03    0.00e+00   1.3e-06
-  1400     3.13e-02   5.76e-02   3.13e-02   5.76e-02    0.00e+00    0.00e+00   6.3e-07
-  1500     3.34e-02   1.11e-02   2.09e-02   0.00e+00    1.57e-03    1.39e-03   3.2e-07
-  1600     3.11e-02   5.64e-02   3.11e-02   5.64e-02    0.00e+00    0.00e+00   1.6e-07
-  1700     2.97e-02   5.05e-02   2.97e-02   5.05e-02    0.00e+00    0.00e+00   7.9e-08
-  1800     2.64e-02   7.70e-02   1.09e-02   0.00e+00    1.94e-03    9.62e-03   4.0e-08
-  1900     3.28e-02   2.56e-02   3.28e-02   0.00e+00    0.00e+00    3.20e-03   2.0e-08
-  2000     2.59e-02   5.71e-02   1.03e-02   5.71e-02    1.94e-03    0.00e+00   1.0e-08
-```
-
-One may notice that in each step, some of the local loss and global loss will be `0.0`. This is because our training data and validation data consist of the global system and atomic system, i.e.
-```
-	--training_data
-		>atomic_system
-		>global_system
-	--validation_data
-		>atomic_system
-		>global_system
-```
-During training, at each step when the `lcurve.out` is printed, the system used for evaluating the training (validation) error may be either with only global or only atomic labels, thus the corresponding atomic or global errors are missing and are printed as zeros.
+# Fit `tensor` like `Dipole` and `Polarizability`
+
+Unlike `energy`, which is a scalar, one may want to fit some high-dimensional physical quantity, like `dipole` (vector) and `polarizability` (matrix, shortened as `polar`). Deep Potential has provided different APIs to do this. In this example, we will show you how to train a model to fit a water system. A complete training input script of the examples can be found in
+
+```bash
+$deepmd_source_dir/examples/water_tensor/dipole/dipole_input.json
+$deepmd_source_dir/examples/water_tensor/polar/polar_input.json
+```
+
+The training and validation data are also provided in our examples. But note that **the data provided along with the examples are of limited amount, and should not be used to train a production model.**
+
+Similar to the `input.json` used in `ener` mode, training JSON is also divided into {ref}`model <model>`, {ref}`learning_rate <learning_rate>`, {ref}`loss <loss>` and {ref}`training <training>`. Most keywords remain the same as `ener` mode, and their meaning can be found [here](train-se-e2-a.md). To fit a tensor, one needs to modify {ref}`model/fitting_net <model/fitting_net>` and {ref}`loss <loss>`.
+
+## The fitting Network
+
+The {ref}`fitting_net <model/fitting_net>` section tells DP which fitting net to use.
+
+The JSON of `dipole` type should be provided like
+
+```json
+    "fitting_net" : {
+        "type": "dipole",
+        "sel_type": [0],
+        "neuron": [100,100,100],
+        "resnet_dt": true,
+        "seed": 1,
+    },
+```
+
+The JSON of `polar` type should be provided like
+
+```json
+    "fitting_net" : {
+           "type": "polar",
+        "sel_type": [0],
+        "neuron": [100,100,100],
+        "resnet_dt": true,
+        "seed": 1,
+    },
+```
+
+-   `type` specifies which type of fitting net should be used. It should be either `dipole` or `polar`. Note that `global_polar` mode in version 1.x is already **deprecated** and is merged into `polar`. To specify whether a system is global or atomic, please see [here](train-se-e2-a.md).
+-   `sel_type` is a list specifying which type of atoms have the quantity you want to fit. For example, in the water system, `sel_type` is `[0]` since `0` represents atom `O`. If left unset, all types of atoms will be fitted.
+-   The remaining arguments have the same meaning as they do in `ener` mode.
+
+## Loss
+
+DP supports a combinational training of the global system (only a global `tensor` label, i.e. dipole or polar, is provided in a frame) and atomic system (labels for **each** atom included in `sel_type` are provided). In a global system, each frame has just **one** `tensor` label. For example, when fitting `polar`, each frame will just provide a `1 x 9` vector which gives the elements of the polarizability tensor of that frame in order XX, XY, XZ, YX, YY, YZ, ZX, ZY, ZZ. By contrast, in an atomic system, each atom in `sel_type` has a `tensor` label. For example, when fitting a dipole, each frame will provide a `#sel_atom x 3` matrix, where `#sel_atom` is the number of atoms whose types are in `sel_type`.
+
+The {ref}`loss <loss>` section tells DP the weight of these two kinds of loss, i.e.
+
+```python
+loss = pref * global_loss + pref_atomic * atomic_loss
+```
+
+The loss section should be provided like
+
+```json
+    "loss" : {
+        "type":        "tensor",
+        "pref":        1.0,
+        "pref_atomic":    1.0
+    },
+```
+
+-   {ref}`type <loss/type>` should be written as `tensor` as a distinction from `ener` mode.
+-   {ref}`pref <loss[tensor]/pref>` and {ref}`pref_atomic <loss[tensor]/pref_atomic>` respectively specify the weight of global loss and atomic loss. They cannot be left unset. If set to 0, the corresponding label will NOT be included in the training process.
+
+## Training Data Preparation
+
+In tensor mode, the identification of the label's type (global or atomic) is derived from the file name. The global label should be named `dipole.npy/raw` or `polarizability.npy/raw`, while the atomic label should be named `atomic_dipole.npy/raw` or `atomic_polarizability.npy/raw`. If wrongly named, DP will report an error
+
+```bash
+ValueError: cannot reshape array of size xxx into shape (xx,xx). This error may occur when your label mismatch it's name, i.e. you might store global tensor in `atomic_tensor.npy` or atomic tensor in `tensor.npy`.
+```
+
+In this case, please check the file name of the label.
+
+## Train the Model
+
+The training command is the same as `ener` mode, i.e.
+
+```bash
+dp train input.json
+```
+
+The detailed loss can be found in `lcurve.out`:
+
+```
+#  step    rmse_val   rmse_trn  rmse_lc_val rmse_lc_trn rmse_gl_val rmse_gl_trn  lr
+     0     8.34e+00   8.26e+00   8.34e+00   8.26e+00    0.00e+00    0.00e+00   1.0e-02
+   100     3.51e-02   8.55e-02   0.00e+00   8.55e-02    4.38e-03    0.00e+00   5.0e-03
+   200     4.77e-02   5.61e-02   0.00e+00   5.61e-02    5.96e-03    0.00e+00   2.5e-03
+   300     5.68e-02   1.47e-02   0.00e+00   0.00e+00    7.10e-03    1.84e-03   1.3e-03
+   400     3.73e-02   3.48e-02   1.99e-02   0.00e+00    2.18e-03    4.35e-03   6.3e-04
+   500     2.77e-02   5.82e-02   1.08e-02   5.82e-02    2.11e-03    0.00e+00   3.2e-04
+   600     2.81e-02   5.43e-02   2.01e-02   0.00e+00    1.01e-03    6.79e-03   1.6e-04
+   700     2.97e-02   3.28e-02   2.03e-02   0.00e+00    1.17e-03    4.10e-03   7.9e-05
+   800     2.25e-02   6.19e-02   9.05e-03   0.00e+00    1.68e-03    7.74e-03   4.0e-05
+   900     3.18e-02   5.54e-02   9.93e-03   5.54e-02    2.74e-03    0.00e+00   2.0e-05
+  1000     2.63e-02   5.02e-02   1.02e-02   5.02e-02    2.01e-03    0.00e+00   1.0e-05
+  1100     3.27e-02   5.89e-02   2.13e-02   5.89e-02    1.43e-03    0.00e+00   5.0e-06
+  1200     2.85e-02   2.42e-02   2.85e-02   0.00e+00    0.00e+00    3.02e-03   2.5e-06
+  1300     3.47e-02   5.71e-02   1.07e-02   5.71e-02    3.00e-03    0.00e+00   1.3e-06
+  1400     3.13e-02   5.76e-02   3.13e-02   5.76e-02    0.00e+00    0.00e+00   6.3e-07
+  1500     3.34e-02   1.11e-02   2.09e-02   0.00e+00    1.57e-03    1.39e-03   3.2e-07
+  1600     3.11e-02   5.64e-02   3.11e-02   5.64e-02    0.00e+00    0.00e+00   1.6e-07
+  1700     2.97e-02   5.05e-02   2.97e-02   5.05e-02    0.00e+00    0.00e+00   7.9e-08
+  1800     2.64e-02   7.70e-02   1.09e-02   0.00e+00    1.94e-03    9.62e-03   4.0e-08
+  1900     3.28e-02   2.56e-02   3.28e-02   0.00e+00    0.00e+00    3.20e-03   2.0e-08
+  2000     2.59e-02   5.71e-02   1.03e-02   5.71e-02    1.94e-03    0.00e+00   1.0e-08
+```
+
+One may notice that in each step, some of the local loss and global loss will be `0.0`. This is because our training data and validation data consist of the global system and atomic system, i.e.
+```
+    --training_data
+        >atomic_system
+        >global_system
+    --validation_data
+        >atomic_system
+        >global_system
+```
+During training, at each step when the `lcurve.out` is printed, the system used for evaluating the training (validation) error may be either with only global or only atomic labels, thus the corresponding atomic or global errors are missing and are printed as zeros.
diff --git a/doc/model/train-hybrid.md b/doc/model/train-hybrid.md
index 37666668c7..65c52c6763 100644
--- a/doc/model/train-hybrid.md
+++ b/doc/model/train-hybrid.md
@@ -8,12 +8,12 @@ To use the descriptor in DeePMD-kit, one firstly set the {ref}`type <model/descr
             "type": "hybrid",
             "list" : [
                 {
-		    "type" : "se_e2_a",
-		    ...
+            "type" : "se_e2_a",
+            ...
                 },
                 {
-		    "type" : "se_e2_r",
-		    ...
+            "type" : "se_e2_r",
+            ...
                 }
             ]
         },
diff --git a/doc/model/train-se-a-mask.md b/doc/model/train-se-a-mask.md
index 17c211ec73..1f07ff4fd1 100644
--- a/doc/model/train-se-a-mask.md
+++ b/doc/model/train-se-a-mask.md
@@ -25,15 +25,15 @@ $deepmd_source_dir/examples/zinc_protein/zinc_se_a_mask.json
 
 The construction of the descriptor is given by section {ref}`descriptor <model/descriptor>`. An example of the descriptor is provided as follows
 ```json
-	"descriptor" :{
-	    "type":	"se_a_mask",
-	    "sel":		[36, 16, 24, 64, 6, 1],
-	    "neuron":		[25, 50, 100],
-		"axis_neuron": 16,
-	    "type_one_side":	false,
-	    "resnet_dt":	false,
-	    "seed":		1
-	}
+    "descriptor" :{
+        "type":    "se_a_mask",
+        "sel":        [36, 16, 24, 64, 6, 1],
+        "neuron":        [25, 50, 100],
+        "axis_neuron": 16,
+        "type_one_side":    false,
+        "resnet_dt":    false,
+        "seed":        1
+    }
 ```
 * The {ref}`type <model/descriptor/type>` of the descriptor is set to `"se_a_mask"`.
 * {ref}`sel <model/descriptor[se_a_mask]/sel>` gives the maximum number of atoms in input coordinates. It is a list, the length of which is the same as the number of atom types in the system, and `sel[i]` denotes the maximum number of atoms with type `i`.
@@ -45,13 +45,13 @@ The construction of the descriptor is given by section {ref}`descriptor <model/d
 
 To make the `aparam.npy` used for descriptor `se_a_mask`, two variables in `fitting_net` section are needed.
 ```json
-	"fitting_net" :{
-	    "neuron": [240, 240, 240],
-      	"resnet_dt": true,
-      	"seed": 1,
-      	"numb_aparam": 1,
-      	"use_aparam_as_mask": true
-	}
+    "fitting_net" :{
+        "neuron": [240, 240, 240],
+          "resnet_dt": true,
+          "seed": 1,
+          "numb_aparam": 1,
+          "use_aparam_as_mask": true
+    }
 ```
 * `neuron`, `resnet_dt` and `seed` are the same as the {ref}`fitting_net <model/fitting_net[ener]>` section for fitting energy.
 * {ref}`numb_aparam <model/fitting_net[ener]/numb_aparam>` gives the dimesion of the `aparam.npy` file. In this example, it is set to 1 and stores the real/virtual sign of the atoms. For real/virtual atoms, the corresponding sign in `aparam.npy` is set to 1/0.
diff --git a/doc/model/train-se-atten.md b/doc/model/train-se-atten.md
index 8b006346a9..1dfa7eae52 100644
--- a/doc/model/train-se-atten.md
+++ b/doc/model/train-se-atten.md
@@ -28,20 +28,20 @@ With the training input script, data are also provided in the example directory.
 
 An example of the DPA-1 descriptor is provided as follows
 ```json
-	"descriptor" :{
-          "type":		"se_atten",
-          "rcut_smth":	0.50,
-          "rcut":		6.00,
-          "sel":		120,
-          "neuron":		[25, 50, 100],
-          "axis_neuron":	16,
-          "resnet_dt":	false,
-          "attn":	128,
-          "attn_layer":	2,
-          "attn_mask":	false,
-          "attn_dotr":	true,
-          "seed":	1
-	}
+    "descriptor" :{
+          "type":        "se_atten",
+          "rcut_smth":    0.50,
+          "rcut":        6.00,
+          "sel":        120,
+          "neuron":        [25, 50, 100],
+          "axis_neuron":    16,
+          "resnet_dt":    false,
+          "attn":    128,
+          "attn_layer":    2,
+          "attn_mask":    false,
+          "attn_dotr":    true,
+          "seed":    1
+    }
 ```
 * The {ref}`type <model/descriptor/type>` of the descriptor is set to `"se_atten"`, which will use DPA-1 structures.
 * {ref}`rcut <model/descriptor[se_atten]/rcut>` is the cut-off radius for neighbor searching, and the {ref}`rcut_smth <model/descriptor[se_atten]/rcut_smth>` gives where the smoothing starts.
diff --git a/doc/model/train-se-e2-a-tebd.md b/doc/model/train-se-e2-a-tebd.md
index 7528202ff2..da55ef262f 100644
--- a/doc/model/train-se-e2-a-tebd.md
+++ b/doc/model/train-se-e2-a-tebd.md
@@ -8,27 +8,27 @@ The training input script is similar to that of [`se_e2_a`](train-se-e2-a.md), b
 The {ref}`model <model>` defines how the model is constructed, adding a section of type embedding net:
 ```json
     "model": {
-	"type_map":	["O", "H"],
-	"type_embedding":{
-			...
-	},
-	"descriptor" :{
+    "type_map":    ["O", "H"],
+    "type_embedding":{
             ...
-	},
-	"fitting_net" : {
+    },
+    "descriptor" :{
             ...
-	}
+    },
+    "fitting_net" : {
+            ...
+    }
     }
 ```
 The model will automatically apply the type embedding approach and generate type embedding vectors. If the type embedding vector is detected, the descriptor and fitting net would take it as a part of the input.
 
 The construction of type embedding net is given by {ref}`type_embedding <model/type_embedding>`. An example of {ref}`type_embedding <model/type_embedding>` is provided as follows
 ```json
-	"type_embedding":{
-	    "neuron":		[2, 4, 8],
-	    "resnet_dt":	false,
-	    "seed":		1
-	}
+    "type_embedding":{
+        "neuron":        [2, 4, 8],
+        "resnet_dt":    false,
+        "seed":        1
+    }
 ```
 * The {ref}`neuron <model/type_embedding/neuron>` specifies the size of the type embedding net. From left to right the members denote the sizes of each hidden layer from the input end to the output end, respectively. It takes a one-hot vector as input and output dimension equals to the last dimension of the {ref}`neuron <model/type_embedding/neuron>` list. If the outer layer is twice the size of the inner layer, then the inner layer is copied and concatenated, then a [ResNet architecture](https://arxiv.org/abs/1512.03385) is built between them.
 * If the option {ref}`resnet_dt <model/type_embedding/resnet_dt>` is set to `true`, then a timestep is used in the ResNet.
diff --git a/doc/model/train-se-e2-a.md b/doc/model/train-se-e2-a.md
index a043f64716..bad42c9a31 100644
--- a/doc/model/train-se-e2-a.md
+++ b/doc/model/train-se-e2-a.md
@@ -12,17 +12,17 @@ With the training input script, data are also provided in the example directory.
 
 The construction of the descriptor is given by section {ref}`descriptor <model/descriptor>`. An example of the descriptor is provided as follows
 ```json
-	"descriptor" :{
-	    "type":		"se_e2_a",
-	    "rcut_smth":	0.50,
-	    "rcut":		6.00,
-	    "sel":		[46, 92],
-	    "neuron":		[25, 50, 100],
-	    "type_one_side":	true,
-	    "axis_neuron":	16,
-	    "resnet_dt":	false,
-	    "seed":		1
-	}
+    "descriptor" :{
+        "type":        "se_e2_a",
+        "rcut_smth":    0.50,
+        "rcut":        6.00,
+        "sel":        [46, 92],
+        "neuron":        [25, 50, 100],
+        "type_one_side":    true,
+        "axis_neuron":    16,
+        "resnet_dt":    false,
+        "seed":        1
+    }
 ```
 * The {ref}`type <model/descriptor/type>` of the descriptor is set to `"se_e2_a"`.
 * {ref}`rcut <model/descriptor[se_e2_a]/rcut>` is the cut-off radius for neighbor searching, and the {ref}`rcut_smth <model/descriptor[se_e2_a]/rcut_smth>` gives where the smoothing starts.
diff --git a/doc/model/train-se-e2-r.md b/doc/model/train-se-e2-r.md
index f48e10c17b..ea9955c90e 100644
--- a/doc/model/train-se-e2-r.md
+++ b/doc/model/train-se-e2-r.md
@@ -9,15 +9,15 @@ $deepmd_source_dir/examples/water/se_e2_r/input.json
 
 The training input script is very similar to that of [`se_e2_a`](train-se-e2-a.md). The only difference lies in the {ref}`descriptor <model/descriptor>` section
 ```json
-	"descriptor": {
-	    "type":		"se_e2_r",
-	    "sel":		[46, 92],
-	    "rcut_smth":	0.50,
-	    "rcut":		6.00,
-	    "neuron":		[5, 10, 20],
-	    "resnet_dt":	false,
-	    "seed":		1,
-	    "_comment": " that's all"
-	},
+    "descriptor": {
+        "type":        "se_e2_r",
+        "sel":        [46, 92],
+        "rcut_smth":    0.50,
+        "rcut":        6.00,
+        "neuron":        [5, 10, 20],
+        "resnet_dt":    false,
+        "seed":        1,
+        "_comment": " that's all"
+    },
 ```
 The type of the descriptor is set by the key {ref}`type <model/descriptor/type>`.
diff --git a/doc/model/train-se-e3.md b/doc/model/train-se-e3.md
index d59f11b264..e7387005f1 100644
--- a/doc/model/train-se-e3.md
+++ b/doc/model/train-se-e3.md
@@ -9,15 +9,15 @@ $deepmd_source_dir/examples/water/se_e3/input.json
 
 The training input script is very similar to that of [`se_e2_a`](train-se-e2-a.md). The only difference lies in the `descriptor <model/descriptor>` section
 ```json
-	"descriptor": {
-	    "type":		"se_e3",
-	    "sel":		[40, 80],
-	    "rcut_smth":	0.50,
-	    "rcut":		6.00,
-	    "neuron":		[2, 4, 8],
-	    "resnet_dt":	false,
-	    "seed":		1,
-	    "_comment":		" that's all"
-	},
+    "descriptor": {
+        "type":        "se_e3",
+        "sel":        [40, 80],
+        "rcut_smth":    0.50,
+        "rcut":        6.00,
+        "neuron":        [2, 4, 8],
+        "resnet_dt":    false,
+        "seed":        1,
+        "_comment":        " that's all"
+    },
 ```
 The type of the descriptor is set by the key {ref}`type <model/descriptor/type>`.
diff --git a/doc/third-party/ipi.md b/doc/third-party/ipi.md
index 59decdf3bb..50fff4c18c 100644
--- a/doc/third-party/ipi.md
+++ b/doc/third-party/ipi.md
@@ -9,16 +9,16 @@ It is noted that multiple instances of the client allow for computing, in parall
 `water.json` is the parameter file for the client `dp_ipi`, and an example is provided:
 ```json
 {
-    "verbose":		false,
-    "use_unix":		true,
-    "port":		31415,
-    "host":		"localhost",
-    "graph_file":	"graph.pb",
-    "coord_file":	"conf.xyz",
+    "verbose":        false,
+    "use_unix":        true,
+    "port":        31415,
+    "host":        "localhost",
+    "graph_file":    "graph.pb",
+    "coord_file":    "conf.xyz",
     "atom_type" : {
-	"OW":		0,
-	"HW1":		1,
-	"HW2":		1
+        "OW":        0,
+        "HW1":        1,
+        "HW2":        1
     }
 }
 ```
diff --git a/doc/third-party/lammps-command.md b/doc/third-party/lammps-command.md
index eae377eb55..9b4f229c6d 100644
--- a/doc/third-party/lammps-command.md
+++ b/doc/third-party/lammps-command.md
@@ -123,10 +123,10 @@ dump            1 all custom 100 water.dump id type c_dipole[1] c_dipole[2] c_di
 ## Long-range interaction
 The reciprocal space part of the long-range interaction can be calculated by LAMMPS command `kspace_style`. To use it with DeePMD-kit, one writes
 ```lammps
-pair_style	deepmd graph.pb
+pair_style    deepmd graph.pb
 pair_coeff  * *
-kspace_style	pppm 1.0e-5
-kspace_modify	gewald 0.45
+kspace_style    pppm 1.0e-5
+kspace_modify    gewald 0.45
 ```
 Please notice that the DeePMD does nothing to the direct space part of the electrostatic interaction, because this part is assumed to be fitted in the DeePMD model (the direct space cut-off is thus the cut-off of the DeePMD model). The splitting parameter `gewald` is modified by the `kspace_modify` command.
 
diff --git a/doc/third-party/out-of-deepmd-kit.md b/doc/third-party/out-of-deepmd-kit.md
index 6cd5769fbf..fcf3e30451 100644
--- a/doc/third-party/out-of-deepmd-kit.md
+++ b/doc/third-party/out-of-deepmd-kit.md
@@ -1,35 +1,35 @@
-# Interfaces out of DeePMD-kit
-
-The codes of the following interfaces are not a part of the DeePMD-kit package and maintained by other repositories. We list these interfaces here for user convenience.
-
-## dpdata
-
-[dpdata](https://github.com/deepmodeling/dpdata) provides the `predict` method for `System` class:
-
-```py
-import dpdata
-dsys = dpdata.LabeledSystem('OUTCAR')
-dp_sys = dsys.predict("frozen_model_compressed.pb")
-```
-
-By inferring with the DP model `frozen_model_compressed.pb`, dpdata will generate a new labeled system `dp_sys` with inferred energies, forces, and virials.
-
-## OpenMM plugin for DeePMD-kit
-
-An [OpenMM](https://github.com/openmm/openmm) plugin is provided from [JingHuangLab/openmm_deepmd_plugin](https://github.com/JingHuangLab/openmm_deepmd_plugin), written by the [Huang Lab](http://www.compbiophysics.org/) at Westlake University.
-
-## AMBER interface to DeePMD-kit
-
-An [AMBER](https://ambermd.org/) interface to DeePMD-kit is written by the [York [Lab](https://theory.rutgers.edu/) from Rutgers University. It is open-source at [GitLab RutgersLBSR/AmberDPRc](https://gitlab.com/RutgersLBSR/AmberDPRc/). Details can be found in [this paper](https://doi.org/10.1021/acs.jctc.1c00201).
-
-## DP-GEN
-
-[DP-GEN](https://github.com/deepmodeling/dpgen) provides a workflow to generate accurate DP models by calling DeePMD-kit's command line interface (CLI) in the local or remote server. Details can be found in [this paper](https://doi.org/10.1016/j.cpc.2020.107206).
-
-## MLatom
-
-[Mlatom](http://mlatom.com/) provides an interface to the DeePMD-kit within MLatom's workflow by calling DeePMD-kit's CLI. Details can be found in [this paper](https://doi.org/10.1007/s41061-021-00339-5).
-
-## ABACUS
-
-[ABACUS](https://github.com/deepmodeling/abacus-develop/) can run molecular dynamics with a DP model. User is required to [build ABACUS with DeePMD-kit](https://abacus.deepmodeling.com/en/latest/advanced/install.html#build-with-deepmd-kit).
+# Interfaces out of DeePMD-kit
+
+The codes of the following interfaces are not a part of the DeePMD-kit package and maintained by other repositories. We list these interfaces here for user convenience.
+
+## dpdata
+
+[dpdata](https://github.com/deepmodeling/dpdata) provides the `predict` method for `System` class:
+
+```py
+import dpdata
+dsys = dpdata.LabeledSystem('OUTCAR')
+dp_sys = dsys.predict("frozen_model_compressed.pb")
+```
+
+By inferring with the DP model `frozen_model_compressed.pb`, dpdata will generate a new labeled system `dp_sys` with inferred energies, forces, and virials.
+
+## OpenMM plugin for DeePMD-kit
+
+An [OpenMM](https://github.com/openmm/openmm) plugin is provided from [JingHuangLab/openmm_deepmd_plugin](https://github.com/JingHuangLab/openmm_deepmd_plugin), written by the [Huang Lab](http://www.compbiophysics.org/) at Westlake University.
+
+## AMBER interface to DeePMD-kit
+
+An [AMBER](https://ambermd.org/) interface to DeePMD-kit is written by the [York Lab](https://theory.rutgers.edu/) from Rutgers University. It is open-source at [GitLab RutgersLBSR/AmberDPRc](https://gitlab.com/RutgersLBSR/AmberDPRc/). Details can be found in [this paper](https://doi.org/10.1021/acs.jctc.1c00201).
+
+## DP-GEN
+
+[DP-GEN](https://github.com/deepmodeling/dpgen) provides a workflow to generate accurate DP models by calling DeePMD-kit's command line interface (CLI) in the local or remote server. Details can be found in [this paper](https://doi.org/10.1016/j.cpc.2020.107206).
+
+## MLatom
+
+[Mlatom](http://mlatom.com/) provides an interface to the DeePMD-kit within MLatom's workflow by calling DeePMD-kit's CLI. Details can be found in [this paper](https://doi.org/10.1007/s41061-021-00339-5).
+
+## ABACUS
+
+[ABACUS](https://github.com/deepmodeling/abacus-develop/) can run molecular dynamics with a DP model. User is required to [build ABACUS with DeePMD-kit](https://abacus.deepmodeling.com/en/latest/advanced/install.html#build-with-deepmd-kit).
diff --git a/doc/train/tensorboard.md b/doc/train/tensorboard.md
index 4846005216..fd17ebf254 100644
--- a/doc/train/tensorboard.md
+++ b/doc/train/tensorboard.md
@@ -23,29 +23,29 @@ directory by modifying the input script, setting {ref}`tensorboard <training/ten
 
 ```json
     "training" : {
-	"systems":	["../data/"],
-	"set_prefix":	"set",
-	"stop_batch":	1000000,
-	"batch_size":	1,
-
-	"seed":		1,
-
-	"_comment": " display and restart",
-	"_comment": " frequencies counted in batch",
-	"disp_file":	"lcurve.out",
-	"disp_freq":	100,
-	"numb_test":	10,
-	"save_freq":	1000,
-	"save_ckpt":	"model.ckpt",
-
-	"disp_training":true,
-	"time_training":true,
-	"tensorboard":	true,
-	"tensorboard_log_dir":"log",
-	"tensorboard_freq": 1000,
-	"profiling":	false,
-	"profiling_file":"timeline.json",
-	"_comment":	"that's all"
+        "systems":    ["../data/"],
+        "set_prefix":    "set",
+        "stop_batch":    1000000,
+        "batch_size":    1,
+
+        "seed":        1,
+
+        "_comment": " display and restart",
+        "_comment": " frequencies counted in batch",
+        "disp_file":    "lcurve.out",
+        "disp_freq":    100,
+        "numb_test":    10,
+        "save_freq":    1000,
+        "save_ckpt":    "model.ckpt",
+
+        "disp_training":true,
+        "time_training":true,
+        "tensorboard":    true,
+        "tensorboard_log_dir":"log",
+        "tensorboard_freq": 1000,
+        "profiling":    false,
+        "profiling_file":"timeline.json",
+        "_comment":    "that's all"
     }
 ```
 
diff --git a/doc/train/training-advanced.md b/doc/train/training-advanced.md
index 39cf87d8b3..a1cbba0a35 100644
--- a/doc/train/training-advanced.md
+++ b/doc/train/training-advanced.md
@@ -7,11 +7,11 @@ In this section, we will take `$deepmd_source_dir/examples/water/se_e2_a/input.j
 The {ref}`learning_rate <learning_rate>` section in `input.json` is given as follows
 ```json
     "learning_rate" :{
-	"type":		"exp",
-	"start_lr":	0.001,
-	"stop_lr":	3.51e-8,
-	"decay_steps":	5000,
-	"_comment":	"that's all"
+        "type":        "exp",
+        "start_lr":    0.001,
+        "stop_lr":    3.51e-8,
+        "decay_steps":    5000,
+        "_comment":    "that's all"
     }
 ```
 * {ref}`start_lr <learning_rate[exp]/start_lr>` gives the learning rate at the beginning of the training.
@@ -31,25 +31,25 @@ where $t$ is the training step, $\alpha$ is the learning rate, $\alpha_0$ is the
 Other training parameters are given in the {ref}`training <training>` section.
 ```json
     "training": {
- 	"training_data": {
-	    "systems":		["../data_water/data_0/", "../data_water/data_1/", "../data_water/data_2/"],
-	    "batch_size":	"auto"
-	},
-	"validation_data":{
-	    "systems":		["../data_water/data_3"],
-	    "batch_size":	1,
-	    "numb_btch":	3
-	},
-	"mixed_precision": {
-	    "output_prec":      "float32",
-	    "compute_prec":     "float16"
-	},
-
-	"numb_steps":	1000000,
-	"seed":		1,
-	"disp_file":	"lcurve.out",
-	"disp_freq":	100,
-	"save_freq":	1000
+        "training_data": {
+            "systems":        ["../data_water/data_0/", "../data_water/data_1/", "../data_water/data_2/"],
+            "batch_size":    "auto"
+        },
+        "validation_data":{
+            "systems":        ["../data_water/data_3"],
+            "batch_size":    1,
+            "numb_btch":    3
+        },
+        "mixed_precision": {
+            "output_prec":      "float32",
+            "compute_prec":     "float16"
+        },
+
+        "numb_steps":    1000000,
+        "seed":        1,
+        "disp_file":    "lcurve.out",
+        "disp_freq":    100,
+        "save_freq":    1000
     }
 ```
 The sections {ref}`training_data <training/training_data>` and {ref}`validation_data <training/validation_data>` give the training dataset and validation dataset, respectively. Taking the training dataset for example, the keys are explained below:
@@ -62,19 +62,19 @@ The sections {ref}`training_data <training/training_data>` and {ref}`validation_
     * `"prob_sys_size; sidx_0:eidx_0:w_0; sidx_1:eidx_1:w_1;..."` the `list` of systems is divided into blocks. Block `i` has systems ranging from `sidx_i` to `eidx_i`. The probability of using a system from block `i` is proportional to `w_i`. Within one block, the probability of using a system is proportional to its size.
 * An example of using `"auto_prob"` is given below. The probability of using `systems[2]` is 0.4, and the sum of the probabilities of using `systems[0]` and `systems[1]` is 0.6. If the number of frames in `systems[1]` is twice of `system[0]`, then the probability of using `system[1]` is 0.4 and that of `system[0]` is 0.2.
 ```json
- 	"training_data": {
-	    "systems":		["../data_water/data_0/", "../data_water/data_1/", "../data_water/data_2/"],
-	    "auto_prob":	"prob_sys_size; 0:2:0.6; 2:3:0.4",
-	    "batch_size":	"auto"
-	}
+    "training_data": {
+        "systems":        ["../data_water/data_0/", "../data_water/data_1/", "../data_water/data_2/"],
+        "auto_prob":    "prob_sys_size; 0:2:0.6; 2:3:0.4",
+        "batch_size":    "auto"
+    }
 ```
 * The probability of using systems can also be specified explicitly with key {ref}`sys_probs <training/training_data/sys_probs>` which is a list having the length of the number of systems. For example
 ```json
- 	"training_data": {
-	    "systems":		["../data_water/data_0/", "../data_water/data_1/", "../data_water/data_2/"],
-	    "sys_probs":	[0.5, 0.3, 0.2],
-	    "batch_size":	"auto:32"
-	}
+    "training_data": {
+        "systems":        ["../data_water/data_0/", "../data_water/data_1/", "../data_water/data_2/"],
+        "sys_probs":    [0.5, 0.3, 0.2],
+        "batch_size":    "auto:32"
+    }
 ```
 * The key {ref}`batch_size <training/training_data/batch_size>` specifies the number of frames used to train or validate the model in a training step. It can be set to
     * `list`: the length of which is the same as the {ref}`systems`. The batch size of each system is given by the elements of the list.
@@ -145,15 +145,15 @@ One can set other environmental variables:
 One can use `--init-frz-model` features to adjust (increase or decrease) [`sel`](../model/sel.md) of a existing model. Firstly, one needs to adjust [`sel`](./train-input.rst) in `input.json`. For example, adjust from `[46, 92]` to `[23, 46]`.
 ```json
 "model": {
-	"descriptor": {
-		"sel": [23, 46]
-	}
+    "descriptor": {
+        "sel": [23, 46]
+    }
 }
 ```
 To obtain the new model at once, [`numb_steps`](./train-input.rst) should be set to zero:
 ```json
 "training": {
-	"numb_steps": 0
+    "numb_steps": 0
 }
 ```
 
diff --git a/examples/water/lmp/in.lammps b/examples/water/lmp/in.lammps
index ea3b5d52cd..5883016634 100644
--- a/examples/water/lmp/in.lammps
+++ b/examples/water/lmp/in.lammps
@@ -12,7 +12,7 @@ mass 		1 16
 mass		2 2
 
 # See https://deepmd.rtfd.io/lammps/ for usage
-pair_style	deepmd frozen_model.pb
+pair_style	deepmd Model_1000000_with_buffer
 # If atom names (O H in this example) are not set in the pair_coeff command, the type_map defined by the training parameter will be used by default.
 pair_coeff  * *	O H
 
diff --git a/examples/water/lmp/model.pb b/examples/water/lmp/model.pb
new file mode 100644
index 0000000000..fa246dffba
Binary files /dev/null and b/examples/water/lmp/model.pb differ
diff --git a/source/CMakeLists.txt b/source/CMakeLists.txt
index ccf9641795..b53e2bbe8b 100644
--- a/source/CMakeLists.txt
+++ b/source/CMakeLists.txt
@@ -2,6 +2,125 @@
 cmake_minimum_required(VERSION 3.16)
 project(DeePMD)
 
+macro(safe_set_static_flag) # rewrite "/MD" to "/MT" in the C++ flag variables
+  foreach(cxx_flags_name
+          CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE
+          CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO)
+    if(${cxx_flags_name} MATCHES "/MD")
+      string(REPLACE "/MD" "/MT" ${cxx_flags_name} "${${cxx_flags_name}}")
+    endif()
+  endforeach()
+endmacro()
+
+if(NOT DEFINED PADDLE_LIB) # configuration aborts unless PADDLE_LIB is given
+  message(
+    FATAL_ERROR "please set PADDLE_LIB with -DPADDLE_LIB=/path/paddle/lib")
+endif()
+set(PADDLE_LIB
+    ${PADDLE_LIB}
+    CACHE PATH "/path/paddle/lib")
+
+include_directories("${PADDLE_LIB}/")
+set(PADDLE_LIB_THIRD_PARTY_PATH "${PADDLE_LIB}/third_party/install/")
+
+include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}protobuf/include")
+include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}glog/include")
+include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}gflags/include")
+include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}xxhash/include")
+
+link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}protobuf/lib")
+link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}glog/lib")
+link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}gflags/lib")
+link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}xxhash/lib")
+link_directories("${PADDLE_LIB}/paddle/lib")
+
+# TensorRT toggle; used below to add TensorRT include/link paths and libraries
+option(USE_TENSORRT "Compile demo with TensorRT." OFF)
+
+if(WITH_GPU) # choose the directory the CUDA libraries are linked from
+  if(NOT WIN32)
+    set(CUDA_LIB
+        "/usr/local/cuda/lib64/"
+        CACHE STRING "CUDA Library")
+  else()
+    if("${CUDA_LIB}" STREQUAL "") # also true when CUDA_LIB was never defined
+      set(CUDA_LIB
+          "C:\\Program\ Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v8.0\\lib\\x64"
+      )
+    endif()
+  endif(NOT WIN32)
+endif()
+
+if(NOT WIN32) # TensorRT search paths are only added on non-Windows GPU builds
+  if(USE_TENSORRT AND WITH_GPU)
+    include_directories("${TENSORRT_INCLUDE_DIR}")
+    link_directories("${TENSORRT_LIB_DIR}")
+  endif()
+endif(NOT WIN32)
+
+if(WITH_STATIC_LIB) # DEPS starts with the libpaddle_inference file to link
+  set(DEPS
+      ${PADDLE_LIB}/paddle/lib/libpaddle_inference${CMAKE_STATIC_LIBRARY_SUFFIX}
+  )
+else()
+  if(WIN32) # Windows uses the static-library suffix even in this branch
+    set(DEPS
+        ${PADDLE_LIB}/paddle/lib/libpaddle_inference${CMAKE_STATIC_LIBRARY_SUFFIX}
+    )
+  else()
+    set(DEPS
+        ${PADDLE_LIB}/paddle/lib/libpaddle_inference${CMAKE_SHARED_LIBRARY_SUFFIX}
+    )
+  endif()
+endif()
+
+if(NOT WIN32) # append the third-party and system libraries to DEPS
+  set(EXTERNAL_LIB "-lrt -ldl -lpthread")
+  set(DEPS
+      ${DEPS}
+      ${MATH_LIB}
+      ${MKLDNN_LIB}
+      glog
+      gflags
+      protobuf
+      xxhash
+      ${EXTERNAL_LIB})
+else() # Windows: different gflags/protobuf library names, plus shlwapi.lib
+  set(DEPS
+      ${DEPS}
+      ${MATH_LIB}
+      ${MKLDNN_LIB}
+      glog
+      gflags_static
+      libprotobuf
+      xxhash
+      ${EXTERNAL_LIB})
+  set(DEPS ${DEPS} shlwapi.lib)
+endif(NOT WIN32)
+
+if(WITH_GPU) # append the CUDA (and, if enabled, TensorRT) libraries to DEPS
+  if(NOT WIN32)
+    if(USE_TENSORRT)
+      set(DEPS ${DEPS}
+               ${TENSORRT_LIB_DIR}/libnvinfer${CMAKE_SHARED_LIBRARY_SUFFIX})
+      set(DEPS
+          ${DEPS}
+          ${TENSORRT_LIB_DIR}/libnvinfer_plugin${CMAKE_SHARED_LIBRARY_SUFFIX})
+    endif()
+    set(DEPS ${DEPS} ${CUDA_LIB}/libcudart${CMAKE_SHARED_LIBRARY_SUFFIX})
+  else() # Windows: static-suffix files; cublas and cudnn are linked as well
+    if(USE_TENSORRT)
+      set(DEPS ${DEPS}
+               ${TENSORRT_LIB_DIR}/nvinfer${CMAKE_STATIC_LIBRARY_SUFFIX})
+      set(DEPS ${DEPS}
+               ${TENSORRT_LIB_DIR}/nvinfer_plugin${CMAKE_STATIC_LIBRARY_SUFFIX})
+    endif()
+    set(DEPS ${DEPS} ${CUDA_LIB}/cudart${CMAKE_STATIC_LIBRARY_SUFFIX})
+    set(DEPS ${DEPS} ${CUDA_LIB}/cublas${CMAKE_STATIC_LIBRARY_SUFFIX})
+    set(DEPS ${DEPS} ${CUDA_LIB}/cudnn${CMAKE_STATIC_LIBRARY_SUFFIX})
+  endif()
+endif()
+
 option(BUILD_TESTING "Build test and enable converage" OFF)
 set(DEEPMD_C_ROOT
     ""
@@ -175,6 +294,7 @@ if(BUILD_CPP_IF)
   set(LIB_DEEPMD_CC "deepmd_cc")
   set(LIB_DEEPMD_C "deepmd_c")
   if(USE_CUDA_TOOLKIT)
+    set(LIB_DEEPMD_OP_DEVICE "deepmd_paddle_op_cuda") # NOTE(review): overwritten by the next set(); confirm which name is intended
     set(LIB_DEEPMD_OP_DEVICE "deepmd_op_cuda")
   elseif(USE_ROCM_TOOLKIT)
     set(LIB_DEEPMD_OP_DEVICE "deepmd_op_rocm")
@@ -260,6 +380,24 @@ if(BUILD_CPP_IF)
   endif()
 endif(BUILD_CPP_IF)
 
+# if(WIN32) if(USE_TENSORRT) add_custom_command(TARGET ${DEMO_NAME} POST_BUILD
+# COMMAND ${CMAKE_COMMAND} -E copy
+# ${TENSORRT_LIB_DIR}/nvinfer${CMAKE_SHARED_LIBRARY_SUFFIX}
+# ${CMAKE_BINARY_DIR}/${CMAKE_BUILD_TYPE} COMMAND ${CMAKE_COMMAND} -E copy
+# ${TENSORRT_LIB_DIR}/nvinfer_plugin${CMAKE_SHARED_LIBRARY_SUFFIX}
+# ${CMAKE_BINARY_DIR}/${CMAKE_BUILD_TYPE} ) endif() if(WITH_MKL)
+# add_custom_command(TARGET ${DEMO_NAME} POST_BUILD COMMAND ${CMAKE_COMMAND} -E
+# copy ${MATH_LIB_PATH}/lib/mklml.dll ${CMAKE_BINARY_DIR}/Release COMMAND
+# ${CMAKE_COMMAND} -E copy ${MATH_LIB_PATH}/lib/libiomp5md.dll
+# ${CMAKE_BINARY_DIR}/Release COMMAND ${CMAKE_COMMAND} -E copy
+# ${MKLDNN_PATH}/lib/mkldnn.dll  ${CMAKE_BINARY_DIR}/Release ) else()
+# add_custom_command(TARGET ${DEMO_NAME} POST_BUILD COMMAND ${CMAKE_COMMAND} -E
+# copy ${OPENBLAS_LIB_PATH}/lib/openblas.dll ${CMAKE_BINARY_DIR}/Release )
+# endif() if(NOT WITH_STATIC_LIB) add_custom_command(TARGET ${DEMO_NAME}
+# POST_BUILD COMMAND ${CMAKE_COMMAND} -E copy
+# "${PADDLE_LIB}/paddle/lib/paddle_fluid.dll"
+# ${CMAKE_BINARY_DIR}/${CMAKE_BUILD_TYPE} ) endif() endif()
+
 # uninstall target
 configure_file(
   "${CMAKE_CURRENT_SOURCE_DIR}/cmake/cmake_uninstall.cmake.in"
diff --git a/source/api_cc/include/DeepPot.h b/source/api_cc/include/DeepPot.h
index 1c440e2668..a5bc0d8aa8 100644
--- a/source/api_cc/include/DeepPot.h
+++ b/source/api_cc/include/DeepPot.h
@@ -291,12 +291,18 @@ class DeepPot {
   void get_type_map(std::string& type_map);
 
  private:
+  std::shared_ptr<paddle_infer::Predictor> predictor = nullptr;
+  paddle_infer::Config config;
+  int math_lib_num_threads;
   tensorflow::Session* session;
   int num_intra_nthreads, num_inter_nthreads;
   tensorflow::GraphDef* graph_def;
   bool inited;
   template <class VT>
   VT get_scalar(const std::string& name) const;
+  template <class VT>
+  VT paddle_get_scalar(const std::string& name) const;
+
   // VALUETYPE get_rcut () const;
   // int get_ntypes () const;
   double rcut;
diff --git a/source/api_cc/include/common.h b/source/api_cc/include/common.h
index bed8e97e82..8d9a8ee2b1 100644
--- a/source/api_cc/include/common.h
+++ b/source/api_cc/include/common.h
@@ -14,6 +14,7 @@
 #else
 #include "tf_public.h"
 #endif
+#include "paddle/include/paddle_inference_api.h"
 
 namespace deepmd {
 
@@ -191,6 +192,17 @@ VT session_get_scalar(tensorflow::Session* session,
                       const std::string name,
                       const std::string scope = "");
 
+/**
+ * @brief Get the value of a tensor.
+ * @param[in] predictor Paddle inference predictor.
+ * @param[in] name_ The name of the tensor.
+ * @return The value of the tensor.
+ **/
+template <typename VT>
+VT predictor_get_scalar(
+    const std::shared_ptr<paddle_infer::Predictor>& predictor,
+    const std::string name_);
+
 /**
  * @brief Get the vector of a tensor.
  * @param[out] o_vec The output vector.
@@ -215,6 +227,16 @@ int session_get_dtype(tensorflow::Session* session,
                       const std::string name,
                       const std::string scope = "");
 
+/**
+ * @brief Get the type of a tensor.
+ * @param[in] predictor Paddle inference predictor.
+ * @param[in] name_ The name of the tensor.
+ * @return The type of the tensor.
+ **/
+paddle_infer::DataType predictor_get_dtype(
+    const std::shared_ptr<paddle_infer::Predictor>& predictor,
+    const std::string& name_);
+
 /**
  * @brief Get input tensors.
  * @param[out] input_tensors Input tensors.
@@ -270,6 +292,36 @@ int session_input_tensors(
     const int ago,
     const std::string scope = "");
 
+/**
+ * @brief Send input data into paddle tensor handles.
+ * @param[in] predictor The paddle predictor pointer.
+ * @param[in] dcoord_ Coordinates of atoms.
+ * @param[in] ntypes Number of atom types.
+ * @param[in] datype_ Atom types.
+ * @param[in] dbox Simulation box (NOTE(review): confirm expected layout).
+ * @param[in] dlist Neighbor list.
+ * @param[in] fparam_ Frame parameters.
+ * @param[in] aparam_ Atom parameters.
+ * @param[in] atommap Atom map.
+ * @param[in] nghost Number of ghost atoms.
+ * @param[in] ago Update the internal neighbour list if ago is 0.
+ * @param[in] scope The scope of the tensors.
+ */
+template <typename MODELTYPE, typename VALUETYPE>
+int predictor_input_tensors(
+    const std::shared_ptr<paddle_infer::Predictor>& predictor,
+    const std::vector<VALUETYPE>& dcoord_,
+    const int& ntypes,
+    const std::vector<int>& datype_,
+    const std::vector<VALUETYPE>& dbox,
+    InputNlist& dlist,
+    const std::vector<VALUETYPE>& fparam_,
+    const std::vector<VALUETYPE>& aparam_,
+    const deepmd::AtomMap& atommap,
+    const int nghost,
+    const int ago,
+    const std::string scope = "");
+
 /**
  * @brief Get input tensors for mixed type.
  * @param[out] input_tensors Input tensors.
diff --git a/source/api_cc/src/DeepPot.cc b/source/api_cc/src/DeepPot.cc
index d8f0d8a8fe..c76d339f5b 100644
--- a/source/api_cc/src/DeepPot.cc
+++ b/source/api_cc/src/DeepPot.cc
@@ -4,6 +4,8 @@
 
 #include "AtomMap.h"
 #include "device.h"
+#include "paddle/include/paddle_inference_api.h"
+// #include "glog/logging.h"
 
 using namespace tensorflow;
 using namespace deepmd;
@@ -115,6 +117,7 @@ template void run_model<float, float>(
     const int nframes,
     const int nghost);
 
+/* The function below receives the forwarded arguments and actually runs the computation */
 template <typename MODELTYPE, typename VALUETYPE>
 static void run_model(
     std::vector<ENERGYTYPE>& dener,
@@ -215,6 +218,132 @@ static void run_model(
                               nframes, nall);
 }
 
+/**
+ * @brief Run the Paddle predictor and unpack its outputs (multiple frames).
+ *
+ * Outputs are read by their position in GetOutputNames(): 0 = atom energy,
+ * 1 = atom virial, 4 = energy, 5 = force. Outputs 2 (atype), 3 (coord) and
+ * 6 (virial) are not read; the frame virial is the sum of the atom virials.
+ *
+ * @param[out] dener Energy of each frame (nframes).
+ * @param[out] dforce_ Forces (nframes * nall * 3) after atommap.backward().
+ * @param[out] dvirial Virial of each frame (nframes * 9).
+ * @param[out] datom_energy_ Atomic energies (nframes * nall) after
+ * atommap.backward().
+ * @param[out] datom_virial_ Atomic virials (nframes * nall * 9) after
+ * atommap.backward().
+ * @param[in] predictor Paddle inference predictor; Run() is called on it.
+ * @param[in] atommap Atom map applied backward to the per-atom outputs.
+ * @param[in] nframes Number of frames.
+ * @param[in] nghost Number of ghost atoms (nall = nloc + nghost).
+ * @throws deepmd::deepmd_exception if inference fails, or an output tensor
+ * is missing or holds fewer elements than the sizes above require.
+ **/
+template <typename MODELTYPE, typename VALUETYPE>
+static void paddle_run_model(
+    std::vector<ENERGYTYPE>& dener,
+    std::vector<VALUETYPE>& dforce_,
+    std::vector<VALUETYPE>& dvirial,
+    std::vector<VALUETYPE>& datom_energy_,
+    std::vector<VALUETYPE>& datom_virial_,
+    const std::shared_ptr<paddle_infer::Predictor>& predictor,
+    const deepmd::AtomMap& atommap,
+    const int& nframes,
+    const int& nghost = 0) {
+  const unsigned nloc = atommap.get_type().size();
+  const unsigned nall = nloc + nghost;
+  const size_t nforce = static_cast<size_t>(nframes) * nall * 3;
+  const size_t natom_ener = static_cast<size_t>(nframes) * nall;
+  const size_t natom_virial = static_cast<size_t>(nframes) * nall * 9;
+  const size_t nvirial = static_cast<size_t>(nframes) * 9;
+  if (nloc == 0) {
+    // Nothing to evaluate and no backward map needed: every output is zero.
+    dener.assign(nframes, (ENERGYTYPE)0.0);
+    dforce_.assign(nforce, (VALUETYPE)0.0);
+    dvirial.assign(nvirial, (VALUETYPE)0.0);
+    datom_energy_.assign(natom_ener, (VALUETYPE)0.0);
+    datom_virial_.assign(natom_virial, (VALUETYPE)0.0);
+    return;
+  }
+  dener.resize(nframes);
+
+  /* Running inference */
+  if (!predictor->Run()) {
+    throw deepmd::deepmd_exception("Paddle inference failed");
+  }
+
+  // Outputs are addressed by position, so all seven must be present.
+  const auto output_names = predictor->GetOutputNames();
+  if (output_names.size() < 7) {
+    throw deepmd::deepmd_exception(
+        "Paddle model does not provide the expected 7 outputs");
+  }
+
+  // Copy output `idx` to the host in the model precision and check that it
+  // holds at least `min_size` elements before anything indexes into it.
+  // NOTE(review): assumes MODELTYPE is the dtype of the output tensors --
+  // confirm at the call sites.
+  const auto fetch = [&](const size_t idx, const size_t min_size,
+                         const std::string& what) -> std::vector<MODELTYPE> {
+    const auto tensor = predictor->GetOutputHandle(output_names[idx]);
+    const std::vector<int> shape = tensor->shape();
+    const int numel = std::accumulate(shape.begin(), shape.end(), 1,
+                                      std::multiplies<int>());
+    if (numel < 0 || static_cast<size_t>(numel) < min_size) {
+      throw deepmd::deepmd_exception("Paddle output " + what +
+                                     " is smaller than expected");
+    }
+    std::vector<MODELTYPE> host(numel);
+    tensor->CopyToCpu(host.data());
+    return host;
+  };
+  const auto atom_ener =
+      fetch(0, static_cast<size_t>(nframes) * nloc, "atom_ener");
+  const auto atom_virial = fetch(1, natom_virial, "atom_virial");
+  const auto energy = fetch(4, nframes, "energy");
+  const auto force = fetch(5, nforce, "force");
+
+  for (int ii = 0; ii < nframes; ++ii) {
+    dener[ii] = energy[ii];
+  }
+  std::vector<VALUETYPE> dforce(nforce);
+  for (size_t ii = 0; ii < nforce; ++ii) {
+    dforce[ii] = static_cast<VALUETYPE>(force[ii]);
+  }
+  // Only nloc atom energies per frame are read from the model output; the
+  // remaining nall - nloc slots of each frame stay zero.
+  std::vector<VALUETYPE> datom_energy(natom_ener, 0);
+  for (int ii = 0; ii < nframes; ++ii) {
+    for (unsigned jj = 0; jj < nloc; ++jj) {
+      datom_energy[ii * nall + jj] =
+          static_cast<VALUETYPE>(atom_ener[ii * nloc + jj]);
+    }
+  }
+  std::vector<VALUETYPE> datom_virial(natom_virial);
+  for (size_t ii = 0; ii < natom_virial; ++ii) {
+    datom_virial[ii] = static_cast<VALUETYPE>(atom_virial[ii]);
+  }
+  // Reset dvirial first so stale caller data cannot leak in (#1123), then
+  // sum each of the nine components over all atoms of the frame.
+  dvirial.assign(nvirial, (VALUETYPE)0.);
+  for (int kk = 0; kk < nframes; ++kk) {
+    for (unsigned ii = 0; ii < nall; ++ii) {
+      for (int dd = 0; dd < 9; ++dd) {
+        dvirial[kk * 9 + dd] += datom_virial[(kk * nall + ii) * 9 + dd];
+      }
+    }
+  }
+  dforce_ = dforce;
+  datom_energy_ = datom_energy;
+  datom_virial_ = datom_virial;
+  atommap.backward<VALUETYPE>(dforce_.begin(), dforce.begin(), 3, nframes,
+                              nall);
+  atommap.backward<VALUETYPE>(datom_energy_.begin(), datom_energy.begin(), 1,
+                              nframes, nall);
+  atommap.backward<VALUETYPE>(datom_virial_.begin(), datom_virial.begin(), 9,
+                              nframes, nall);
+}
+
 template void run_model<double, double>(
     std::vector<ENERGYTYPE>& dener,
     std::vector<double>& dforce_,
@@ -263,6 +426,51 @@ template void run_model<float, float>(
     const int& nframes,
     const int& nghost);
 
+/* Explicit instantiations of the multi-frame paddle_run_model */
+template void paddle_run_model<double, double>(
+    std::vector<ENERGYTYPE>& dener,
+    std::vector<double>& dforce_,
+    std::vector<double>& dvirial,
+    std::vector<double>& datom_energy_,
+    std::vector<double>& datom_virial_,
+    const std::shared_ptr<paddle_infer::Predictor>& predictor,
+    const deepmd::AtomMap& atommap,
+    const int& nframes,
+    const int& nghost);
+
+template void paddle_run_model<double, float>(
+    std::vector<ENERGYTYPE>& dener,
+    std::vector<float>& dforce_,
+    std::vector<float>& dvirial,
+    std::vector<float>& datom_energy_,
+    std::vector<float>& datom_virial_,
+    const std::shared_ptr<paddle_infer::Predictor>& predictor,
+    const deepmd::AtomMap& atommap,
+    const int& nframes,
+    const int& nghost);
+
+template void paddle_run_model<float, double>(
+    std::vector<ENERGYTYPE>& dener,
+    std::vector<double>& dforce_,
+    std::vector<double>& dvirial,
+    std::vector<double>& datom_energy_,
+    std::vector<double>& datom_virial_,
+    const std::shared_ptr<paddle_infer::Predictor>& predictor,
+    const deepmd::AtomMap& atommap,
+    const int& nframes,
+    const int& nghost);
+
+template void paddle_run_model<float, float>(
+    std::vector<ENERGYTYPE>& dener,
+    std::vector<float>& dforce_,
+    std::vector<float>& dvirial,
+    std::vector<float>& datom_energy_,
+    std::vector<float>& datom_virial_,
+    const std::shared_ptr<paddle_infer::Predictor>& predictor,
+    const deepmd::AtomMap& atommap,
+    const int& nframes,
+    const int& nghost);
+
 // end multiple frames
 
 // start single frame
@@ -325,6 +533,7 @@ template void run_model<float, float>(
     const int nframes,
     const int nghost);
 
+/*Forwarding function of tensorflow*/
 template <typename MODELTYPE, typename VALUETYPE>
 static void run_model(
     ENERGYTYPE& dener,
@@ -346,6 +555,27 @@ static void run_model(
   dener = dener_[0];
 }
 
+/* Single-frame Paddle entry point: forwards to the multi-frame overload */
+template <typename MODELTYPE, typename VALUETYPE>
+static void paddle_run_model(
+    ENERGYTYPE& dener,
+    std::vector<VALUETYPE>& dforce_,
+    std::vector<VALUETYPE>& dvirial,
+    std::vector<VALUETYPE>& datom_energy_,
+    std::vector<VALUETYPE>& datom_virial_,
+    const std::shared_ptr<paddle_infer::Predictor>& predictor,
+    const deepmd::AtomMap& atommap,
+    const int& nframes = 1,
+    const int& nghost = 0) {
+  assert(nframes == 1);
+  // Delegate to the multi-frame overload and keep its first energy entry.
+  std::vector<ENERGYTYPE> energies(1);
+  paddle_run_model<MODELTYPE, VALUETYPE>(
+      energies, dforce_, dvirial, datom_energy_, datom_virial_, predictor,
+      atommap, nframes, nghost);
+  dener = energies.front();
+}
+
 template void run_model<double, double>(
     ENERGYTYPE& dener,
     std::vector<double>& dforce_,
@@ -394,6 +624,51 @@ template void run_model<float, float>(
     const int& nframes,
     const int& nghost);
 
+/* Explicit instantiations of the single-frame paddle_run_model */
+template void paddle_run_model<double, double>(
+    ENERGYTYPE& dener,
+    std::vector<double>& dforce_,
+    std::vector<double>& dvirial,
+    std::vector<double>& datom_energy_,
+    std::vector<double>& datom_virial_,
+    const std::shared_ptr<paddle_infer::Predictor>& predictor,
+    const deepmd::AtomMap& atommap,
+    const int& nframes,
+    const int& nghost);
+
+template void paddle_run_model<double, float>(
+    ENERGYTYPE& dener,
+    std::vector<float>& dforce_,
+    std::vector<float>& dvirial,
+    std::vector<float>& datom_energy_,
+    std::vector<float>& datom_virial_,
+    const std::shared_ptr<paddle_infer::Predictor>& predictor,
+    const deepmd::AtomMap& atommap,
+    const int& nframes,
+    const int& nghost);
+
+template void paddle_run_model<float, double>(
+    ENERGYTYPE& dener,
+    std::vector<double>& dforce_,
+    std::vector<double>& dvirial,
+    std::vector<double>& datom_energy_,
+    std::vector<double>& datom_virial_,
+    const std::shared_ptr<paddle_infer::Predictor>& predictor,
+    const deepmd::AtomMap& atommap,
+    const int& nframes,
+    const int& nghost);
+
+template void paddle_run_model<float, float>(
+    ENERGYTYPE& dener,
+    std::vector<float>& dforce_,
+    std::vector<float>& dvirial,
+    std::vector<float>& datom_energy_,
+    std::vector<float>& datom_virial_,
+    const std::shared_ptr<paddle_infer::Predictor>& predictor,
+    const deepmd::AtomMap& atommap,
+    const int& nframes,
+    const int& nghost);
+
 // end single frame
 
 DeepPot::DeepPot()
@@ -417,66 +692,90 @@ void DeepPot::init(const std::string& model,
               << std::endl;
     return;
   }
-  SessionOptions options;
-  get_env_nthreads(num_intra_nthreads, num_inter_nthreads);
-  options.config.set_inter_op_parallelism_threads(num_inter_nthreads);
-  options.config.set_intra_op_parallelism_threads(num_intra_nthreads);
-  deepmd::load_op_library();
-
-  if (file_content.size() == 0)
-    check_status(ReadBinaryProto(Env::Default(), model, graph_def));
-  else
-    (*graph_def).ParseFromString(file_content);
+  // TensorFlow graphs (*.pb) cannot be loaded by the Paddle backend.
+  if (model.find(".pb") != std::string::npos) {
+    throw deepmd::deepmd_exception(
+        "[Error] Not found any inference model in " + model);
+  }
+  // `model` is the common prefix of the two Paddle inference files.
+  const std::string pdmodel_path = model + ".pdmodel";
+  const std::string pdiparams_path = model + ".pdiparams";
+  bool use_paddle_inference = true;
+  math_lib_num_threads = 1;
+
+  if (use_paddle_inference) {
+    config.SetModel(pdmodel_path, pdiparams_path);
+    config.SwitchIrOptim(true);
+    // NOTE(review): 8192 MB initial pool and GPU id 0 are hard-coded here.
+    config.EnableUseGpu(8192, 0);
+    // std::cout << "IR Optim is: " << config.ir_optim() << std::endl;
+    // config.EnableMKLDNN();
+    config.EnableMemoryOptim();
+    // config.EnableProfile();
+    predictor = paddle_infer::CreatePredictor(config);
+  }
   int gpu_num = -1;
 #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
   DPGetDeviceCount(gpu_num);  // check current device environment
-  if (gpu_num > 0) {
-    options.config.set_allow_soft_placement(true);
-    options.config.mutable_gpu_options()->set_per_process_gpu_memory_fraction(
-        0.9);
-    options.config.mutable_gpu_options()->set_allow_growth(true);
-    DPErrcheck(DPSetDevice(gpu_rank % gpu_num));
-    std::string str = "/gpu:";
-    str += std::to_string(gpu_rank % gpu_num);
-    graph::SetDefaultDevice(str, graph_def);
-  }
-#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
-  check_status(NewSession(options, &session));
-  check_status(session->Create(*graph_def));
-  try {
-    model_version = get_scalar<STRINGTYPE>("model_attr/model_version");
-  } catch (deepmd::tf_exception& e) {
-    // no model version defined in old models
-    model_version = "0.0";
-  }
-  if (!model_compatable(model_version)) {
-    throw deepmd::deepmd_exception(
-        "incompatable model: version " + model_version +
-        " in graph, but version " + global_model_version +
-        " supported "
-        "See https://deepmd.rtfd.io/compatability/ for details.");
-  }
-  dtype = session_get_dtype(session, "descrpt_attr/rcut");
-  if (dtype == tensorflow::DT_DOUBLE) {
-    rcut = get_scalar<double>("descrpt_attr/rcut");
-  } else {
-    rcut = get_scalar<float>("descrpt_attr/rcut");
-  }
-  cell_size = rcut;
-  ntypes = get_scalar<int>("descrpt_attr/ntypes");
-  try {
-    ntypes_spin = get_scalar<int>("spin_attr/ntypes_spin");
-  } catch (deepmd::deepmd_exception) {
+#endif                        // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
+  if (use_paddle_inference) {
+    /*
+    Data type enum values, for reference:
+      tensorflow::DT_DOUBLE = 2
+      tensorflow::DT_FLOAT = 1
+      paddle_infer::DataType::FLOAT64 = 7
+      paddle_infer::DataType::FLOAT32 = 0
+
+    Names generated in the static model for each "[buffer, shape]":
+      [buffer_t_type, [3]]             -> generated_tensor_12
+      [buffer_t_mt, [4]]               -> generated_tensor_13
+      [buffer_t_ver, [1]]              -> generated_tensor_14
+      [descrpt.buffer_rcut, []]        -> generated_tensor_0
+      [descrpt.buffer_ntypes_spin, []] -> generated_tensor_1
+      [descrpt.buffer_ntypes, []]      -> generated_tensor_2
+      [descrpt.avg_zero, [2, 552]]     -> eager_tmp_0
+      [descrpt.std_ones, [2, 552]]     -> eager_tmp_1
+      [descrpt.t_rcut, []]             -> generated_tensor_3
+      [descrpt.t_ntypes, []]           -> generated_tensor_4
+      [descrpt.t_ndescrpt, []]         -> generated_tensor_5
+      [descrpt.t_sel, [2]]             -> generated_tensor_6
+      [descrpt.t_avg, [2, 552]]        -> generated_tensor_7
+      [descrpt.t_std, [2, 552]]        -> generated_tensor_8
+      [fitting.buffer_dfparam, []]     -> generated_tensor_9
+      [fitting.buffer_daparam, []]     -> generated_tensor_10
+
+    NOTE(review): these names appear to depend on the exported model (the
+    shapes above look specific to one export) -- confirm before reuse.
+    */
+    model_version = paddle_get_scalar<std::string>("generated_tensor_14");
+    dtype = predictor_get_dtype(predictor, "generated_tensor_0");
+    if (dtype == paddle_infer::DataType::FLOAT64) {
+      rcut = paddle_get_scalar<double>("generated_tensor_0");
+    } else {
+      rcut = paddle_get_scalar<float>("generated_tensor_0");
+    }
+    // Keep cell_size in sync with the cutoff (the replaced TF path did this).
+    cell_size = rcut;
+    ntypes = paddle_get_scalar<int32_t>("generated_tensor_2");
+    // ntypes_spin = paddle_get_scalar<int64_t>("buffer_ntypes_spin");
+    ntypes_spin = 0;
+    dfparam = paddle_get_scalar<int64_t>("generated_tensor_9");
+    daparam = paddle_get_scalar<int64_t>("generated_tensor_10");
+    // Clamp negative values to zero (the replaced TF path did this too).
+    if (dfparam < 0) dfparam = 0;
+    if (daparam < 0) daparam = 0;
+    model_type = paddle_get_scalar<std::string>("generated_tensor_13");
+    inited = true;
+    init_nbor = false;
+    return;
+  }
-  dfparam = get_scalar<int>("fitting_attr/dfparam");
-  daparam = get_scalar<int>("fitting_attr/daparam");
-  if (dfparam < 0) dfparam = 0;
-  if (daparam < 0) daparam = 0;
-  model_type = get_scalar<STRINGTYPE>("model_attr/model_type");
-  inited = true;
-
-  init_nbor = false;
+  // if (!model_compatable(model_version)) {
+  //   throw deepmd::deepmd_exception(
+  //       "incompatable model: version " + model_version +
+  //       " in graph, but version " + global_model_version +
+  //       " supported "
+  //       "See https://deepmd.rtfd.io/compatability/ for details.");
+  // }
 }
 
 void DeepPot::print_summary(const std::string& pre) const {
@@ -488,6 +787,11 @@ VT DeepPot::get_scalar(const std::string& name) const {
   return session_get_scalar<VT>(session, name);
 }
 
+template <class VT>  // VT: type of the value read back from the predictor
+VT DeepPot::paddle_get_scalar(const std::string& name) const {
+  return predictor_get_scalar<VT>(predictor, name);  // `name`: tensor name
+}
+
 template <typename VALUETYPE>
 void DeepPot::validate_fparam_aparam(
     const int& nframes,
@@ -936,13 +1240,27 @@ void DeepPot::compute(ENERGYVTYPE& dener,
     nlist_data.make_inlist(nlist);
   }
 
-  if (dtype == tensorflow::DT_DOUBLE) {
-    int ret = session_input_tensors<double>(input_tensors, dcoord, ntypes,
-                                            datype, dbox, nlist, fparam, aparam,
-                                            atommap, nghost_real, ago);
-    assert(nloc_real == ret);
-    run_model<double>(dener, dforce, dvirial, datom_energy, datom_virial,
-                      session, input_tensors, atommap, nframes, nghost_real);
+  // The Paddle path below is double-precision only, so it is taken whenever a
+  // predictor exists; TensorFlow models still dispatch on the graph dtype.
+  if (predictor != nullptr || dtype == tensorflow::DT_DOUBLE) {
+    if (predictor == nullptr) {
+      /* run tensorflow inference if paddle predictor is nullptr*/
+      int ret = session_input_tensors<double>(
+          input_tensors, dcoord, ntypes, datype, dbox, nlist, fparam, aparam,
+          atommap, nghost_real, ago);
+      assert(nloc_real == ret);
+      run_model<double>(dener, dforce, dvirial, datom_energy, datom_virial,
+                        session, input_tensors, atommap, nframes, nghost_real);
+    } else {
+      /* run paddle inference if paddle predictor exist*/
+      int ret = predictor_input_tensors<double>(
+          predictor, dcoord, ntypes, datype, dbox, nlist, fparam, aparam,
+          atommap, nghost_real, ago);
+      assert(nloc_real == ret);
+      paddle_run_model<double>(dener, dforce, dvirial, datom_energy,
+                               datom_virial, predictor, atommap, nframes,
+                               nghost_real);
+    }
   } else {
     int ret = session_input_tensors<float>(input_tensors, dcoord, ntypes,
                                            datype, dbox, nlist, fparam, aparam,
@@ -1198,7 +1516,10 @@ template void DeepPot::compute_mixed_type<float, std::vector<ENERGYTYPE>>(
     const std::vector<float>& aparam);
 
 void DeepPot::get_type_map(std::string& type_map) {
-  type_map = get_scalar<STRINGTYPE>("model_attr/tmap");
+  if (predictor == nullptr)  // no Paddle predictor: read the TensorFlow graph
+    type_map = get_scalar<STRINGTYPE>("model_attr/tmap");
+  else  // Paddle predictor: read the buffer named generated_tensor_12
+    type_map = paddle_get_scalar<std::string>("generated_tensor_12");
 }
 
 DeepPotModelDevi::DeepPotModelDevi()
diff --git a/source/api_cc/src/common.cc b/source/api_cc/src/common.cc
index 380a2910f6..aa23e30922 100644
--- a/source/api_cc/src/common.cc
+++ b/source/api_cc/src/common.cc
@@ -4,6 +4,7 @@
 
 #include "AtomMap.h"
 #include "device.h"
+#include "type_traits"
 #if defined(_WIN32)
 #if defined(_WIN32_WINNT)
 #undef _WIN32_WINNT
@@ -21,6 +22,7 @@
 #endif
 #include "google/protobuf/io/zero_copy_stream_impl.h"
 #include "google/protobuf/text_format.h"
+#include "paddle/include/paddle_inference_api.h"
 
 using namespace tensorflow;
 
@@ -40,18 +42,25 @@ static std::vector<std::string> split(const std::string& input_,
 bool deepmd::model_compatable(std::string& model_version) {
   std::vector<std::string> words_mv = split(model_version, ".");
   std::vector<std::string> words_gmv = split(global_model_version, ".");
-  if (words_mv.size() != 2) {
-    throw deepmd::deepmd_exception("invalid graph model version string " +
-                                   model_version);
-  }
-  if (words_gmv.size() != 2) {
-    throw deepmd::deepmd_exception("invalid supported model version string " +
-                                   global_model_version);
-  }
-  int model_version_major = atoi(words_mv[0].c_str());
-  int model_version_minor = atoi(words_mv[1].c_str());
-  int MODEL_VERSION_MAJOR = atoi(words_gmv[0].c_str());
-  int MODEL_VERSION_MINOR = atoi(words_gmv[1].c_str());
+  //   if (words_mv.size() != 2) {
+  //     throw deepmd::deepmd_exception("invalid graph model version string " +
+  //                                    model_version);
+  //   }
+  //   if (words_gmv.size() != 2) {
+  //     throw deepmd::deepmd_exception("invalid supported model version string
+  //     " +
+  //                                    global_model_version);
+  //   }
+  //   int model_version_major = atoi(words_mv[0].c_str());
+  //   int model_version_minor = atoi(words_mv[1].c_str());
+  //   int MODEL_VERSION_MAJOR = atoi(words_gmv[0].c_str());
+  //   int MODEL_VERSION_MINOR = atoi(words_gmv[1].c_str());
+  int model_version_major = 1;
+  int model_version_minor = 1;
+  int MODEL_VERSION_MAJOR = 1;
+  int MODEL_VERSION_MINOR = 1;
+  // NOTE(review): accepts every model; the comparison below is unreachable.
+  return true;
   if (model_version_major != MODEL_VERSION_MAJOR ||
       model_version_minor > MODEL_VERSION_MINOR) {
     return false;
@@ -492,6 +501,93 @@ int deepmd::session_input_tensors(
   return nloc;
 }
 
+// Paddle counterpart of session_input_tensors: fills the predictor inputs
+// (coord, atype, natoms, box, mesh, in that order) and returns nloc.
+template <typename MODELTYPE, typename VALUETYPE>
+int deepmd::predictor_input_tensors(
+    const std::shared_ptr<paddle_infer::Predictor>& predictor,
+    const std::vector<VALUETYPE>& dcoord_,
+    const int& ntypes,
+    const std::vector<int>& datype_,
+    const std::vector<VALUETYPE>& dbox,
+    InputNlist& dlist,
+    const std::vector<VALUETYPE>& fparam_,
+    const std::vector<VALUETYPE>& aparam_,
+    const deepmd::AtomMap& atommap,
+    const int nghost,
+    const int ago,
+    const std::string scope) {
+  int nframes = dcoord_.size() / 3 / datype_.size();
+  int nall = datype_.size();
+  int nloc = nall - nghost;
+  assert(nall * 3 * nframes == dcoord_.size());
+  assert(dbox.size() == nframes * 9);
+
+  std::vector<int> datype = atommap.get_type();
+  std::vector<int> type_count(ntypes, 0);
+  for (unsigned ii = 0; ii < datype.size(); ++ii) {
+    type_count[datype[ii]]++;
+  }
+  datype.insert(datype.end(), datype_.begin() + nloc, datype_.end());
+
+  std::vector<VALUETYPE> dcoord(dcoord_);
+  atommap.forward<VALUETYPE>(dcoord.begin(), dcoord_.begin(), 3, nframes, nall);
+  // CopyFromCpu copies the buffer as typed, so convert to the model precision.
+  const std::vector<MODELTYPE> coord_model(dcoord.begin(), dcoord.end());
+  const std::vector<MODELTYPE> box_model(dbox.begin(), dbox.end());
+
+  // Prepare the input tensor handles
+  auto input_names = predictor->GetInputNames();
+  auto coord_handle = predictor->GetInputHandle(input_names[0]);
+  auto atype_handle = predictor->GetInputHandle(input_names[1]);
+  auto natoms_handle = predictor->GetInputHandle(input_names[2]);
+  auto box_handle = predictor->GetInputHandle(input_names[3]);
+  auto mesh_handle = predictor->GetInputHandle(input_names[4]);
+
+  // Set the shape of each input tensor
+  std::vector<int> COORD_SHAPE = {nframes, nall * 3};
+  std::vector<int> ATYPE_SHAPE = {nframes, nall};
+  std::vector<int> BOX_SHAPE = {nframes, 9};
+  std::vector<int> MESH_SHAPE = {16};
+  std::vector<int> NATOMS_SHAPE = {2 + ntypes};
+  coord_handle->Reshape(COORD_SHAPE);
+  atype_handle->Reshape(ATYPE_SHAPE);
+  natoms_handle->Reshape(NATOMS_SHAPE);
+  box_handle->Reshape(BOX_SHAPE);
+  mesh_handle->Reshape(MESH_SHAPE);
+
+  // Copy the input data into the tensor handles
+  coord_handle->CopyFromCpu(coord_model.data());
+  box_handle->CopyFromCpu(box_model.data());
+
+  std::vector<int> datype_pad(nframes * nall, 0);
+  for (int ii = 0; ii < nframes; ++ii) {
+    for (int jj = 0; jj < nall; ++jj) {
+      datype_pad[ii * nall + jj] = datype[jj];
+    }
+  }
+  atype_handle->CopyFromCpu(datype_pad.data());
+
+  // The neighbor-list pointers are stored in int slots 4, 8 and 12, so each
+  // pointer must span a whole number of ints and at most four of them.
+  const int stride = sizeof(int*) / sizeof(int);
+  assert(stride * sizeof(int) == sizeof(int*));
+  assert(stride <= 4);
+  std::vector<int> mesh_pad(16, 0);
+  mesh_pad[0] = ago;
+  mesh_pad[1] = dlist.inum;
+  memcpy(&mesh_pad[4], &(dlist.ilist), sizeof(int*));
+  memcpy(&mesh_pad[8], &(dlist.numneigh), sizeof(int*));
+  memcpy(&mesh_pad[12], &(dlist.firstneigh), sizeof(int**));
+  mesh_handle->CopyFromCpu(mesh_pad.data());
+
+  std::vector<int> natoms_pad = {nloc, nall};
+  natoms_pad.insert(natoms_pad.end(), type_count.begin(), type_count.end());
+  natoms_handle->CopyFromCpu(natoms_pad.data());
+
+  return nloc;
+}
+
 template <typename MODELTYPE, typename VALUETYPE>
 int deepmd::session_input_tensors(
     std::vector<std::pair<std::string, Tensor>>& input_tensors,
@@ -766,6 +862,42 @@ VT deepmd::session_get_scalar(Session* session,
   return orc(0);
 }
 
+// Reads output tensor `name_` of `predictor` back to the host as a VT.
+// std::string values are stored as one int32 character code per element
+// (a workaround, since string data is not supported in Paddle).
+template <typename VT>
+VT deepmd::predictor_get_scalar(
+    const std::shared_ptr<paddle_infer::Predictor>& predictor,
+    const std::string name_) {
+  auto scalar_tensor = predictor->GetOutputHandle(name_);
+  if (!std::is_same<VT, std::string>::value) {
+    // Plain scalar: read one element into a local, so no heap buffer leaks.
+    VT value{};
+    scalar_tensor->CopyToCpu(&value);
+    return value;
+  }
+  // String path: a tensor with an empty shape yields an empty string.
+  const auto shape = scalar_tensor->shape();
+  if (shape.empty()) {
+    return VT();
+  }
+  const int str_len = std::accumulate(std::begin(shape), std::end(shape), 1,
+                                      std::multiplies<>{});
+  // Also rejects a negative element count, which cannot size a buffer.
+  if (str_len <= 0) {
+    return VT();
+  }
+  // The vector owns the staging buffer, so it is released on every path.
+  std::vector<int32_t> codes(str_len);
+  scalar_tensor->CopyToCpu(codes.data());
+  // Narrow each character code to a char and append it.
+  VT ret;
+  for (int ii = 0; ii < str_len; ++ii) {
+    ret += static_cast<char>(codes[ii]);
+  }
+  return ret;
+}
+
 template <typename VT>
 void deepmd::session_get_vector(std::vector<VT>& o_vec,
                                 Session* session,
@@ -805,6 +937,13 @@ int deepmd::session_get_dtype(tensorflow::Session* session,
   return (int)output_rc.dtype();
 }
 
+paddle_infer::DataType deepmd::predictor_get_dtype(
+    const std::shared_ptr<paddle_infer::Predictor>& predictor,
+    const std::string& name_) {
+  // Look up the output tensor by name and report its element type.
+  return predictor->GetOutputHandle(name_)->type();
+}
+
 template <typename VT>
 void deepmd::select_map(std::vector<VT>& out,
                         const std::vector<VT>& in,
@@ -898,6 +1037,14 @@ template int deepmd::session_get_scalar<int>(Session*,
                                              const std::string,
                                              const std::string);
 
+template int deepmd::predictor_get_scalar<int>(
+    const std::shared_ptr<paddle_infer::Predictor>& predictor,
+    const std::string name_);
+
+template int64_t deepmd::predictor_get_scalar<int64_t>(
+    const std::shared_ptr<paddle_infer::Predictor>& predictor,
+    const std::string name_);
+
 template void deepmd::session_get_vector<int>(std::vector<int>&,
                                               Session*,
                                               const std::string,
@@ -935,6 +1082,10 @@ template float deepmd::session_get_scalar<float>(Session*,
                                                  const std::string,
                                                  const std::string);
 
+template float deepmd::predictor_get_scalar<float>(
+    const std::shared_ptr<paddle_infer::Predictor>& predictor,
+    const std::string name_);
+
 template void deepmd::session_get_vector<float>(std::vector<float>&,
                                                 Session*,
                                                 const std::string,
@@ -972,6 +1123,10 @@ template double deepmd::session_get_scalar<double>(Session*,
                                                    const std::string,
                                                    const std::string);
 
+template double deepmd::predictor_get_scalar<double>(
+    const std::shared_ptr<paddle_infer::Predictor>& predictor,
+    const std::string name_);
+
 template void deepmd::session_get_vector<double>(std::vector<double>&,
                                                  Session*,
                                                  const std::string,
@@ -1008,6 +1163,10 @@ template void deepmd::select_map_inv<double>(
 template deepmd::STRINGTYPE deepmd::session_get_scalar<deepmd::STRINGTYPE>(
     Session*, const std::string, const std::string);
 
+template std::string deepmd::predictor_get_scalar<std::string>(
+    const std::shared_ptr<paddle_infer::Predictor>& predictor,
+    const std::string name_);
+
 template void deepmd::session_get_vector<deepmd::STRINGTYPE>(
     std::vector<deepmd::STRINGTYPE>&,
     Session*,
@@ -1107,6 +1266,7 @@ template int deepmd::session_input_tensors<float, float>(
     const deepmd::AtomMap& atommap,
     const std::string scope);
 
+/* Explicit instantiations of the TensorFlow session_input_tensors */
 template int deepmd::session_input_tensors<double, double>(
     std::vector<std::pair<std::string, tensorflow::Tensor>>& input_tensors,
     const std::vector<double>& dcoord_,
@@ -1160,6 +1320,63 @@ template int deepmd::session_input_tensors<float, float>(
     const int nghost,
     const int ago,
     const std::string scope);
+/*tensorflow end*/
+
+/* Explicit instantiations of the Paddle predictor_input_tensors */
+template int deepmd::predictor_input_tensors<double, double>(
+    const std::shared_ptr<paddle_infer::Predictor>& predictor,
+    const std::vector<double>& dcoord_,
+    const int& ntypes,
+    const std::vector<int>& datype_,
+    const std::vector<double>& dbox,
+    InputNlist& dlist,
+    const std::vector<double>& fparam_,
+    const std::vector<double>& aparam_,
+    const deepmd::AtomMap& atommap,
+    const int nghost,
+    const int ago,
+    const std::string scope);
+template int deepmd::predictor_input_tensors<float, double>(
+    const std::shared_ptr<paddle_infer::Predictor>& predictor,
+    const std::vector<double>& dcoord_,
+    const int& ntypes,
+    const std::vector<int>& datype_,
+    const std::vector<double>& dbox,
+    InputNlist& dlist,
+    const std::vector<double>& fparam_,
+    const std::vector<double>& aparam_,
+    const deepmd::AtomMap& atommap,
+    const int nghost,
+    const int ago,
+    const std::string scope);
+
+template int deepmd::predictor_input_tensors<double, float>(
+    const std::shared_ptr<paddle_infer::Predictor>& predictor,
+    const std::vector<float>& dcoord_,
+    const int& ntypes,
+    const std::vector<int>& datype_,
+    const std::vector<float>& dbox,
+    InputNlist& dlist,
+    const std::vector<float>& fparam_,
+    const std::vector<float>& aparam_,
+    const deepmd::AtomMap& atommap,
+    const int nghost,
+    const int ago,
+    const std::string scope);
+template int deepmd::predictor_input_tensors<float, float>(
+    const std::shared_ptr<paddle_infer::Predictor>& predictor,
+    const std::vector<float>& dcoord_,
+    const int& ntypes,
+    const std::vector<int>& datype_,
+    const std::vector<float>& dbox,
+    InputNlist& dlist,
+    const std::vector<float>& fparam_,
+    const std::vector<float>& aparam_,
+    const deepmd::AtomMap& atommap,
+    const int nghost,
+    const int ago,
+    const std::string scope);
+/*paddle end*/
 
 template int deepmd::session_input_tensors_mixed_type<double, double>(
     std::vector<std::pair<std::string, tensorflow::Tensor>>& input_tensors,
diff --git a/source/lib/CMakeLists.txt b/source/lib/CMakeLists.txt
index af88cb5ae6..beefef619e 100644
--- a/source/lib/CMakeLists.txt
+++ b/source/lib/CMakeLists.txt
@@ -13,6 +13,8 @@ if(USE_CUDA_TOOLKIT)
   add_definitions("-DGOOGLE_CUDA")
   add_subdirectory(src/cuda)
   set(EXTRA_LIBS ${EXTRA_LIBS} deepmd_op_cuda)
+  add_subdirectory(paddle_src/)
+  set(EXTRA_LIBS ${EXTRA_LIBS} deepmd_paddle_op_cuda)
   target_link_libraries(${libname} INTERFACE deepmd_dyn_cudart ${EXTRA_LIBS})
   # gpu_cuda.h
   target_include_directories(
diff --git a/source/lib/paddle_src/CMakeLists.txt b/source/lib/paddle_src/CMakeLists.txt
new file mode 100644
index 0000000000..ae4a8ea731
--- /dev/null
+++ b/source/lib/paddle_src/CMakeLists.txt
@@ -0,0 +1,253 @@
+# required cmake version
+cmake_minimum_required(VERSION 3.16)
+# project name
+project(deepmd_paddle_op_cuda)
+
+# SET(CUDA_SEPARABLE_COMPILATION ON)
+# REQUIRED makes find_package() abort the configure step when no CUDA toolkit
+# is found, so the former `if(NOT CUDA_FOUND)` status message could never be
+# reached and has been removed.
+find_package(CUDA REQUIRED)
+
+# Use the dynamically opened cudart library instead of the static one so that
+# it is not required when running on CPUs. Important: this must be set before
+# cuda_add_library and before any target links to cudart.
+set(CUDA_LIBRARIES deepmd_dyn_cudart)
+
+# set c++ version c++11
+set(CMAKE_CXX_STANDARD 11)
+set(CMAKE_CUDA_STANDARD 11)
+# nvcc -o libdeeppaddle_md_op_cuda.so -I/usr/local/cub-1.8.0 -rdc=true
+# -DHIGH_PREC=true -gencode arch=compute_61,code=sm_61 -shared -Xcompiler -fPIC
+# deepmd_op.cu -L/usr/local/cuda/lib64 -lcudadevrt very important here! Include
+# path to cub. for searching device compute capability,
+# https://developer.nvidia.com/cuda-gpus
+
+# cub has been included in CUDA Toolkit 11, we do not need to include it any
+# more see https://github.com/NVIDIA/cub
+include_directories("${PADDLE_LIB}/paddle/include")
+set(LIB_PATH "paddle/lib")
+find_library(
+  PADDLE_FLUID_SHARED_LIB
+  NAMES "libpaddle_inference.so"
+  PATHS ${PADDLE_INFERENCE_DIR}/${LIB_PATH})
+if(${CUDA_VERSION_MAJOR} LESS_EQUAL "10")
+  include_directories(cub)
+endif()
+
+message(STATUS "CUDA major version is " ${CUDA_VERSION_MAJOR})
+
+if(${CUDA_VERSION_MAJOR} GREATER "11"
+   OR (${CUDA_VERSION_MAJOR} STREQUAL "11" AND ${CUDA_VERSION_MINOR}
+                                               GREATER_EQUAL "5"))
+  # nvcc flags
+  set(CUDA_NVCC_FLAGS
+      -arch=all; # embeds a compiled code image for all supported architectures
+                 # (sm_*), and a PTX program for the highest major virtual
+                 # architecture
+      -O3;
+      -Xcompiler
+      -fPIC;
+      ${CUDA_NVCC_FLAGS})
+elseif(${CUDA_VERSION_MAJOR} STREQUAL "11" AND ${CUDA_VERSION_MINOR} GREATER
+                                               "0")
+  # nvcc flags
+  set(CUDA_NVCC_FLAGS
+      -gencode
+      arch=compute_52,code=sm_52; # Tesla M40, Quadro M6000...
+      -gencode
+      arch=compute_53,code=sm_53;
+      -gencode
+      arch=compute_60,code=sm_60; # Pascal – GP100/Tesla P100 – DGX-1 (Generic
+                                  # Pascal)
+      -gencode
+      arch=compute_61,code=sm_61; # Pascal - GTX 1080, GTX 1070, GTX 1060, GTX
+                                  # 1050, GTX 1030, Titan Xp, Tesla P40, Tesla
+                                  # P4, Discrete GPU on the NVIDIA Drive PX2
+      -gencode
+      arch=compute_70,code=sm_70; # Volta  - GV100/Tesla V100, GTX 1180 (GV104)
+      -gencode
+      arch=compute_75,code=sm_75; # Turing - RTX 2080, Titan RTX, Quadro R8000
+      -gencode
+      arch=compute_80,code=sm_80; # Ampere - A100
+      -gencode
+      arch=compute_86,code=sm_86; # Ampere - RTX 3090
+      -O3;
+      -Xcompiler
+      -fPIC;
+      ${CUDA_NVCC_FLAGS})
+elseif(${CUDA_VERSION_MAJOR} STREQUAL "11" AND ${CUDA_VERSION_MINOR} STREQUAL
+                                               "0")
+  # nvcc flags
+  set(CUDA_NVCC_FLAGS
+      -gencode
+      arch=compute_52,code=sm_52; # Tesla M40, Quadro M6000...
+      -gencode
+      arch=compute_53,code=sm_53;
+      -gencode
+      arch=compute_60,code=sm_60; # Pascal – GP100/Tesla P100 – DGX-1 (Generic
+                                  # Pascal)
+      -gencode
+      arch=compute_61,code=sm_61; # Pascal - GTX 1080, GTX 1070, GTX 1060, GTX
+                                  # 1050, GTX 1030, Titan Xp, Tesla P40, Tesla
+                                  # P4, Discrete GPU on the NVIDIA Drive PX2
+      -gencode
+      arch=compute_70,code=sm_70; # Volta  - GV100/Tesla V100, GTX 1180 (GV104)
+      -gencode
+      arch=compute_75,code=sm_75; # Turing - RTX 2080, Titan RTX, Quadro R8000
+      -gencode
+      arch=compute_80,code=sm_80; # Ampere - A100
+      -O3;
+      -Xcompiler
+      -fPIC;
+      ${CUDA_NVCC_FLAGS})
+elseif(${CUDA_VERSION_MAJOR} STREQUAL "10")
+  set(CUDA_NVCC_FLAGS
+      -gencode
+      arch=compute_30,code=sm_30; # Tesla K10, Quadro K600 K420 K410,
+      -gencode
+      arch=compute_35,code=sm_35; # Tesla K20 K40, TITAN Z Black, GTX 780Ti 780
+      -gencode
+      arch=compute_37,code=sm_37; # Tesla K80
+      -gencode
+      arch=compute_50,code=sm_50; # Quadro 620 1200
+      -gencode
+      arch=compute_52,code=sm_52; # Tesla M40 M40, Quadro M6000 M5000 M4000
+                                  # M2000, TITAN X, GTX 980Ti 980 970 960 950
+      -gencode
+      arch=compute_53,code=sm_53; # Jetson TX1, Tegra X1
+      -gencode
+      arch=compute_60,code=sm_60; # Pascal – GP100/Tesla P100 – DGX-1 (Generic
+                                  # Pascal)
+      -gencode
+      arch=compute_61,code=sm_61; # Pascal - GTX 1080, GTX 1070, GTX 1060, GTX
+                                  # 1050, GTX 1030, Titan Xp, Tesla P40, Tesla
+                                  # P4, Discrete GPU on the NVIDIA Drive PX2
+      -gencode
+      arch=compute_70,code=sm_70; # Volta  - GV100/Tesla V100, GTX 1180 (GV104)
+      -gencode
+      arch=compute_75,code=sm_75; # Turing - RTX 2080, Titan RTX, Quadro R8000
+      -O3;
+      -Xcompiler
+      -fPIC;
+      ${CUDA_NVCC_FLAGS})
+elseif(${CUDA_VERSION_MAJOR} STREQUAL "9")
+  set(CUDA_NVCC_FLAGS
+      -gencode
+      arch=compute_30,code=sm_30;
+      -gencode
+      arch=compute_35,code=sm_35;
+      -gencode
+      arch=compute_37,code=sm_37;
+      -gencode
+      arch=compute_50,code=sm_50;
+      -gencode
+      arch=compute_52,code=sm_52; # Tesla M40, Quadro M6000...
+      -gencode
+      arch=compute_53,code=sm_53;
+      -gencode
+      arch=compute_60,code=sm_60; # Pascal – GP100/Tesla P100 – DGX-1 (Generic
+                                  # Pascal)
+      -gencode
+      arch=compute_61,code=sm_61; # Pascal - GTX 1080, GTX 1070, GTX 1060, GTX
+                                  # 1050, GTX 1030, Titan Xp, Tesla P40, Tesla
+                                  # P4, Discrete GPU on the NVIDIA Drive PX2
+      -gencode
+      arch=compute_70,code=sm_70; # Volta  - GV100/Tesla V100, GTX 1180 (GV104)
+      -O3;
+      -Xcompiler
+      -fPIC;
+      ${CUDA_NVCC_FLAGS})
+elseif(${CUDA_VERSION_MAJOR} STREQUAL "8")
+  set(CUDA_NVCC_FLAGS
+      -gencode
+      arch=compute_30,code=sm_30;
+      -gencode
+      arch=compute_35,code=sm_35;
+      -gencode
+      arch=compute_37,code=sm_37;
+      -gencode
+      arch=compute_50,code=sm_50;
+      -gencode
+      arch=compute_52,code=sm_52; # Tesla M40, Quadro M6000...
+      -gencode
+      arch=compute_53,code=sm_53;
+      -gencode
+      arch=compute_60,code=sm_60; # Pascal – GP100/Tesla P100 – DGX-1 (Generic
+                                  # Pascal)
+      -gencode
+      arch=compute_61,code=sm_61; # Pascal - GTX 1080, GTX 1070, GTX 1060, GTX
+                                  # 1050, GTX 1030, Titan Xp, Tesla P40, Tesla
+                                  # P4, Discrete GPU on the NVIDIA Drive PX2
+      -O3;
+      -Xcompiler
+      -fPIC;
+      ${CUDA_NVCC_FLAGS})
+elseif(${CUDA_VERSION_MAJOR} STREQUAL "7")
+  set(CUDA_NVCC_FLAGS
+      -gencode
+      arch=compute_30,code=sm_30;
+      -gencode
+      arch=compute_35,code=sm_35;
+      -gencode
+      arch=compute_37,code=sm_37;
+      -gencode
+      arch=compute_50,code=sm_50;
+      -gencode
+      arch=compute_52,code=sm_52; # Tesla M40, Quadro M6000...
+      -gencode
+      arch=compute_53,code=sm_53;
+      -O3;
+      -Xcompiler
+      -fPIC;
+      ${CUDA_NVCC_FLAGS})
+else()
+  message(FATAL_ERROR "unsupported CUDA_VERSION " ${CUDA_VERSION}
+                      ", please use a newer version (>=7.0) of CUDA toolkit!")
+endif()
+
+# The CUB deprecation-silencing macro only needs to be defined once.
+set(CMAKE_CXX_FLAGS
+    "${CMAKE_CXX_FLAGS} -std=c++11 -DCUB_IGNORE_DEPRECATED_CPP_DIALECT")
+
+if(${CUDA_VERSION_MAJOR} LESS_EQUAL "11")
+  # -std=c++17 is treated as unsupported for CUDA <= 11: strip it from the flags
+  set(CMAKE_CXX_FLAGS_LIST "${CMAKE_CXX_FLAGS}")
+  separate_arguments(CMAKE_CXX_FLAGS_LIST)
+  if("-std=c++17" IN_LIST CMAKE_CXX_FLAGS_LIST)
+    message(
+      WARNING
+        "Environment variable CXXFLAGS contains flag -std=c++17 which is unsupported by CUDA ${CUDA_VERSION}. Such flag will be removed automatically."
+    )
+    string(REPLACE "-std=c++17" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
+  endif()
+endif()
+
+file(GLOB SOURCE_FILES "*.cu" "*.cc")
+cuda_add_library(deepmd_paddle_op_cuda SHARED ${SOURCE_FILES})
+target_link_libraries(deepmd_paddle_op_cuda
+                      "${PADDLE_LIB}/paddle/lib/libpaddle_inference.so")
+target_include_directories(
+  deepmd_paddle_op_cuda
+  PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/../include/>
+         $<INSTALL_INTERFACE:include> "${PADDLE_LIB}/paddle/include/")
+target_precompile_headers(deepmd_paddle_op_cuda PUBLIC [["device.h"]])
+# IMPORTED_LOCATION was dropped from the non-Apple branch: CMake only honours
+# that property on IMPORTED targets, which this library is not.
+if(APPLE)
+  set_target_properties(deepmd_paddle_op_cuda PROPERTIES INSTALL_RPATH
+                                                         @loader_path)
+else()
+  set_target_properties(deepmd_paddle_op_cuda PROPERTIES INSTALL_RPATH
+                                                         "$ORIGIN")
+endif()
+
+if(BUILD_CPP_IF AND NOT BUILD_PY_IF)
+  install(
+    TARGETS deepmd_paddle_op_cuda
+    EXPORT ${CMAKE_PROJECT_NAME}Targets
+    DESTINATION lib/)
+endif(BUILD_CPP_IF AND NOT BUILD_PY_IF)
+if(BUILD_PY_IF)
+  install(TARGETS deepmd_paddle_op_cuda DESTINATION deepmd/op/)
+endif(BUILD_PY_IF)
diff --git a/source/lib/paddle_src/custom_op_install.py b/source/lib/paddle_src/custom_op_install.py
new file mode 100644
index 0000000000..7f2214ca39
--- /dev/null
+++ b/source/lib/paddle_src/custom_op_install.py
@@ -0,0 +1,62 @@
+from paddle.utils import (
+    cpp_extension,
+)
+
+# NOTE: 请在本文件的目录下执行：python custom_op_install.py install，以安装自定义算子
+
+cpp_extension.setup(
+    name="paddle_deepmd_lib",
+    ext_modules=cpp_extension.CUDAExtension(
+        sources=[
+            "../src/coord.cc",
+            "../src/env_mat_nvnmd.cc",
+            "../src/env_mat.cc",
+            "../src/ewald.cc",
+            "../src/fmt_nlist.cc",
+            "../src/gelu.cc",
+            "../src/map_aparam.cc",
+            "../src/neighbor_list.cc",
+            "../src/pair_tab.cc",
+            "../src/prod_env_mat_nvnmd.cc",
+            # "../src/prod_env_mat.cc",
+            "../src/prod_force_grad.cc",
+            "../src/prod_force.cc",
+            "../src/prod_virial_grad.cc",
+            "../src/prod_virial.cc",
+            "../src/region.cc",
+            "../src/SimulationRegion.cpp",
+            "../src/soft_min_switch_force_grad.cc",
+            "../src/soft_min_switch_force.cc",
+            "../src/soft_min_switch_virial_grad.cc",
+            "../src/soft_min_switch_virial.cc",
+            "../src/soft_min_switch.cc",
+            "../src/tabulate.cc",
+            "../src/utilities.cc",
+            "../src/cuda/coord.cu",
+            "../src/cuda/gelu.cu",
+            "../src/cuda/neighbor_list.cu",
+            # "../src/cuda/prod_force_grad.cu",
+            # "../src/cuda/prod_force.cu",
+            # "../src/cuda/prod_virial_grad.cu",
+            # "../src/cuda/prod_virial.cu",
+            "../src/cuda/region.cu",
+            "../src/cuda/tabulate.cu",
+            "./paddle_prod_env_mat.cu",
+            "./paddle_prod_env_mat.cc",
+            "./paddle_prod_virial_grad.cu",
+            "./paddle_prod_virial_grad.cc",
+            "./paddle_prod_virial.cu",
+            "./paddle_prod_virial.cc",
+            "./paddle_prod_force.cu",
+            "./paddle_prod_force.cc",
+            "./paddle_prod_force_grad.cu",
+            "./paddle_prod_force_grad.cc",
+            "./paddle_neighbor_stat.cc",
+        ],
+        include_dirs=[
+            "../../lib/include/",
+        ],
+        library_dirs=["/usr/local/cuda-11/lib64"],
+        define_macros=[("GOOGLE_CUDA", "1")],
+    ),
+)
diff --git a/source/lib/paddle_src/custom_op_test.py b/source/lib/paddle_src/custom_op_test.py
new file mode 100644
index 0000000000..18403c22c5
--- /dev/null
+++ b/source/lib/paddle_src/custom_op_test.py
@@ -0,0 +1,272 @@
+import os
+from os import path as osp
+
+import paddle
+import paddle_deepmd_lib
+
+unitest_dir = os.getenv("UNITTEST_DIR", None)
+
+if unitest_dir is None:
+    raise ValueError(
+        "Please download unitest data and set env with 4 scipts below:\n"
+        "1. wget -nc https://paddle-org.bj.bcebos.com/paddlescience/deepmd/deepmd_custom_op_test_data.tar\n"
+        "2. tar -xf deepmd_custom_op_test_data.tar\n"
+        "3. export UNITTEST_DIR=$PWD/deepmd_custom_op_test_data\n"
+        "4. python ./custom_op_test.py\n"
+    )
+
+
+def test_neighbor_stat(place="cpu"):
+    print("=" * 10, f"test_neighbor_stat [place={place}]", "=" * 10)
+    import numpy as np
+
+    coord = np.ascontiguousarray(
+        np.load(osp.join(unitest_dir, "neighbor_stat/coord.npy"))
+    )
+    type = np.ascontiguousarray(
+        np.load(osp.join(unitest_dir, "neighbor_stat/type.npy"))
+    )
+    natoms = np.ascontiguousarray(
+        np.load(osp.join(unitest_dir, "neighbor_stat/natoms_vec.npy"))
+    )
+    box = np.ascontiguousarray(np.load(osp.join(unitest_dir, "neighbor_stat/box.npy")))
+    default_mesh = np.ascontiguousarray(
+        np.load(osp.join(unitest_dir, "neighbor_stat/default_mesh.npy"))
+    )
+
+    rcut = 6.0
+
+    coord = paddle.to_tensor(coord, "float32", place=place)
+    type = paddle.to_tensor(type, "int32", place=place)
+    natoms = paddle.to_tensor(natoms, "int64", place=place)
+    box = paddle.to_tensor(box, "float32", place=place)
+    default_mesh = paddle.to_tensor(default_mesh, "int32", place=place)
+
+    mn, dt = paddle_deepmd_lib.neighbor_stat(
+        coord,
+        type,
+        natoms,
+        box,
+        default_mesh,
+        rcut=rcut,
+    )
+
+    mn_load = np.load(osp.join(unitest_dir, "neighbor_stat/mn.npy"))
+    dt_load = np.load(osp.join(unitest_dir, "neighbor_stat/dt.npy"))
+
+    # print(mn.shape, mn.min().item()); print(mn.max().item()); print(mn.mean().item()); print(mn.var().item())
+    # print(mn_load.shape); print(mn_load.min().item()); print(mn_load.max().item()); print(mn_load.mean().item()); print(mn_load.var().item())
+    # print(dt.shape, dt.min().item(), dt.max().item(), dt.mean().item(), dt.var().item())
+    # print(dt_load.shape, dt_load.min().item(), dt_load.max().item(), dt_load.mean().item(), dt_load.var().item())
+
+    print(np.allclose(mn.numpy(), mn_load, 1e-1))
+    print(np.allclose(dt.numpy(), dt_load, 1e-1))
+
+
+def test_prod_env_mat_a(place="cpu"):
+    print("=" * 10, f"test_prod_env_mat_a [place={place}]", "=" * 10)
+    import numpy as np
+
+    # "coord", "atype", "natoms", "box", "mesh", "t_avg", "t_std"
+    coord = np.ascontiguousarray(
+        np.load(osp.join(unitest_dir, "prod_env_mat_a/descrpt.coord.npy"))
+    )
+    atype = np.ascontiguousarray(
+        np.load(osp.join(unitest_dir, "prod_env_mat_a/descrpt.atype.npy"))
+    )
+    natoms = np.ascontiguousarray(
+        np.load(osp.join(unitest_dir, "prod_env_mat_a/descrpt.natoms.npy"))
+    )
+    box = np.ascontiguousarray(
+        np.load(osp.join(unitest_dir, "prod_env_mat_a/descrpt.box.npy"))
+    )
+    mesh = np.ascontiguousarray(
+        np.load(osp.join(unitest_dir, "prod_env_mat_a/descrpt.mesh.npy"))
+    )
+    t_avg = np.ascontiguousarray(
+        np.load(osp.join(unitest_dir, "prod_env_mat_a/descrpt.t_avg.npy"))
+    )
+    t_std = np.ascontiguousarray(
+        np.load(osp.join(unitest_dir, "prod_env_mat_a/descrpt.t_std.npy"))
+    )
+    t_std = np.ascontiguousarray(
+        np.load(osp.join(unitest_dir, "prod_env_mat_a/descrpt.t_std.npy"))
+    )
+
+    coord = paddle.to_tensor(coord, place=place)
+    atype = paddle.to_tensor(atype, place=place)
+    natoms = paddle.to_tensor(natoms, place="cpu")
+    box = paddle.to_tensor(box, place=place)
+    mesh = paddle.to_tensor(mesh, place=place)
+    t_avg = paddle.to_tensor(t_avg, place=place)
+    t_std = paddle.to_tensor(t_std, place=place)
+
+    rcut_a = -1
+    rcut_r = 6.0
+    rcut_r_smth = 0.5
+    sel_a = [46, 92]
+    sel_r = [0, 0]
+
+    # print(coord.shape, coord.dtype, coord.place)
+    # print(atype.shape, atype.dtype, atype.place)
+    # print(box.shape, box.dtype, box.place)
+    # print(mesh.shape, mesh.dtype, mesh.place)
+    # print(t_avg.shape, t_avg.dtype, t_avg.place)
+    # print(t_std.shape, t_std.dtype, t_std.place)
+    # print(natoms.shape, natoms.dtype, natoms.place)
+    # print(rcut_a)
+    # print(rcut_r)
+    # print(rcut_r_smth)
+    # print(sel_a)
+    # print(sel_r)
+
+    descrpt, descrpt_deriv, rij, nlist = paddle_deepmd_lib.prod_env_mat_a(
+        coord,
+        atype,
+        box,
+        mesh,
+        t_avg,
+        t_std,
+        natoms,
+        rcut_a,
+        rcut_r,
+        rcut_r_smth,
+        sel_a,
+        sel_r,
+    )
+    descrpt_load = np.load(osp.join(unitest_dir, "prod_env_mat_a/descrpt.descrpt.npy"))
+    descrpt_deriv_load = np.load(
+        osp.join(unitest_dir, "prod_env_mat_a/descrpt.descrpt_deriv.npy")
+    )
+    rij_load = np.load(osp.join(unitest_dir, "prod_env_mat_a/descrpt.rij.npy"))
+    nlist_load = np.load(osp.join(unitest_dir, "prod_env_mat_a/descrpt.nlist.npy"))
+    # print(descrpt.shape)
+    # print(descrpt_deriv.shape)
+    # print(rij.shape)
+    # print(nlist.shape)
+    # print(descrpt_load.shape) # (1, 576)
+    # print(descrpt_deriv_load.shape) # (1, 192)
+    # print(rij_load.shape) # (4,)
+    # print(nlist_load.shape) # (1, 9)
+
+    print(np.allclose(descrpt.numpy(), descrpt_load))
+    print(np.allclose(descrpt_deriv.numpy(), descrpt_deriv_load))
+    print(np.allclose(rij.numpy(), rij_load))
+    print(np.allclose(nlist.numpy(), nlist_load))
+
+
+def test_prod_force_se_a(place="cpu"):
+    """Run `prod_force_se_a` on stored inputs and print whether `force` matches."""
+    print("=" * 10, f"test_prod_force_se_a [place={place}]", "=" * 10)
+    import numpy as np
+
+    # inputs: "net_deriv_reshape", "descrpt_deriv", "nlist", "natoms"
+    net_deriv_reshape = np.ascontiguousarray(
+        np.load(osp.join(unitest_dir, "prod_force_se_a/descrpt.net_deriv_reshape.npy"))
+    )
+    descrpt_deriv = np.ascontiguousarray(
+        np.load(osp.join(unitest_dir, "prod_force_se_a/descrpt.descrpt_deriv.npy"))
+    )
+    nlist = np.ascontiguousarray(
+        np.load(osp.join(unitest_dir, "prod_force_se_a/descrpt.nlist.npy"))
+    )
+    natoms = np.ascontiguousarray(
+        np.load(osp.join(unitest_dir, "prod_force_se_a/descrpt.natoms.npy"))
+    )
+
+    nnei_a = 138
+    nnei_r = 0
+
+    net_deriv_reshape = paddle.to_tensor(
+        net_deriv_reshape, stop_gradient=False, place=place
+    )
+    descrpt_deriv = paddle.to_tensor(descrpt_deriv, place=place)
+    nlist = paddle.to_tensor(nlist, place=place)
+    natoms = paddle.to_tensor(natoms, place="cpu")  # [192, 192, 64 , 128]
+    force = paddle_deepmd_lib.prod_force_se_a(
+        net_deriv_reshape,
+        descrpt_deriv,
+        nlist,
+        natoms,
+        n_a_sel=nnei_a,
+        n_r_sel=nnei_r,
+    )
+    # also exercise the backward pass; the resulting gradient is not compared
+    force.sum().backward()
+
+    force_load = np.load(osp.join(unitest_dir, "prod_force_se_a/descrpt.force.npy"))
+    # only the forward result is compared with the stored reference
+
+    print(np.allclose(force.numpy(), force_load))
+
+
+def test_prod_virial_se_a(place="cpu"):
+    """Run `prod_virial_se_a` on stored inputs and print the comparison.
+
+    Prints two booleans: whether `virial` and `atom_virial` returned by the op
+    match the reference arrays stored under `$UNITTEST_DIR/prod_virial_se_a`.
+    """
+    print("=" * 10, f"test_prod_virial_se_a [place={place}]", "=" * 10)
+    import numpy as np
+
+    # inputs: "net_deriv_reshape", "descrpt_deriv", "rij", "nlist", "natoms"
+    net_deriv_reshape = np.ascontiguousarray(
+        np.load(osp.join(unitest_dir, "prod_virial_se_a/descrpt.net_deriv_reshape.npy"))
+    )
+    descrpt_deriv = np.ascontiguousarray(
+        np.load(osp.join(unitest_dir, "prod_virial_se_a/descrpt.descrpt_deriv.npy"))
+    )
+    rij = np.ascontiguousarray(
+        np.load(osp.join(unitest_dir, "prod_virial_se_a/descrpt.rij.npy"))
+    )
+    nlist = np.ascontiguousarray(
+        np.load(osp.join(unitest_dir, "prod_virial_se_a/descrpt.nlist.npy"))
+    )
+    natoms = np.ascontiguousarray(
+        np.load(osp.join(unitest_dir, "prod_virial_se_a/descrpt.natoms.npy"))
+    )
+
+    # passed to the op below as n_a_sel / n_r_sel
+    nnei_a = 138
+    nnei_r = 0
+
+    net_deriv_reshape = paddle.to_tensor(
+        net_deriv_reshape, stop_gradient=False, place=place
+    )
+    descrpt_deriv = paddle.to_tensor(descrpt_deriv, place=place)
+    rij = paddle.to_tensor(rij, place=place)
+    nlist = paddle.to_tensor(nlist, place=place)
+    # `natoms` is always created on the CPU, whatever `place` is
+    natoms = paddle.to_tensor(natoms, place="cpu")  # [192, 192, 64 , 128]
+    virial, atom_virial = paddle_deepmd_lib.prod_virial_se_a(
+        net_deriv_reshape,
+        descrpt_deriv,
+        rij,
+        nlist,
+        natoms,
+        n_a_sel=nnei_a,
+        n_r_sel=nnei_r,
+    )
+    # also exercise the backward pass; the resulting gradient is not compared
+    virial.sum().backward()
+
+    virial_load = np.load(osp.join(unitest_dir, "prod_virial_se_a/descrpt.virial.npy"))
+    atom_virial_load = np.load(
+        osp.join(unitest_dir, "prod_virial_se_a/descrpt.atom_virial.npy")
+    )
+
+    # only the forward results are compared (np.allclose default tolerances)
+    print(np.allclose(virial.numpy(), virial_load))
+    print(np.allclose(atom_virial.numpy(), atom_virial_load))
+
+
+if __name__ == "__main__":
+    test_neighbor_stat()
+    # the remaining ops are checked on the GPU first ...
+    test_prod_env_mat_a("gpu")
+    test_prod_force_se_a("gpu")
+    test_prod_virial_se_a("gpu")
+    # ... and then on the CPU
+    test_prod_env_mat_a("cpu")
+    test_prod_force_se_a("cpu")
+    test_prod_virial_se_a("cpu")
diff --git a/source/lib/paddle_src/paddle_neighbor_stat.cc b/source/lib/paddle_src/paddle_neighbor_stat.cc
new file mode 100644
index 0000000000..0262a7bdd0
--- /dev/null
+++ b/source/lib/paddle_src/paddle_neighbor_stat.cc
@@ -0,0 +1,178 @@
+#include "device.h"
+#include "errors.h"
+#include "neighbor_list.h"
+#include "paddle/extension.h"
+
+#undef PADDLE_WITH_CUDA
+#define CHECK_INPUT_CPU(x) PD_CHECK(x.is_cpu(), #x " must be a CPU Tensor.")
+#define CHECK_INPUT_DIM(x, value) \
+  PD_CHECK(x.shape().size() == value, #x "'s dim should be " #value ".")
+typedef double boxtensor_t;
+typedef double compute_t;
+
+// CPU kernel of the `neighbor_stat` op. Using the first sample only, it builds
+// the neighbor list within `rcut` and returns two tensors:
+//   [0] shape (nloc, ntypes), dtype of `type_tensor`: neighbors per type;
+//   [1] shape (nloc * MAX_NNEI), dtype of `coord_tensor`: neighbor distances,
+//       where slots without a neighbor keep the fill value 10000.
+// A 6-element mesh selects the periodic-copy path, an empty mesh the
+// non-periodic path; any other mesh size throws deepmd::deepmd_exception.
+std::vector<paddle::Tensor> NeighborStatOpCPUForward(
+    const paddle::Tensor& coord_tensor /*float32*/,
+    const paddle::Tensor& type_tensor /*int32*/,
+    const paddle::Tensor& natoms_tensor /*int64*/,
+    const paddle::Tensor& box_tensor /*float32*/,
+    const paddle::Tensor& mesh_tensor /*int32*/,
+    const float& rcut) {
+  CHECK_INPUT_CPU(coord_tensor);
+  CHECK_INPUT_CPU(type_tensor);
+  CHECK_INPUT_CPU(natoms_tensor);
+  CHECK_INPUT_CPU(box_tensor);
+  CHECK_INPUT_CPU(mesh_tensor);
+
+  CHECK_INPUT_DIM(coord_tensor, 2);
+  CHECK_INPUT_DIM(type_tensor, 2);
+  CHECK_INPUT_DIM(natoms_tensor, 1);
+  CHECK_INPUT_DIM(box_tensor, 2);
+  CHECK_INPUT_DIM(mesh_tensor, 1);
+  PD_CHECK(natoms_tensor.shape()[0] >= 3,
+           "number of atoms should be larger than (or equal to) 3");
+
+  const int64_t* natoms = natoms_tensor.data<int64_t>();
+  int64_t nloc = natoms[0];
+  int64_t nall = natoms[1];
+  int64_t nsamples = coord_tensor.shape()[0];
+  int64_t ntypes = natoms_tensor.shape()[0] - 2;
+
+  PD_CHECK(nsamples == type_tensor.shape()[0],
+           "number of samples should match");
+  PD_CHECK(nsamples == box_tensor.shape()[0], "number of samples should match");
+  PD_CHECK(nall * 3 == coord_tensor.shape()[1], "number of atoms should match");
+  PD_CHECK(nall == type_tensor.shape()[1], "number of atoms should match");
+  PD_CHECK(9 == box_tensor.shape()[1], "number of box should be 9");
+
+  int nei_mode = 0;
+  if (mesh_tensor.shape()[0] == 6) {
+    // manual copied pbc
+    assert(nloc == nall);
+    nei_mode = 1;
+  } else if (mesh_tensor.shape()[0] == 0) {
+    // no pbc
+    nei_mode = -1;
+  } else {
+    throw deepmd::deepmd_exception("invalid mesh tensor");
+  }
+  const bool b_norm_atom = (nei_mode == 1);  // wrap coords back into the box
+
+  std::vector<int64_t> max_nbor_size_shape = {nloc, ntypes};
+  paddle::Tensor max_nbor_size_tensor = paddle::zeros(
+      max_nbor_size_shape, type_tensor.dtype(), type_tensor.place());
+
+  const float* coord = coord_tensor.data<float>();
+  const int* type = type_tensor.data<int>();
+  const float* box = box_tensor.data<float>();
+  int* max_nbor_size = max_nbor_size_tensor.data<int>();
+
+  // set region
+  boxtensor_t boxt[9] = {0};
+  for (int dd = 0; dd < 9; ++dd) {
+    boxt[dd] = box[dd];
+  }
+  SimulationRegion<compute_t> region;
+  region.reinitBox(boxt);
+  // set & normalize coord
+  std::vector<compute_t> d_coord3(nall * 3);
+  for (int ii = 0; ii < nall; ++ii) {
+    for (int dd = 0; dd < 3; ++dd) {
+      d_coord3[ii * 3 + dd] = coord[ii * 3 + dd];
+    }
+    if (b_norm_atom) {
+      compute_t inter[3];
+      region.phys2Inter(inter, &d_coord3[3 * ii]);
+      for (int dd = 0; dd < 3; ++dd) {
+        if (inter[dd] < 0)
+          inter[dd] += 1.;
+        else if (inter[dd] >= 1)
+          inter[dd] -= 1.;
+      }
+      region.inter2Phys(&d_coord3[3 * ii], inter);
+    }
+  }
+
+  // set type
+  std::vector<int> d_type(type, type + nall);
+
+  // build nlist
+  std::vector<std::vector<int> > d_nlist_a;
+  std::vector<std::vector<int> > d_nlist_r;
+  std::vector<int> nlist_map;
+
+  if (nei_mode == 1) {
+    std::vector<double> bk_d_coord3 = d_coord3;
+    std::vector<int> bk_d_type = d_type;
+    std::vector<int> ncell, ngcell;
+    copy_coord(d_coord3, d_type, nlist_map, ncell, ngcell, bk_d_coord3,
+               bk_d_type, rcut, region);
+    std::vector<int> nat_stt(3, 0);
+    std::vector<int> ext_stt(3), ext_end(3);
+    for (int dd = 0; dd < 3; ++dd) {
+      ext_stt[dd] = -ngcell[dd];
+      ext_end[dd] = ncell[dd] + ngcell[dd];
+    }
+    ::build_nlist(d_nlist_a, d_nlist_r, d_coord3, nloc, -1, rcut, nat_stt,
+                  ncell, ext_stt, ext_end, region, ncell);
+  } else if (nei_mode == -1) {
+    ::build_nlist(d_nlist_a, d_nlist_r, d_coord3, -1, rcut, NULL);
+  } else {
+    throw deepmd::deepmd_exception("unknown neighbor mode");
+  }
+
+  int MAX_NNEI = 0;
+  for (int ii = 0; ii < nloc; ii++) {
+    const int nnei = static_cast<int>(d_nlist_r[ii].size());  // no sign mixing
+    if (MAX_NNEI < nnei) MAX_NNEI = nnei;
+  }
+  // allocate output tensor for deepmd-kit
+  std::vector<int64_t> min_nbor_dist_shape = {nloc * MAX_NNEI};
+  paddle::Tensor min_nbor_dist_tensor = paddle::full(
+      min_nbor_dist_shape, 10000.0, coord_tensor.dtype(), coord_tensor.place());
+  auto* min_nbor_dist = min_nbor_dist_tensor.data<float>();
+
+#pragma omp parallel for
+  for (int ii = 0; ii < nloc; ii++) {
+    if (d_type[ii] < 0) continue;  // virtual atom
+    for (int jj = 0; jj < static_cast<int>(d_nlist_r[ii].size()); jj++) {
+      const int j_idx = d_nlist_r[ii][jj];
+      const int j_type = d_type[j_idx];  // not `type`: that is the input ptr
+      if (j_type < 0) continue;          // virtual atom
+      max_nbor_size[ii * ntypes + j_type] += 1;
+      const compute_t rij[3] = {d_coord3[j_idx * 3 + 0] - d_coord3[ii * 3 + 0],
+                                d_coord3[j_idx * 3 + 1] - d_coord3[ii * 3 + 1],
+                                d_coord3[j_idx * 3 + 2] - d_coord3[ii * 3 + 2]};
+      min_nbor_dist[ii * MAX_NNEI + jj] =
+          sqrt(rij[0] * rij[0] + rij[1] * rij[1] + rij[2] * rij[2]);
+    }
+  }
+  return {max_nbor_size_tensor, min_nbor_dist_tensor};
+}
+
+std::vector<paddle::Tensor> NeighborStatForward(
+    const paddle::Tensor& coord_tensor,  /*float32*/
+    const paddle::Tensor& type_tensor,   /*int32*/
+    const paddle::Tensor& natoms_tensor, /*int64*/
+    const paddle::Tensor& box_tensor,    /*float32*/
+    const paddle::Tensor& mesh_tensor,   /*int32*/
+    float rcut) {
+  // Only a CPU kernel exists; reject any other placement of `coord_tensor`.
+  if (!coord_tensor.is_cpu()) {
+    PD_THROW("NeighborStatForward only support CPU device.");
+  }
+  return NeighborStatOpCPUForward(coord_tensor, type_tensor, natoms_tensor,
+                                  box_tensor, mesh_tensor, rcut);
+}
+
+PD_BUILD_OP(neighbor_stat)
+    .Inputs({"coord", "type", "natoms", "box", "mesh"})
+    .Outputs({"max_nbor_size", "min_nbor_dist"})
+    .Attrs({"rcut: float"})
+    .SetKernelFn(PD_KERNEL(NeighborStatForward));
diff --git a/source/lib/paddle_src/paddle_prod_env_mat.cc b/source/lib/paddle_src/paddle_prod_env_mat.cc
new file mode 100644
index 0000000000..824c993d46
--- /dev/null
+++ b/source/lib/paddle_src/paddle_prod_env_mat.cc
@@ -0,0 +1,540 @@
+#include <iomanip>
+#include <vector>
+
+#include "coord.h"
+#include "env_mat.h"
+#include "fmt_nlist.h"
+#include "gpu_cuda.h"
+#include "neighbor_list.h"
+#include "paddle/extension.h"
+#include "prod_env_mat.h"
+#include "region.h"
+#include "utilities.h"
+
+typedef long long int_64;
+
+#define CHECK_INPUT(x) PD_CHECK(x.is_cpu(), #x " must be a CPU Tensor.")  // checks that tensor x is placed on the CPU
+#define CHECK_INPUT_DIM(x, value) \
+  PD_CHECK(x.shape().size() == value, #x "'s dim should be " #value ".")  // checks that x has exactly `value` dimensions
+// #define CHECK_INPUT(x) PD_CHECK(x.is_cpu(), #x " must be a CPU Tensor.")
+
+// void cum_sum(std::vector<int>& sec, const std::vector<int>& n_sel);
+
+template <typename FPTYPE>  // Builds the CPU neighbor list, doubling the per-atom capacity and retrying while build_nlist_cpu returns non-zero.
+static int _build_nlist_cpu(std::vector<int> &ilist,
+                            std::vector<int> &numneigh,
+                            std::vector<int *> &firstneigh,  // out: firstneigh[ii] is pointed at jlist[ii]'s storage
+                            std::vector<std::vector<int>> &jlist,  // out: per-atom buffers, each resized to mem_nnei
+                            int &max_nnei,  // handed to build_nlist_cpu by address
+                            int &mem_nnei,  // in/out: per-atom capacity; doubled after every failed trial
+                            const FPTYPE *coord,
+                            const int &nloc,
+                            const int &new_nall,
+                            const int &max_nnei_trial,  // upper bound on the number of build attempts
+                            const float &rcut_r) {
+  int tt;  // trial counter, declared outside the loop so it can be inspected afterwards
+  for (tt = 0; tt < max_nnei_trial; ++tt) {
+    for (int ii = 0; ii < nloc; ++ii) {
+      jlist[ii].resize(mem_nnei);
+      firstneigh[ii] = &jlist[ii][0];  // re-taken on every trial because resize may reallocate
+    }
+    deepmd::InputNlist inlist(nloc, &ilist[0], &numneigh[0], &firstneigh[0]);
+    int ret = build_nlist_cpu(inlist, &max_nnei, coord, nloc, new_nall,
+                              mem_nnei, rcut_r);
+    if (ret == 0) {  // a zero return ends the retry loop (treated as success)
+      break;
+    } else {
+      mem_nnei *= 2;  // otherwise grow the capacity and try again
+    }
+  }
+  return (tt != max_nnei_trial);  // 1 if some trial succeeded, 0 if all max_nnei_trial trials failed
+}
+
+template <typename FPTYPE>  // Normalizes the coordinates into the box region, then copies them via copy_coord_cpu, doubling the buffer size and retrying on failure.
+static int _norm_copy_coord_cpu(std::vector<FPTYPE> &coord_cpy,  // out: resized to mem_cpy * 3 and filled by copy_coord_cpu
+                                std::vector<int> &type_cpy,  // out: resized to mem_cpy
+                                std::vector<int> &idx_mapping,  // out: resized to mem_cpy
+                                int &nall,  // in/out: passed by address to copy_coord_cpu (presumably updated to the copied atom count - confirm)
+                                int &mem_cpy,  // in/out: buffer capacity in atoms; doubled after every failed trial
+                                const FPTYPE *coord,
+                                const FPTYPE *box,
+                                const int *type,
+                                const int &nloc,
+                                const int &max_cpy_trial,  // upper bound on the number of copy attempts
+                                const float &rcut_r) {
+  std::vector<FPTYPE> tmp_coord(nall * 3);  // private copy so the caller's coordinates are not modified
+  std::copy(coord, coord + nall * 3, tmp_coord.begin());
+  deepmd::Region<FPTYPE> region;
+  init_region_cpu(region, box);
+  normalize_coord_cpu(&tmp_coord[0], nall, region);
+  int tt;  // trial counter, inspected after the loop
+  for (tt = 0; tt < max_cpy_trial; ++tt) {
+    coord_cpy.resize(mem_cpy * 3);
+    type_cpy.resize(mem_cpy);
+    idx_mapping.resize(mem_cpy);
+    int ret =
+        copy_coord_cpu(&coord_cpy[0], &type_cpy[0], &idx_mapping[0], &nall,
+                       &tmp_coord[0], type, nloc, mem_cpy, rcut_r, region);
+    if (ret == 0) {  // a zero return ends the retry loop (treated as success)
+      break;
+    } else {
+      mem_cpy *= 2;  // otherwise grow the buffers and try again
+    }
+  }
+  return (tt != max_cpy_trial);  // 1 if some trial succeeded, 0 if all max_cpy_trial trials failed
+}
+
+template <typename FPTYPE>  // Fills `inlist` for one frame: either builds the neighbor list here (nei_mode != 3) or reads list pointers out of the mesh data (nei_mode == 3).
+static void _prepare_coord_nlist_cpu(FPTYPE const **coord,  // in/out: redirected to coord_cpy's storage when nei_mode == 1
+                                     std::vector<FPTYPE> &coord_cpy,
+                                     int const **type,  // in/out: redirected to type_cpy's storage when nei_mode == 1
+                                     std::vector<int> &type_cpy,
+                                     std::vector<int> &idx_mapping,
+                                     deepmd::InputNlist &inlist,  // out: inum/ilist/numneigh/firstneigh are set here
+                                     std::vector<int> &ilist,
+                                     std::vector<int> &numneigh,
+                                     std::vector<int *> &firstneigh,
+                                     std::vector<std::vector<int>> &jlist,
+                                     int &new_nall,
+                                     int &mem_cpy,
+                                     int &mem_nnei,
+                                     int &max_nbor_size,
+                                     const FPTYPE *box,
+                                     const int *mesh_tensor_data,  // only read when nei_mode == 3
+                                     const int &nloc,
+                                     const int &nei_mode,
+                                     const float &rcut_r,
+                                     const int &max_cpy_trial,
+                                     const int &max_nnei_trial) {
+  inlist.inum = nloc;
+  if (nei_mode != 3) {
+    // build nlist by myself
+    // normalize and copy coord
+    if (nei_mode == 1) {
+      int copy_ok = _norm_copy_coord_cpu(coord_cpy, type_cpy, idx_mapping,
+                                         new_nall, mem_cpy, *coord, box, *type,
+                                         nloc, max_cpy_trial, rcut_r);
+      PD_CHECK(copy_ok, "cannot allocate mem for copied coords");
+      *coord = &coord_cpy[0];  // from here on the caller sees the copied coordinates
+      *type = &type_cpy[0];  // ... and the copied types
+    }
+    // build nlist
+    int build_ok = _build_nlist_cpu(ilist, numneigh, firstneigh, jlist,
+                                    max_nbor_size, mem_nnei, *coord, nloc,
+                                    new_nall, max_nnei_trial, rcut_r);
+    PD_CHECK(build_ok, "cannot allocate mem for nlist");
+    inlist.ilist = &ilist[0];
+    inlist.numneigh = &numneigh[0];
+    inlist.firstneigh = &firstneigh[0];
+  } else {
+    // copy pointers to nlist data
+    memcpy(&inlist.ilist, 4 + mesh_tensor_data, sizeof(int *));  // pointer bytes are read starting at int offset 4 of the mesh data
+    memcpy(&inlist.numneigh, 8 + mesh_tensor_data, sizeof(int *));  // ... at int offset 8
+    memcpy(&inlist.firstneigh, 12 + mesh_tensor_data, sizeof(int **));  // ... at int offset 12
+    max_nbor_size = max_numneigh(inlist);  // taken from the supplied list instead of from a build
+  }
+}
+
+static void _map_nlist_cpu(int *nlist,  // in/out: nloc * nnei entries, rewritten in place
+                           const int *idx_mapping,  // lookup table indexed by the current nlist entry
+                           const int &nloc,
+                           const int &nnei) {
+  for (int ii = 0; ii < nloc; ++ii) {
+    for (int jj = 0; jj < nnei; ++jj) {
+      int record = nlist[ii * nnei + jj];
+      if (record >= 0) {  // negative entries are left untouched
+        nlist[ii * nnei + jj] = idx_mapping[record];  // replace the index by its mapped value
+      }
+    }
+  }
+}
+
+std::vector<paddle::Tensor> ProdEnvMatACUDAForward(  // declaration only; ProdEnvMatAForward in this file calls it when coord_tensor.is_gpu()
+    const paddle::Tensor &coord_tensor,
+    const paddle::Tensor &atype_tensor,
+    const paddle::Tensor &box_tensor,  // note the order: box comes before mesh
+    const paddle::Tensor &mesh_tensor,
+    const paddle::Tensor &t_avg_tensor,
+    const paddle::Tensor &t_std_tensor,
+    const paddle::Tensor &natoms_tensor,
+    float rcut_a,
+    float rcut_r,
+    float rcut_r_smth,
+    std::vector<int> sel_a,
+    std::vector<int> sel_r);
+
+template <typename FPTYPE>  // Computes, for each of the nloc atoms, the formatted neighbor list, the environment matrix (normalized by avg/std), its derivative and the rij vectors.
+void deepmd::prod_env_mat_a_cpu(FPTYPE *em,  // out: nloc * nem values, nem = sec.back() * 4
+                                FPTYPE *em_deriv,  // out: nloc * nem * 3 values
+                                FPTYPE *rij,  // out: nloc * nnei * 3 values
+                                int *nlist,  // out: nloc * nnei values
+                                const FPTYPE *coord,  // nall * 3 values are read
+                                const int *type,  // used to pick the avg/std row of each atom
+                                const InputNlist &inlist,
+                                const int max_nbor_size,  // used only as a reserve() hint for the per-atom lists
+                                const FPTYPE *avg,
+                                const FPTYPE *std,
+                                const int nloc,
+                                const int nall,
+                                const float rcut,
+                                const float rcut_smth,
+                                const std::vector<int> sec,
+                                const int *f_type) {
+  if (f_type == NULL) {  // fall back to `type` when no separate f_type array is supplied
+    f_type = type;
+  }
+  const int nnei = sec.back();
+  const int nem = nnei * 4;  // four env-mat components per neighbor slot
+
+  // copy coord into a local vector (no normalization is performed in this function)
+  std::vector<FPTYPE> d_coord3(nall * 3);
+  for (int ii = 0; ii < nall; ++ii) {
+    for (int dd = 0; dd < 3; ++dd) {
+      d_coord3[ii * 3 + dd] = coord[ii * 3 + dd];
+    }
+  }
+
+  // set type
+  std::vector<int> d_f_type(nall);
+  for (int ii = 0; ii < nall; ++ii) {
+    d_f_type[ii] = f_type[ii];
+  }
+
+  // build nlist
+  std::vector<std::vector<int>> d_nlist_a(nloc);
+
+  assert(nloc == inlist.inum);
+  for (unsigned ii = 0; ii < nloc; ++ii) {
+    d_nlist_a[ii].reserve(max_nbor_size);
+  }
+  for (unsigned ii = 0; ii < nloc; ++ii) {
+    int i_idx = inlist.ilist[ii];  // neighbors of list row ii are stored under atom index ilist[ii]
+    for (unsigned jj = 0; jj < inlist.numneigh[ii]; ++jj) {
+      int j_idx = inlist.firstneigh[ii][jj];
+      d_nlist_a[i_idx].push_back(j_idx);
+    }
+  }
+
+#pragma omp parallel for
+  for (int ii = 0; ii < nloc; ++ii) {  // each iteration writes only the output slices of atom ii
+    std::vector<int> fmt_nlist_a;
+    int ret = format_nlist_i_cpu(fmt_nlist_a, d_coord3, d_f_type, ii,
+                                 d_nlist_a[ii], rcut, sec);  // NOTE(review): the return value `ret` is never checked
+    std::vector<FPTYPE> d_em_a;
+    std::vector<FPTYPE> d_em_a_deriv;
+    std::vector<FPTYPE> d_em_r;  // declared but not used below
+    std::vector<FPTYPE> d_em_r_deriv;  // declared but not used below
+    std::vector<FPTYPE> d_rij_a;
+    deepmd::env_mat_a_cpu(d_em_a, d_em_a_deriv, d_rij_a, d_coord3, d_f_type, ii,
+                          fmt_nlist_a, sec, rcut_smth, rcut);
+
+    // check sizes
+    assert(d_em_a.size() == nem);
+    assert(d_em_a_deriv.size() == nem * 3);
+    assert(d_rij_a.size() == nnei * 3);
+    assert(fmt_nlist_a.size() == nnei);
+    // record outputs
+    for (int jj = 0; jj < nem; ++jj) {
+      if (type[ii] >= 0) {
+        em[ii * nem + jj] =
+            (d_em_a[jj] - avg[type[ii] * nem + jj]) / std[type[ii] * nem + jj];  // shift by avg and scale by std of the atom's type
+      } else {
+        em[ii * nem + jj] = 0;  // atoms with a negative type get an all-zero row
+      }
+    }
+    for (int jj = 0; jj < nem * 3; ++jj) {
+      if (type[ii] >= 0) {
+        em_deriv[ii * nem * 3 + jj] =
+            d_em_a_deriv[jj] / std[type[ii] * nem + jj / 3];  // jj / 3 maps each derivative component back to its env-mat entry
+      } else {
+        em_deriv[ii * nem * 3 + jj] = 0;
+      }
+    }
+    for (int jj = 0; jj < nnei * 3; ++jj) {
+      rij[ii * nnei * 3 + jj] = d_rij_a[jj];
+    }
+    for (int jj = 0; jj < nnei; ++jj) {
+      nlist[ii * nnei + jj] = fmt_nlist_a[jj];
+    }
+  }
+}
+
+template <typename data_t>  // Runs the CPU env-mat computation frame by frame over nsamples frames, writing into the flat output buffers.
+void prod_env_mat_a_cpu_forward_kernel(int nsamples,
+                                       int nloc,
+                                       int ndescrpt,
+                                       int nnei,
+                                       int nall,
+                                       int mem_cpy,  // by value here, but passed by reference below, so growth carries over to later frames
+                                       int mem_nnei,  // same as mem_cpy
+                                       int max_nbor_size,  // same as mem_cpy
+                                       const int *mesh_tensor_data,
+                                       int nei_mode,
+                                       float rcut_a,  // not referenced in this function
+                                       float rcut_r,
+                                       float rcut_r_smth,
+                                       int max_cpy_trial,
+                                       int max_nnei_trial,
+                                       bool b_nlist_map,
+                                       const std::vector<int> &sec_a,
+                                       const std::vector<int> &sec_r,  // not referenced in this function
+                                       data_t *p_em,
+                                       data_t *p_em_deriv,
+                                       data_t *p_rij,
+                                       int *p_nlist,
+                                       const data_t *p_coord,
+                                       const data_t *p_box,
+                                       const data_t *avg,
+                                       const data_t *std,
+                                       const int *p_type) {
+  for (size_t ff = 0; ff < nsamples; ++ff) {
+    data_t *em = p_em + ff * nloc * ndescrpt;  // per-frame slices of the flat input/output buffers
+    data_t *em_deriv = p_em_deriv + ff * nloc * ndescrpt * 3;
+    data_t *rij = p_rij + ff * nloc * nnei * 3;
+    int *nlist = p_nlist + ff * nloc * nnei;
+    const data_t *coord = p_coord + ff * nall * 3;
+    const data_t *box = p_box + ff * 9;  // nine box values per frame
+    const int *type = p_type + ff * nall;
+
+    deepmd::InputNlist inlist;
+    // some buffers, be freed after the evaluation of this frame
+    std::vector<int> idx_mapping;
+    std::vector<int> ilist(nloc), numneigh(nloc);
+    std::vector<int *> firstneigh(nloc);
+    std::vector<std::vector<int>> jlist(nloc);
+    std::vector<data_t> coord_cpy;
+    std::vector<int> type_cpy;
+    int frame_nall = nall;  // per-frame copy because the prepare step takes it by non-const reference
+    // prepare coord and nlist
+    _prepare_coord_nlist_cpu<data_t>(
+        &coord, coord_cpy, &type, type_cpy, idx_mapping, inlist, ilist,
+        numneigh, firstneigh, jlist, frame_nall, mem_cpy, mem_nnei,
+        max_nbor_size, box, mesh_tensor_data, nloc, nei_mode, rcut_r,
+        max_cpy_trial, max_nnei_trial);  // may redirect coord/type to coord_cpy/type_cpy
+    // launch the cpu compute function
+    deepmd::prod_env_mat_a_cpu(em, em_deriv, rij, nlist, coord, type, inlist,
+                               max_nbor_size, avg, std, nloc, frame_nall,
+                               rcut_r, rcut_r_smth, sec_a);  // f_type is not passed; presumably defaulted in the header declaration - confirm
+    // do nlist mapping if coords were copied
+    if (b_nlist_map) _map_nlist_cpu(nlist, &idx_mapping[0], nloc, nnei);
+  }
+}
+
+std::vector<paddle::Tensor> ProdEnvMatACPUForward(  // CPU forward of prod_env_mat_a: validates inputs, allocates the four outputs and runs the typed kernel
+    const paddle::Tensor &coord_tensor,
+    const paddle::Tensor &atype_tensor,
+    const paddle::Tensor &box_tensor,
+    const paddle::Tensor &mesh_tensor,
+    const paddle::Tensor &t_avg_tensor,
+    const paddle::Tensor &t_std_tensor,
+    const paddle::Tensor &natoms_tensor,
+    float rcut_a,
+    float rcut_r,
+    float rcut_r_smth,
+    std::vector<int> sel_a,
+    std::vector<int> sel_r) {
+  CHECK_INPUT(coord_tensor);  // every input tensor must be on the CPU
+  CHECK_INPUT(atype_tensor);
+  CHECK_INPUT(natoms_tensor);
+  CHECK_INPUT(box_tensor);
+  CHECK_INPUT(mesh_tensor);
+  CHECK_INPUT(t_avg_tensor);
+  CHECK_INPUT(t_std_tensor);
+
+  std::vector<int> sec_a;
+  std::vector<int> sec_r;
+  int ndescrpt, ndescrpt_a, ndescrpt_r;
+  int nnei, nnei_a, nnei_r, max_nbor_size;
+  int mem_cpy, max_cpy_trial;
+  int mem_nnei, max_nnei_trial;
+  std::string device;  // NOTE(review): never used in this function
+  // int* array_int = NULL;
+  // unsigned long long* array_longlong = NULL;
+  // deepmd::InputNlist gpu_inlist;
+  // int* nbor_list_dev = NULL;
+  // float nloc_f, nall_f;
+
+  deepmd::cum_sum(sec_a, sel_a);
+  deepmd::cum_sum(sec_r, sel_r);
+  ndescrpt_a = sec_a.back() * 4;  // four descriptor values per angular neighbor slot
+  ndescrpt_r = sec_r.back() * 1;  // one per radial slot (sec_r.back() is required to be 0 below)
+  ndescrpt = ndescrpt_a + ndescrpt_r;
+  nnei_a = sec_a.back();
+  nnei_r = sec_r.back();
+  nnei = nnei_a + nnei_r;
+  max_nbor_size = 1024;  // initial value; the prepare step overwrites it through a reference
+  max_cpy_trial = 100;  // retry limit for the coordinate copy
+  mem_cpy = 256;  // initial copy-buffer capacity, doubled on failure
+  max_nnei_trial = 100;  // retry limit for the neighbor-list build
+  mem_nnei = 256;  // initial per-atom neighbor capacity, doubled on failure
+
+  CHECK_INPUT_DIM(coord_tensor, 2);
+  CHECK_INPUT_DIM(atype_tensor, 2);
+  CHECK_INPUT_DIM(natoms_tensor, 1);
+  CHECK_INPUT_DIM(box_tensor, 2);
+  CHECK_INPUT_DIM(mesh_tensor, 1);
+  CHECK_INPUT_DIM(t_avg_tensor, 2);
+  CHECK_INPUT_DIM(t_std_tensor, 2);
+
+  PD_CHECK(sec_r.back() == 0,
+           "Rotational free descriptor only support all-angular information: "
+           "sel_r should be all zero.");
+  PD_CHECK(natoms_tensor.shape()[0] >= 3,
+           "Number of atoms should be larger than (or equal to) 3");  // what is checked is the length of the natoms tensor
+  // The device is chosen on the Python side, not inside this custom op.
+  const int *natoms = natoms_tensor.data<int>();  // NOTE(review): reads natoms as 32-bit int - confirm the tensor's dtype at the call site
+  int nloc = natoms[0];
+  int nall = natoms[1];
+  int ntypes = natoms_tensor.shape()[0] - 2;  // the first two entries are nloc and nall; the rest are counted as types
+  int nsamples = coord_tensor.shape()[0];
+  // check the sizes
+  PD_CHECK(nsamples == atype_tensor.shape()[0],
+           "number of samples should match");
+  PD_CHECK(nsamples == box_tensor.shape()[0], "number of samples should match");
+  PD_CHECK(ntypes == t_avg_tensor.shape()[0], "number of avg should be ntype");
+  PD_CHECK(ntypes == t_std_tensor.shape()[0], "number of std should be ntype");
+  PD_CHECK(nall * 3 == coord_tensor.shape()[1], "number of atoms should match");
+  PD_CHECK(nall == atype_tensor.shape()[1], "number of atoms should match");
+  PD_CHECK(9 == box_tensor.shape()[1], "number of box should be 9");
+  PD_CHECK(ndescrpt == t_avg_tensor.shape()[1],
+           "number of avg should be ndescrpt");
+  PD_CHECK(ndescrpt == t_std_tensor.shape()[1],
+           "number of std should be ndescrpt");
+  PD_CHECK(ntypes == int(sel_a.size()),
+           "number of types should match the length of sel array");
+  PD_CHECK(ntypes == int(sel_r.size()),
+           "number of types should match the length of sel array");
+
+  int nei_mode = 0;  // selected from the length of the mesh tensor
+  bool b_nlist_map = false;  // true only for mode 1, where coordinates are copied and indices must be mapped back
+  if (mesh_tensor.shape()[0] == 16) {
+    // lammps neighbor list
+    nei_mode = 3;
+  } else if (mesh_tensor.shape()[0] == 6) {
+    // manual copied pbc
+    assert(nloc == nall);  // assert is compiled out when NDEBUG is defined
+    nei_mode = 1;
+    b_nlist_map = true;
+  } else if (mesh_tensor.shape()[0] == 0) {
+    // no pbc
+    assert(nloc == nall);  // assert is compiled out when NDEBUG is defined
+    nei_mode = -1;
+  } else {
+    PD_THROW("Invalid mesh tensor");
+  }
+
+  // Create output tensors shape
+  std::vector<int64_t> descrpt_shape{nsamples, (int64_t)nloc * ndescrpt};
+  std::vector<int64_t> descrpt_deriv_shape{nsamples,
+                                           (int64_t)nloc * ndescrpt * 3};
+  std::vector<int64_t> rij_shape{nsamples, (int64_t)nloc * nnei * 3};
+  std::vector<int64_t> nlist_shape{nsamples, (int64_t)nloc * nnei};
+  // define output tensor
+  paddle::Tensor descrpt_tensor =
+      paddle::empty(descrpt_shape, coord_tensor.dtype(), coord_tensor.place());
+
+  paddle::Tensor descrpt_deriv_tensor = paddle::empty(
+      descrpt_deriv_shape, coord_tensor.dtype(), coord_tensor.place());
+
+  paddle::Tensor rij_tensor =
+      paddle::empty(rij_shape, coord_tensor.dtype(), coord_tensor.place());
+
+  paddle::Tensor nlist_tensor =
+      paddle::empty(nlist_shape, paddle::DataType::INT32, coord_tensor.place());  // nlist is always int32, independent of coord's dtype
+  PD_DISPATCH_FLOATING_TYPES(
+      coord_tensor.type(), "prod_env_mat_a_cpu_forward_kernel", ([&] {
+        prod_env_mat_a_cpu_forward_kernel<data_t>(
+            nsamples, nloc, ndescrpt, nnei, nall, mem_cpy, mem_nnei,
+            max_nbor_size, mesh_tensor.data<int>(), nei_mode, rcut_a, rcut_r,
+            rcut_r_smth, max_cpy_trial, max_nnei_trial, b_nlist_map, sec_a,
+            sec_r, descrpt_tensor.data<data_t>(),
+            descrpt_deriv_tensor.data<data_t>(), rij_tensor.data<data_t>(),
+            nlist_tensor.data<int>(), coord_tensor.data<data_t>(),
+            box_tensor.data<data_t>(), t_avg_tensor.data<data_t>(),
+            t_std_tensor.data<data_t>(), atype_tensor.data<int>());  // box/avg/std are read with coord's element type data_t
+      }));
+
+  return {descrpt_tensor, descrpt_deriv_tensor, rij_tensor, nlist_tensor};  // same order as the op's registered outputs
+}
+
+std::vector<paddle::Tensor> ProdEnvMatAForward(  // kernel function of the prod_env_mat_a op: picks the CUDA or CPU forward from coord_tensor's device
+    const paddle::Tensor &coord_tensor,
+    const paddle::Tensor &atype_tensor,
+    const paddle::Tensor &box_tensor,   // third op input, registered as "box"
+    const paddle::Tensor &mesh_tensor,  // fourth op input, registered as "mesh"
+    const paddle::Tensor &t_avg_tensor,
+    const paddle::Tensor &t_std_tensor,
+    const paddle::Tensor &natoms_tensor,
+    float rcut_a,
+    float rcut_r,
+    float rcut_r_smth,
+    std::vector<int> sel_a,
+    std::vector<int> sel_r) {
+  if (coord_tensor.is_gpu()) {
+    return ProdEnvMatACUDAForward(
+        coord_tensor, atype_tensor, box_tensor, mesh_tensor, t_avg_tensor,
+        t_std_tensor, natoms_tensor.copy_to(paddle::CPUPlace(), false), rcut_a,  // natoms goes to the CUDA path as a CPU copy; NOTE(review): the copy is requested non-blocking - confirm it is complete before natoms is read
+        rcut_r, rcut_r_smth, sel_a, sel_r);
+  } else {
+    return ProdEnvMatACPUForward(
+        coord_tensor, atype_tensor, box_tensor, mesh_tensor, t_avg_tensor,
+        t_std_tensor, natoms_tensor, rcut_a, rcut_r, rcut_r_smth, sel_a, sel_r);
+  }
+}
+
+std::vector<std::vector<int64_t>> ProdEnvMatAInferShape(  // shape-inference function of prod_env_mat_a; returns the shapes of descrpt, descrpt_deriv, rij and nlist
+    std::vector<int64_t> coord_shape,  // only coord_shape[0] (number of frames) is read
+    std::vector<int64_t> atype_shape,
+    std::vector<int64_t> box_shape,
+    std::vector<int64_t> mesh_shape,
+    std::vector<int64_t> t_avg_shape,
+    std::vector<int64_t> t_std_shape,
+    std::vector<int64_t> natoms_shape,
+    float rcut_a,
+    float rcut_r,
+    float rcut_r_smth,
+    const std::vector<int> &sel_a,
+    const std::vector<int> &sel_r) {
+  int64_t nloc = /*natoms[0]*/ 192;  // NOTE(review): atom count is hard-coded to 192 because only shapes (not natoms values) are available here; inferred shapes are wrong for any other nloc
+  //   int64_t nall = /*natoms[1]*/ 192;
+
+  std::vector<int> sec_a;
+  std::vector<int> sec_r;
+  deepmd::cum_sum(sec_a, sel_a);
+  deepmd::cum_sum(sec_r, sel_r);
+
+  int64_t nsamples = coord_shape[0];
+  int64_t ndescrpt_a = sec_a.back() * 4;  // same descriptor-size arithmetic as in ProdEnvMatACPUForward
+  int64_t ndescrpt_r = sec_r.back() * 1;
+  int64_t ndescrpt = ndescrpt_a + ndescrpt_r;
+
+  int64_t nnei_a = sec_a.back();
+  int64_t nnei_r = sec_r.back();
+  int64_t nnei = nnei_a + nnei_r;
+
+  std::vector<int64_t> descrpt_shape = {nsamples, nloc * ndescrpt};
+  std::vector<int64_t> descrpt_deriv_shape = {nsamples, nloc * ndescrpt * 3};
+  std::vector<int64_t> rij_shape = {nsamples, nloc * nnei * 3};
+  std::vector<int64_t> nlist_shape = {nsamples, nloc * nnei};
+  return {descrpt_shape, descrpt_deriv_shape, rij_shape, nlist_shape};  // same order as the op's registered outputs
+}
+
+std::vector<paddle::DataType> ProdEnvMatAInferDtype(  // dtype-inference function of prod_env_mat_a; one entry per registered output
+    paddle::DataType coord_dtype,  // the only input dtype that influences the result
+    paddle::DataType atype_dtype,
+    paddle::DataType box_dtype,
+    paddle::DataType mesh_dtype,
+    paddle::DataType t_avg_dtype,
+    paddle::DataType t_std_dtype,
+    paddle::DataType natoms_dtype) {
+  return {coord_dtype, coord_dtype, coord_dtype, paddle::DataType::INT32};  // descrpt, descrpt_deriv and rij follow coord; nlist is int32, matching how ProdEnvMatACPUForward allocates it
+}
+
+PD_BUILD_OP(prod_env_mat_a)  // registers the Paddle custom op named prod_env_mat_a
+    .Inputs({"coord", "atype", "box", "mesh", "t_avg", "t_std", "natoms"})  // positional order of the kernel function's tensor parameters (box before mesh)
+    .Outputs({"descrpt", "descrpt_deriv", "rij", "nlist"})  // order of the tensors returned by the forward functions
+    .Attrs({"rcut_a: float", "rcut_r: float", "rcut_r_smth: float",
+            "sel_a: std::vector<int>", "sel_r: std::vector<int>"})  // passed after the tensors, in this order
+    .SetKernelFn(PD_KERNEL(ProdEnvMatAForward))
+    .SetInferShapeFn(PD_INFER_SHAPE(ProdEnvMatAInferShape))
+    .SetInferDtypeFn(PD_INFER_DTYPE(ProdEnvMatAInferDtype));
diff --git a/source/lib/paddle_src/paddle_prod_env_mat.cu b/source/lib/paddle_src/paddle_prod_env_mat.cu
new file mode 100644
index 0000000000..c18fb7d687
--- /dev/null
+++ b/source/lib/paddle_src/paddle_prod_env_mat.cu
@@ -0,0 +1,1176 @@
+#include <cub/block/block_load.cuh>
+#include <cub/block/block_radix_sort.cuh>
+#include <cub/block/block_store.cuh>
+#include <type_traits>
+
+#include "paddle/extension.h"
+
+#define GOOGLE_CUDA 1
+
+#include <iomanip>
+#include <vector>
+
+#include "coord.h"
+#include "fmt_nlist.h"
+#include "gpu_cuda.h"
+#include "neighbor_list.h"
+#include "prod_env_mat.h"
+#include "region.h"
+#include "utilities.h"
+
+typedef long long int_64;
+
+#define CHECK_INPUT(x) PD_CHECK(x.is_gpu(), #x " must be a GPU Tensor.")  // in this CUDA file CHECK_INPUT tests for GPU placement
+#define CHECK_INPUT_ON_CPU(x) PD_CHECK(x.is_cpu(), #x " must be a CPU Tensor.")  // for inputs that must stay on the host
+#define CHECK_INPUT_DIM(x, value) \
+  PD_CHECK(x.shape().size() == value, #x "'s dim should be " #value ".")  // checks that x has exactly `value` dimensions
+
+__device__ inline double _sqrt(double x) { return sqrt(x); }  // overload set so templated device code can call _sqrt for either precision
+__device__ inline float _sqrt(float x) { return sqrtf(x); }  // single-precision variant
+__device__ inline double _rsqrt(double x) { return rsqrt(x); }  // reciprocal square root, double precision
+__device__ inline float _rsqrt(float x) { return rsqrtf(x); }  // reciprocal square root, single precision
+
+template <typename FPTYPE>  // forward declaration only; the definition is not part of this section of the file
+static int _norm_copy_coord_gpu(std::vector<paddle::Tensor>* tensor_list,
+                                FPTYPE*& coord_cpy,  // pointer passed by reference, so the callee can repoint it
+                                int*& type_cpy,  // same
+                                int*& idx_mapping,  // same
+                                int& nall,
+                                int& mem_cpy,
+                                const FPTYPE* coord,
+                                const FPTYPE* box,
+                                const int* type,
+                                const int& nloc,
+                                const int& max_cpy_trial,
+                                const float& rcut_r);
+
+template <typename FPTYPE>  // forward declaration only; the definition is not part of this section of the file
+static int _build_nlist_gpu(std::vector<paddle::Tensor>* tensor_list,
+                            int*& ilist,  // pointers passed by reference, so the callee can repoint them
+                            int*& numneigh,
+                            int**& firstneigh,
+                            int*& jlist,
+                            int& max_nnei,
+                            int& mem_nnei,
+                            const FPTYPE* coord,
+                            const int& nloc,
+                            const int& new_nall,
+                            const int& max_nnei_trial,
+                            const float& rcut_r);
+
+static void _map_nlist_gpu(int* nlist,  // forward declaration only; the definition is not part of this section of the file
+                           const int* idx_mapping,
+                           const int& nloc,
+                           const int& nnei);
+
+template <typename FPTYPE>  // forward declaration only; the definition is not part of this section of the file
+static void _prepare_coord_nlist_gpu(std::vector<paddle::Tensor>* tensor_list,
+                                     FPTYPE const** coord,
+                                     FPTYPE*& coord_cpy,  // pointers passed by reference, so the callee can repoint them
+                                     int const** type,
+                                     int*& type_cpy,
+                                     int*& idx_mapping,
+                                     deepmd::InputNlist& inlist,
+                                     int*& ilist,
+                                     int*& numneigh,
+                                     int**& firstneigh,
+                                     int*& jlist,
+                                     int*& nbor_list_dev,
+                                     int& new_nall,
+                                     int& mem_cpy,
+                                     int& mem_nnei,
+                                     int& max_nbor_size,
+                                     const FPTYPE* box,
+                                     const int* mesh_tensor_data,
+                                     const int mesh_tensor_size,  // extra parameter that the CPU counterpart in paddle_prod_env_mat.cc does not take
+                                     const int& nloc,
+                                     const int& nei_mode,
+                                     const float& rcut_r,
+                                     const int& max_cpy_trial,
+                                     const int& max_nnei_trial);
+
+template <typename FPTYPE>  // Packs (type, distance, index) of one neighbor into a single 64-bit key.
+__device__ inline uint_64 encoding_nbor_info(const int type,
+                                             const FPTYPE dist,
+                                             const int index) {
+  // nbor info checking:
+  // the type of nbor atom must be smaller than 128
+  // the distance of center atom between nbor atom must be smaller than 128
+  // the index of nbor atom(including ghost region) must be smaller than
+  // 16777216(1 << 24)
+  if (type >= 128 || dist >= (FPTYPE)128.0 || index >= (1 << 24)) {
+    asm("trap;");  // out-of-range input aborts the kernel via a trap instruction
+  }
+  return ((uint_64)type << 57) +  // type occupies the bits from 57 upwards
+         (uint_64)((double)dist * ((uint_64)1 << 50)) / (1 << 24) * (1 << 24) +  // dist scaled by 2^50, low 24 bits cleared by the integer divide/multiply
+         index;  // index fills the low 24 bits
+}
+
+__device__ inline void decoding_nbor_info(int& type,  // out: key >> 57
+                                          int& index,  // out: low 24 bits of key
+                                          const uint_64 key) {
+  type = key >> 57;
+  index = key & 0xFFFFFF;  // the distance field of the key is not decoded
+}
+
+template <typename FPTYPE>  // Inverts ilist: i_idx[ilist[k]] = k. FPTYPE is used as the index element type here, since ilist values subscript i_idx.
+__global__ void get_i_idx(FPTYPE* i_idx, const int nloc, const FPTYPE* ilist) {
+  const unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;  // one thread per ilist entry
+  if (idx >= nloc) {  // surplus threads of the last block do nothing
+    return;
+  }
+  i_idx[ilist[idx]] = idx;
+}
+
+// common part of prod_env_mat: each thread block sorts one tile of BLOCK_THREADS * ITEMS_PER_THREAD keys
+template <typename Key, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__launch_bounds__(BLOCK_THREADS) __global__
+    void BlockSortKernel(Key* d_in,
+                         Key* d_out)  // Tile of output
+{
+  enum { TILE_SIZE = BLOCK_THREADS * ITEMS_PER_THREAD };  // keys handled by one block
+  // Specialize BlockLoad type for our thread block (uses warp-striped loads for
+  // coalescing, then transposes in shared memory to a blocked arrangement)
+  typedef cub::BlockLoad<Key, BLOCK_THREADS, ITEMS_PER_THREAD,
+                         cub::BLOCK_LOAD_WARP_TRANSPOSE>
+      BlockLoadT;
+  // Specialize BlockRadixSort type for our thread block
+  typedef cub::BlockRadixSort<Key, BLOCK_THREADS, ITEMS_PER_THREAD>
+      BlockRadixSortT;
+  // Shared memory
+  __shared__ union TempStorage {  // load and sort scratch share the same shared memory
+    typename BlockLoadT::TempStorage load;
+    typename BlockRadixSortT::TempStorage sort;
+  } temp_storage;
+  // Per-thread tile items
+  Key items[ITEMS_PER_THREAD];
+  // Our current block's offset
+  int_64 block_offset = (int_64)blockIdx.x * TILE_SIZE;  // 64-bit so the offset cannot overflow int
+  // Load items into a blocked arrangement
+  BlockLoadT(temp_storage.load).Load(d_in + block_offset, items);  // full-tile load: no valid-item count is passed
+  // Barrier for smem reuse
+  __syncthreads();
+  // Sort keys
+  BlockRadixSortT(temp_storage.sort).SortBlockedToStriped(items);
+  // Store output in striped fashion
+  cub::StoreDirectStriped<BLOCK_THREADS>(threadIdx.x, d_out + block_offset,
+                                         items);
+}
+
+template <typename FPTYPE>  // Dot product of two 3-component vectors.
+__device__ inline FPTYPE dev_dot(FPTYPE* arr1, FPTYPE* arr2) {
+  return arr1[0] * arr2[0] + arr1[1] * arr2[1] + arr1[2] * arr2[2];  // reads exactly three elements from each array
+}
+
+template <typename FPTYPE>  // Fifth-order polynomial switch: vv goes from 1 (xx < rmin) to 0 (xx >= rmax); dd is its derivative with respect to xx.
+__device__ inline void spline5_switch(
+    FPTYPE& vv, FPTYPE& dd, FPTYPE& xx, const float& rmin, const float& rmax) {  // vv and dd are outputs; xx is only read
+  if (xx < rmin) {
+    dd = (FPTYPE)0.;
+    vv = (FPTYPE)1.;  // fully switched on below rmin
+  } else if (xx < rmax) {
+    FPTYPE uu = (xx - rmin) / (rmax - rmin);  // position within [rmin, rmax), scaled to [0, 1)
+    FPTYPE du = (FPTYPE)1. / (rmax - rmin);  // d(uu)/d(xx), chain-rule factor for dd
+    vv = uu * uu * uu *
+             ((FPTYPE)-6. * uu * uu + (FPTYPE)15. * uu - (FPTYPE)10.) +
+         (FPTYPE)1.;  // vv = uu^3 * (-6 uu^2 + 15 uu - 10) + 1
+    dd = ((FPTYPE)3. * uu * uu *
+              ((FPTYPE)-6. * uu * uu + (FPTYPE)15. * uu - (FPTYPE)10.) +
+          uu * uu * uu * ((FPTYPE)-12. * uu + (FPTYPE)15.)) *
+         du;  // product rule applied to vv, times du
+  } else {
+    dd = (FPTYPE)0.;
+    vv = (FPTYPE)0.;  // fully switched off at and beyond rmax
+  }
+}
+
+template <typename FPTYPE>  // Writes one encoded key per in-range neighbor into row blockIdx.x of `key`; other slots of the row are left untouched.
+__global__ void format_nlist_fill_a(uint_64* key,
+                                    const FPTYPE* coord,
+                                    const int* type,
+                                    const int* numneigh,
+                                    int** firstneigh,
+                                    const float rcut,
+                                    int* i_idx,  // maps the block's atom index to its row in numneigh/firstneigh
+                                    const int MAX_NBOR_SIZE) {  // row stride of `key`
+  // <<<nloc, MAX_NBOR_SIZE>>>
+  const int_64 idx = blockIdx.x;  // center atom handled by this block
+  const unsigned int idy = blockIdx.y * blockDim.y + threadIdx.y;  // neighbor slot handled by this thread
+
+  const int nsize = numneigh[i_idx[idx]];
+  if (idy >= nsize) {  // slots beyond this atom's neighbor count do nothing
+    return;
+  }
+
+  const int* nei_idx = firstneigh[i_idx[idx]];
+  // dev_copy(nei_idx, &jlist[jrange[i_idx]], nsize);
+  uint_64* key_in = key + idx * MAX_NBOR_SIZE;
+  FPTYPE diff[3];
+  const int& j_idx = nei_idx[idy];
+  if (type[j_idx] < 0) return;  // neighbors with a negative type are skipped
+  for (int dd = 0; dd < 3; dd++) {
+    diff[dd] = coord[j_idx * 3 + dd] - coord[idx * 3 + dd];  // displacement from the center atom to the neighbor
+  }
+  FPTYPE rr = _sqrt(dev_dot(diff, diff));
+  if (rr <= rcut) {  // neighbors farther than rcut get no key
+    key_in[idy] = encoding_nbor_info(type[j_idx], rr, j_idx);
+  }
+}
+
+template <typename FPTYPE>  // Records, per row and per neighbor type, the column at which that type first appears in the row's key sequence.
+__global__ void fill_nei_iter(int* nei_iter_dev,  // out: sec_size entries per row
+                              const FPTYPE* key,
+                              const int nloc,
+                              const int max_nbor_size,
+                              const int sec_size) {
+  int_64 row = blockIdx.x;
+  int col = blockIdx.y * blockDim.x + threadIdx.x;
+  const FPTYPE* key_out = key + nloc * max_nbor_size + row * max_nbor_size;  // rows are read from the second nloc * max_nbor_size block of `key`
+  int nei_type_cur = -1, nbor_idx_cur = 0;
+  int nei_type_pre = -1, nbor_idx_pre = 0;
+  if (col < max_nbor_size && key_out[col] != key_out[max_nbor_size - 1]) {  // keys equal to the row's last key are ignored (presumably the padding value after sorting - confirm)
+    if (col >= 1)
+      decoding_nbor_info(nei_type_pre, nbor_idx_pre, key_out[col - 1]);
+    decoding_nbor_info(nei_type_cur, nbor_idx_cur, key_out[col]);
+  }
+  if (nei_type_cur != nei_type_pre) {  // a type change between col - 1 and col marks the first key of nei_type_cur
+    nei_iter_dev[row * sec_size + nei_type_cur] = col;
+  }
+}
+
+template <typename FPTYPE>  // Scatters decoded neighbor indices into the per-type sections of each row of nlist.
+__global__ void format_nlist_fill_b(int* nlist,
+                                    const int nlist_size,  // row stride of nlist
+                                    const int nloc,
+                                    FPTYPE* key,
+                                    const int* sec,  // sec[t] .. sec[t + 1] bounds the output section of type t
+                                    const int sec_size,
+                                    int* nei_iter_dev,  // per row: first column of each type in the row's keys
+                                    const int max_nbor_size) {
+  int_64 row = blockIdx.x;
+  int col = blockIdx.y * blockDim.x + threadIdx.x;
+  int* nei_iter = nei_iter_dev + row * sec_size;
+  FPTYPE* key_out = key + nloc * max_nbor_size + row * max_nbor_size;  // rows are read from the second nloc * max_nbor_size block of `key`
+  int* row_nlist = nlist + row * nlist_size;
+  if (col < max_nbor_size) {
+    if (key_out[col] != key_out[max_nbor_size - 1]) {  // keys equal to the row's last key are ignored
+      int nei_type = 0, nbor_idx = 0;
+      decoding_nbor_info(nei_type, nbor_idx, key_out[col]);
+      int out_indx = col - nei_iter[nei_type] + sec[nei_type];  // rank within the type, offset to the start of that type's section
+      if (out_indx < sec[nei_type + 1]) {  // neighbors that do not fit in the type's section are dropped
+        row_nlist[out_indx] = nbor_idx;
+      }
+    }
+  }
+}
+
+template <typename FPTYPE>  // Element-wise round trip: encodes (type, dist, index) into key[idx], then decodes type and index back out of it.
+__global__ void encoding_decoding_nbor_info(uint_64* key,  // out: encoded keys
+                                            int* out_type,  // out: type decoded from key
+                                            int* out_index,  // out: index decoded from key
+                                            const int* in_type,
+                                            const FPTYPE* in_dist,
+                                            const int* in_index,
+                                            const int size_of_array) {
+  const unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;  // one thread per array element
+  if (idx >= size_of_array) {
+    return;
+  }
+
+  key[idx] = encoding_nbor_info(in_type[idx], in_dist[idx], in_index[idx]);
+  decoding_nbor_info(out_type[idx], out_index[idx], key[idx]);
+}
+
+// Shared implementation behind the format_nbor_list_<N> entry points below.
+//
+// Template parameters:
+//   MAX_NBOR_SIZE    - number of key slots per atom (row length of `key`).
+//   ITEMS_PER_THREAD - keys handled by each thread of BlockSortKernel; the
+//                      sort block therefore has MAX_NBOR_SIZE /
+//                      ITEMS_PER_THREAD threads.
+//
+// Step 1 launches format_nlist_fill_a on a (nloc, ceil(MAX_NBOR_SIZE / 256))
+// grid of (1, 256)-thread blocks to fill `key`.
+// Step 2 launches BlockSortKernel with one block per atom, passing `key` and
+// `key + nloc * MAX_NBOR_SIZE`; the caller must therefore provide room for
+// 2 * nloc * MAX_NBOR_SIZE keys.
+// Both launches are followed by an error check and a device synchronization.
+template <typename FPTYPE, int MAX_NBOR_SIZE, int ITEMS_PER_THREAD>
+static void format_nbor_list_impl(uint_64* key,
+                                  const FPTYPE* coord,
+                                  const int* type,
+                                  const deepmd::InputNlist& gpu_inlist,
+                                  const int nloc,
+                                  const float rcut,
+                                  int* i_idx) {
+  static_assert(MAX_NBOR_SIZE % ITEMS_PER_THREAD == 0,
+                "MAX_NBOR_SIZE must be a multiple of ITEMS_PER_THREAD");
+  constexpr int LEN = 256;
+  constexpr int nblock = (MAX_NBOR_SIZE + LEN - 1) / LEN;
+  constexpr int BLOCK_THREADS = MAX_NBOR_SIZE / ITEMS_PER_THREAD;
+
+  const dim3 block_grid(nloc, nblock);
+  const dim3 thread_grid(1, LEN);
+  format_nlist_fill_a<<<block_grid, thread_grid>>>(
+      key, coord, type, gpu_inlist.numneigh, gpu_inlist.firstneigh, rcut, i_idx,
+      MAX_NBOR_SIZE);
+  DPErrcheck(cudaGetLastError());
+  DPErrcheck(cudaDeviceSynchronize());
+
+  BlockSortKernel<uint_64, BLOCK_THREADS, ITEMS_PER_THREAD>
+      <<<nloc, BLOCK_THREADS>>>(key, key + nloc * MAX_NBOR_SIZE);
+  DPErrcheck(cudaGetLastError());
+  DPErrcheck(cudaDeviceSynchronize());
+}
+
+// 256 key slots per atom, 4 keys per sorting thread (64 threads per block).
+template <typename FPTYPE>
+void format_nbor_list_256(uint_64* key,
+                          const FPTYPE* coord,
+                          const int* type,
+                          const deepmd::InputNlist& gpu_inlist,
+                          const int& nloc,
+                          const float& rcut,
+                          int* i_idx) {
+  format_nbor_list_impl<FPTYPE, 256, 4>(key, coord, type, gpu_inlist, nloc,
+                                        rcut, i_idx);
+}
+
+// 512 key slots per atom, 4 keys per sorting thread (128 threads per block).
+template <typename FPTYPE>
+void format_nbor_list_512(uint_64* key,
+                          const FPTYPE* coord,
+                          const int* type,
+                          const deepmd::InputNlist& gpu_inlist,
+                          const int& nloc,
+                          const float& rcut,
+                          int* i_idx) {
+  format_nbor_list_impl<FPTYPE, 512, 4>(key, coord, type, gpu_inlist, nloc,
+                                        rcut, i_idx);
+}
+
+// 1024 key slots per atom, 8 keys per sorting thread (128 threads per block).
+template <typename FPTYPE>
+void format_nbor_list_1024(uint_64* key,
+                           const FPTYPE* coord,
+                           const int* type,
+                           const deepmd::InputNlist& gpu_inlist,
+                           const int& nloc,
+                           const float& rcut,
+                           int* i_idx) {
+  format_nbor_list_impl<FPTYPE, 1024, 8>(key, coord, type, gpu_inlist, nloc,
+                                         rcut, i_idx);
+}
+
+// 2048 key slots per atom, 8 keys per sorting thread (256 threads per block).
+template <typename FPTYPE>
+void format_nbor_list_2048(uint_64* key,
+                           const FPTYPE* coord,
+                           const int* type,
+                           const deepmd::InputNlist& gpu_inlist,
+                           const int& nloc,
+                           const float& rcut,
+                           int* i_idx) {
+  format_nbor_list_impl<FPTYPE, 2048, 8>(key, coord, type, gpu_inlist, nloc,
+                                         rcut, i_idx);
+}
+
+// 4096 key slots per atom, 16 keys per sorting thread (256 threads per block).
+template <typename FPTYPE>
+void format_nbor_list_4096(uint_64* key,
+                           const FPTYPE* coord,
+                           const int* type,
+                           const deepmd::InputNlist& gpu_inlist,
+                           const int& nloc,
+                           const float& rcut,
+                           int* i_idx) {
+  format_nbor_list_impl<FPTYPE, 4096, 16>(key, coord, type, gpu_inlist, nloc,
+                                          rcut, i_idx);
+}
+
+template <typename FPTYPE, int THREADS_PER_BLOCK>
+__global__ void compute_env_mat_a(FPTYPE* em,  // out: 4 values per slot
+                                  FPTYPE* em_deriv,  // out: 12 values per slot
+                                  FPTYPE* rij,  // out: 3 values per slot
+                                  const FPTYPE* coord,  // in: xyz per atom
+                                  const FPTYPE* avg,  // in: shift, per type
+                                  const FPTYPE* std,  // in: scale, per type
+                                  const int* type,  // in: type id per atom
+                                  const int* nlist,  // in: nnei ids per atom
+                                  const int nnei,  // neighbor slots per atom
+                                  const float rmin,  // passed to spline5_switch
+                                  const float rmax) {  // passed to spline5_switch
+  // Launch as <<<nloc, TPB>>>: one block per atom; fills that atom's rows of em, em_deriv and rij.
+  const int_64 bid = blockIdx.x;  // atom handled by this block
+  const unsigned int tid = threadIdx.x;  // first neighbor slot of this thread
+  if (type[bid] < 0) return;  // atoms with a negative type are skipped
+  if (tid >= nnei) {  // more threads than slots: nothing to do
+    return;
+  }
+  const int ndescrpt = nnei * 4;  // number of avg/std entries per atom type
+  const int* row_nlist = nlist + bid * nnei;
+  FPTYPE* row_rij = rij + bid * nnei * 3;
+  FPTYPE* row_descript = em + bid * nnei * 4;
+  FPTYPE* row_descript_deriv = em_deriv + bid * nnei * 12;
+  for (int ii = tid; ii < nnei; ii += THREADS_PER_BLOCK) {  // NOTE(review): stride is the template constant, presumably equal to blockDim.x
+    const int idx_value = ii * 4;   // 4 components
+    const int idx_deriv = ii * 12;  // 4 components times 3 directions
+    if (row_nlist[ii] >= 0) {  // slot holds a neighbor index
+      FPTYPE rr[3] = {0};
+      FPTYPE dd[4] = {0};
+      FPTYPE vv[12] = {0};
+      const int j_idx = row_nlist[ii];
+      for (int kk = 0; kk < 3; kk++) {
+        rr[kk] = coord[j_idx * 3 + kk] - coord[bid * 3 + kk];  // neighbor minus center
+        row_rij[ii * 3 + kk] = rr[kk];
+      }
+      // const FPTYPE * rr = &row_rij[ii * 3];
+      FPTYPE nr2 = dev_dot(rr, rr);  // presumably |rr|^2 (dev_dot is defined elsewhere)
+      FPTYPE inr = _rsqrt(nr2);  // presumably 1 / sqrt(nr2)
+      FPTYPE nr = nr2 * inr;  // nr2 * inr, i.e. |rr| under the assumption above
+      FPTYPE inr2 = inr * inr;
+      FPTYPE inr4 = inr2 * inr2;
+      FPTYPE inr3 = inr4 * nr;  // inr^4 * nr
+      FPTYPE sw, dsw;
+      spline5_switch(sw, dsw, nr, rmin, rmax);  // fills sw and dsw from nr, rmin, rmax
+      dd[0] = ((FPTYPE)1. / nr);  //* sw;
+      dd[1] = (rr[0] / nr2);      //* sw;
+      dd[2] = (rr[1] / nr2);      //* sw;
+      dd[3] = (rr[2] / nr2);      //* sw;
+      vv[0] = (rr[0] * inr3 * sw -
+               dd[0] * dsw * rr[0] *
+                   inr);  // avg[type[(idx_deriv + 0) / (ndescrpt * 3)] *
+                          // ndescrpt + ((idx_deriv + 0) % (ndescrpt * 3)) / 3];
+      vv[1] = (rr[1] * inr3 * sw -
+               dd[0] * dsw * rr[1] *
+                   inr);  // avg[type[(idx_deriv + 1) / (ndescrpt * 3)] *
+                          // ndescrpt + ((idx_deriv + 1) % (ndescrpt * 3)) / 3];
+      vv[2] = (rr[2] * inr3 * sw -
+               dd[0] * dsw * rr[2] *
+                   inr);  // avg[type[(idx_deriv + 2) / (ndescrpt * 3)] *
+                          // ndescrpt + ((idx_deriv + 2) % (ndescrpt * 3)) / 3];
+      // ***deriv of component x/r2 (vv[0..2] above: deriv of component 1/r)
+      vv[3] = (((FPTYPE)2. * rr[0] * rr[0] * inr4 - inr2) * sw -
+               dd[1] * dsw * rr[0] *
+                   inr);  // avg[type[(idx_deriv + 3) / (ndescrpt * 3)] *
+                          // ndescrpt + ((idx_deriv + 3) % (ndescrpt * 3)) / 3];
+      vv[4] = (((FPTYPE)2. * rr[0] * rr[1] * inr4) * sw -
+               dd[1] * dsw * rr[1] *
+                   inr);  // avg[type[(idx_deriv + 4) / (ndescrpt * 3)] *
+                          // ndescrpt + ((idx_deriv + 4) % (ndescrpt * 3)) / 3];
+      vv[5] = (((FPTYPE)2. * rr[0] * rr[2] * inr4) * sw -
+               dd[1] * dsw * rr[2] *
+                   inr);  // avg[type[(idx_deriv + 5) / (ndescrpt * 3)] *
+                          // ndescrpt + ((idx_deriv + 5) % (ndescrpt * 3)) / 3];
+      // ***deriv of component y/r2
+      vv[6] = (((FPTYPE)2. * rr[1] * rr[0] * inr4) * sw -
+               dd[2] * dsw * rr[0] *
+                   inr);  // avg[type[(idx_deriv + 6) / (ndescrpt * 3)] *
+                          // ndescrpt + ((idx_deriv + 6) % (ndescrpt * 3)) / 3];
+      vv[7] = (((FPTYPE)2. * rr[1] * rr[1] * inr4 - inr2) * sw -
+               dd[2] * dsw * rr[1] *
+                   inr);  // avg[type[(idx_deriv + 7) / (ndescrpt * 3)] *
+                          // ndescrpt + ((idx_deriv + 7) % (ndescrpt * 3)) / 3];
+      vv[8] = (((FPTYPE)2. * rr[1] * rr[2] * inr4) * sw -
+               dd[2] * dsw * rr[2] *
+                   inr);  // avg[type[(idx_deriv + 8) / (ndescrpt * 3)] *
+                          // ndescrpt + ((idx_deriv + 8) % (ndescrpt * 3)) / 3];
+      // ***deriv of component z/r2
+      vv[9] = (((FPTYPE)2. * rr[2] * rr[0] * inr4) * sw -
+               dd[3] * dsw * rr[0] *
+                   inr);  // avg[type[(idx_deriv + 9) / (ndescrpt * 3)] *
+                          // ndescrpt + ((idx_deriv + 9) % (ndescrpt * 3)) / 3];
+      vv[10] =
+          (((FPTYPE)2. * rr[2] * rr[1] * inr4) * sw -
+           dd[3] * dsw * rr[1] *
+               inr);  // avg[type[(idx_deriv + 10) / (ndescrpt * 3)] * ndescrpt
+                      // + ((idx_deriv + 10) % (ndescrpt * 3)) / 3];
+      vv[11] =
+          (((FPTYPE)2. * rr[2] * rr[2] * inr4 - inr2) * sw -
+           dd[3] * dsw * rr[2] *
+               inr);  // avg[type[(idx_deriv + 11) / (ndescrpt * 3)] * ndescrpt
+                      // + ((idx_deriv + 11) % (ndescrpt * 3)) / 3];
+      // 4 value components: each is multiplied by sw here and normalized when stored below
+      dd[0] *= sw;  // * em[idx * ndescrpt + idx_value + 0]);// - avg[type[idx]
+                    // * ndescrpt + idx_value + 0]) / std[type[idx] * ndescrpt +
+                    // idx_value + 0];
+      dd[1] *= sw;  // * em[idx * ndescrpt + idx_value + 1]);// - avg[type[idx]
+                    // * ndescrpt + idx_value + 1]) / std[type[idx] * ndescrpt +
+                    // idx_value + 1];
+      dd[2] *= sw;  // * em[idx * ndescrpt + idx_value + 2]);// - avg[type[idx]
+                    // * ndescrpt + idx_value + 2]) / std[type[idx] * ndescrpt +
+                    // idx_value + 2];
+      dd[3] *= sw;  // * em[idx * ndescrpt + idx_value + 3]);// - avg[type[idx]
+                    // * ndescrpt + idx_value + 3]) / std[type[idx] * ndescrpt +
+                    // idx_value + 3];
+      for (int ii = 0; ii < 12; ii++) {  // NOTE: this ii shadows the slot index; idx_value/idx_deriv were computed above
+        row_descript_deriv[idx_deriv + ii] =
+            vv[ii] / std[type[bid] * ndescrpt + idx_value + ii / 3];  // ii / 3 selects the component whose std applies
+      }
+      for (int ii = 0; ii < 4; ii++) {  // store (value - avg) / std for each component
+        row_descript[idx_value + ii] =
+            (dd[ii] - avg[type[bid] * ndescrpt + idx_value + ii]) /
+            std[type[bid] * ndescrpt + idx_value + ii];
+      }
+    } else {
+      // Empty slot: only component 0 is changed, by subtracting avg / std in place. TODO: move it to the memset.
+      row_descript[idx_value] -= avg[type[bid] * ndescrpt + idx_value] /
+                                 std[type[bid] * ndescrpt + idx_value];  // NOTE(review): presumably em was zeroed by the caller - confirm
+    }
+  }
+}
+
+template <typename FPTYPE, int THREADS_PER_BLOCK>
+__global__ void compute_env_mat_r(FPTYPE* em,  // out: 1 value per slot
+                                  FPTYPE* em_deriv,  // out: 3 values per slot
+                                  FPTYPE* rij,  // out: 3 values per slot
+                                  const FPTYPE* coord,  // in: xyz per atom
+                                  const FPTYPE* avg,  // in: shift, per type
+                                  const FPTYPE* std,  // in: scale, per type
+                                  const int* type,  // in: type id per atom
+                                  const int* nlist,  // in: nnei ids per atom
+                                  const int nnei,  // neighbor slots per atom
+                                  const float rmin,  // passed to spline5_switch
+                                  const float rmax) {  // passed to spline5_switch
+  // Launch as <<<nloc, TPB>>>: one block per atom; fills that atom's rows of em, em_deriv and rij.
+  const int_64 bid = blockIdx.x;  // atom handled by this block
+  const unsigned int tid = threadIdx.x;  // first neighbor slot of this thread
+  if (tid >= nnei) {  // more threads than slots: nothing to do
+    return;
+  }
+  const int ndescrpt = nnei;  // number of avg/std entries per atom type
+  const int* row_nlist = nlist + bid * nnei;
+  FPTYPE* row_rij = rij + bid * nnei * 3;
+  FPTYPE* row_em = em + bid * nnei;
+  FPTYPE* row_em_deriv = em_deriv + bid * nnei * 3;
+  for (int ii = tid; ii < nnei; ii += THREADS_PER_BLOCK) {  // NOTE(review): stride is the template constant, presumably equal to blockDim.x
+    const int idx_value = ii;      // 1 component per slot
+    const int idx_deriv = ii * 3;  // 1 component times 3 directions
+    if (row_nlist[ii] >= 0) {  // slot holds a neighbor index
+      FPTYPE rr[3] = {0};
+      FPTYPE vv[3] = {0};
+      FPTYPE dd = 0;
+      const int& j_idx = row_nlist[ii];
+      for (int kk = 0; kk < 3; kk++) {
+        rr[kk] = coord[j_idx * 3 + kk] - coord[bid * 3 + kk];  // neighbor minus center
+        row_rij[ii * 3 + kk] = rr[kk];
+      }
+      // const FPTYPE * rr = &row_rij[ii * 3];
+      FPTYPE nr2 = dev_dot(rr, rr);  // presumably |rr|^2 (dev_dot is defined elsewhere)
+      FPTYPE inr = _rsqrt(nr2);  // presumably 1 / sqrt(nr2)
+      FPTYPE nr = nr2 * inr;  // nr2 * inr, i.e. |rr| under the assumption above
+      FPTYPE inr2 = inr * inr;
+      FPTYPE inr4 = inr2 * inr2;
+      FPTYPE inr3 = inr4 * nr;  // inr^4 * nr
+      FPTYPE sw, dsw;
+      spline5_switch(sw, dsw, nr, rmin, rmax);  // fills sw and dsw from nr, rmin, rmax
+      dd = ((FPTYPE)1. / nr);  //* sw;
+      vv[0] = (rr[0] * inr3 * sw -
+               dd * dsw * rr[0] *
+                   inr);  // avg[type[(idx_deriv + 0) / (ndescrpt * 3)] *
+                          // ndescrpt + ((idx_deriv + 0) % (ndescrpt * 3)) / 3];
+      vv[1] = (rr[1] * inr3 * sw -
+               dd * dsw * rr[1] *
+                   inr);  // avg[type[(idx_deriv + 1) / (ndescrpt * 3)] *
+                          // ndescrpt + ((idx_deriv + 1) % (ndescrpt * 3)) / 3];
+      vv[2] = (rr[2] * inr3 * sw -
+               dd * dsw * rr[2] *
+                   inr);  // avg[type[(idx_deriv + 2) / (ndescrpt * 3)] *
+                          // ndescrpt + ((idx_deriv + 2) % (ndescrpt * 3)) / 3];
+
+      // The single value component: multiplied by sw here and normalized when stored below
+      dd *= sw;  // * em[idx * ndescrpt + idx_value + 0]);// - avg[type[idx] *
+                 // ndescrpt + idx_value + 0]) / std[type[idx] * ndescrpt +
+                 // idx_value + 0];
+      for (int ii = 0; ii < 3; ii++) {  // NOTE: this ii shadows the slot index; idx_value/idx_deriv were computed above
+        row_em_deriv[idx_deriv + ii] =
+            vv[ii] / std[type[bid] * ndescrpt + idx_value + ii / 3];  // ii / 3 is always 0 for ii in 0..2
+      }
+      row_em[idx_value] = (dd - avg[type[bid] * ndescrpt + idx_value]) /
+                          std[type[bid] * ndescrpt + idx_value];
+    } else {
+      // Empty slot: the value is changed by subtracting avg / std in place. TODO: move it to the memset.
+      row_em[idx_value] -= avg[type[bid] * ndescrpt + idx_value] /
+                           std[type[bid] * ndescrpt + idx_value];  // NOTE(review): presumably em was zeroed by the caller - confirm
+    }
+  }
+}
+
+namespace deepmd {
+// Builds the formatted neighbor list `nlist` on the GPU.
+//
+// Parameters:
+//   nlist          - output, nloc rows of sec.back() entries; every byte is
+//                    first set by cudaMemset with value -1.
+//   coord, type    - forwarded to the format_nbor_list_<N> helpers.
+//   gpu_inlist     - input neighbor list; ilist is read here, the rest is
+//                    used by the helpers.
+//   array_int      - integer scratch buffer, carved into
+//                    [sec.size() | nloc * sec.size() | i_idx] slices.
+//   array_longlong - key scratch buffer; the first nloc * max_nbor_size keys
+//                    are filled by cudaMemset with value 0xffffffff.
+//   max_nbor_size  - key slots per atom; asserted to be one of 256, 512,
+//                    1024, 2048 or 4096. Any other value skips the
+//                    fill-and-sort step when asserts are compiled out.
+//   nloc           - number of atoms (rows) to process.
+//   nall           - not used by this function.
+//   rcut           - forwarded to the format_nbor_list_<N> helpers.
+//   sec            - host vector copied to the device; sec.back() is the row
+//                    length of nlist.
+template <typename FPTYPE>
+void format_nbor_list_gpu_cuda(int* nlist,
+                               const FPTYPE* coord,
+                               const int* type,
+                               const InputNlist& gpu_inlist,
+                               int* array_int,
+                               uint_64* array_longlong,
+                               const int max_nbor_size,
+                               const int nloc,
+                               const int nall,
+                               const float rcut,
+                               const std::vector<int> sec) {
+  const int threads = 256;
+  const int nnei = sec.back();
+
+  // Slices of the integer scratch buffer.
+  int* sec_dev = array_int;
+  int* nei_iter = sec_dev + sec.size();  // = new int[sec_size];
+  int* i_idx = nei_iter + nloc * sec.size();
+  uint_64* key = array_longlong;
+
+  assert(max_nbor_size == 256 || max_nbor_size == 512 ||
+         max_nbor_size == 1024 || max_nbor_size == 2048 ||
+         max_nbor_size == 4096);
+
+  // Reset the outputs and upload the section offsets.
+  DPErrcheck(cudaMemset(nlist, -1, sizeof(int) * int_64(nloc) * nnei));
+  DPErrcheck(cudaMemset(key, 0xffffffff,
+                        sizeof(uint_64) * int_64(nloc) * max_nbor_size));
+  DPErrcheck(cudaMemcpy(sec_dev, sec.data(), sizeof(int) * sec.size(),
+                        cudaMemcpyHostToDevice));
+
+  const int idx_blocks = (nloc + threads - 1) / threads;
+  get_i_idx<<<idx_blocks, threads>>>(i_idx, nloc, gpu_inlist.ilist);
+  DPErrcheck(cudaGetLastError());
+  DPErrcheck(cudaDeviceSynchronize());
+
+  // Fill and sort the keys with the helper matching the row length.
+  switch (max_nbor_size) {
+    case 256:
+      format_nbor_list_256(key, coord, type, gpu_inlist, nloc, rcut, i_idx);
+      break;
+    case 512:
+      format_nbor_list_512(key, coord, type, gpu_inlist, nloc, rcut, i_idx);
+      break;
+    case 1024:
+      format_nbor_list_1024(key, coord, type, gpu_inlist, nloc, rcut, i_idx);
+      break;
+    case 2048:
+      format_nbor_list_2048(key, coord, type, gpu_inlist, nloc, rcut, i_idx);
+      break;
+    case 4096:
+      format_nbor_list_4096(key, coord, type, gpu_inlist, nloc, rcut, i_idx);
+      break;
+    default:
+      break;
+  }
+
+  // Both post-processing kernels use one grid row per atom.
+  const dim3 row_grid(nloc, (max_nbor_size + threads - 1) / threads);
+  fill_nei_iter<<<row_grid, threads>>>(nei_iter, key, nloc, max_nbor_size,
+                                       sec.size());
+
+  format_nlist_fill_b<<<row_grid, threads>>>(nlist, nnei, nloc, key, sec_dev,
+                                             sec.size(), nei_iter,
+                                             max_nbor_size);
+  DPErrcheck(cudaGetLastError());
+  DPErrcheck(cudaDeviceSynchronize());
+}
+}  // namespace deepmd
+
+namespace deepmd {
+
+// Computes the 4-component environment matrix for nloc atoms on the GPU.
+//
+// The outputs em (nloc * nnei * 4), em_deriv (nloc * nnei * 12) and rij
+// (nloc * nnei * 3) are zeroed, the formatted neighbor list is built into
+// `nlist`, and compute_env_mat_a is launched with one block per atom and TPB
+// threads per block. nnei is sec.back().
+//
+// `f_type`, when not NULL, replaces `type` for building the neighbor list
+// only; compute_env_mat_a always receives `type`. `rcut_smth` and `rcut` are
+// passed to the kernel as its rmin and rmax arguments.
+template <typename FPTYPE>
+void prod_env_mat_a_gpu_cuda(FPTYPE* em,
+                             FPTYPE* em_deriv,
+                             FPTYPE* rij,
+                             int* nlist,
+                             const FPTYPE* coord,
+                             const int* type,
+                             const InputNlist& gpu_inlist,
+                             int* array_int,
+                             uint_64* array_longlong,
+                             const int max_nbor_size,
+                             const FPTYPE* avg,
+                             const FPTYPE* std,
+                             const int nloc,
+                             const int nall,
+                             const float rcut,
+                             const float rcut_smth,
+                             const std::vector<int> sec,
+                             const int* f_type) {
+  const int* nlist_type = (f_type == NULL) ? type : f_type;
+  const int nnei = sec.back();
+  const int ndescrpt = nnei * 4;
+
+  // Clear all three outputs before any kernel writes to them.
+  const size_t em_bytes = sizeof(FPTYPE) * int_64(nloc) * ndescrpt;
+  const size_t rij_bytes = sizeof(FPTYPE) * int_64(nloc) * nnei * 3;
+  DPErrcheck(cudaMemset(em, 0, em_bytes));
+  DPErrcheck(cudaMemset(em_deriv, 0, em_bytes * 3));
+  DPErrcheck(cudaMemset(rij, 0, rij_bytes));
+
+  format_nbor_list_gpu_cuda(nlist, coord, nlist_type, gpu_inlist, array_int,
+                            array_longlong, max_nbor_size, nloc, nall, rcut,
+                            sec);
+  nborErrcheck(cudaGetLastError());
+  nborErrcheck(cudaDeviceSynchronize());
+
+  compute_env_mat_a<FPTYPE, TPB><<<nloc, TPB>>>(
+      em, em_deriv, rij, coord, avg, std, type, nlist, nnei, rcut_smth, rcut);
+  DPErrcheck(cudaGetLastError());
+  DPErrcheck(cudaDeviceSynchronize());
+}
+
+// Computes the 1-component (radial) environment matrix for nloc atoms on the
+// GPU.
+//
+// The outputs em (nloc * nnei), em_deriv (nloc * nnei * 3) and rij
+// (nloc * nnei * 3) are zeroed, the formatted neighbor list is built into
+// `nlist`, and compute_env_mat_r is launched with one block per atom and TPB
+// threads per block. nnei is sec.back(). `rcut_smth` and `rcut` are passed to
+// the kernel as its rmin and rmax arguments.
+template <typename FPTYPE>
+void prod_env_mat_r_gpu_cuda(FPTYPE* em,
+                             FPTYPE* em_deriv,
+                             FPTYPE* rij,
+                             int* nlist,
+                             const FPTYPE* coord,
+                             const int* type,
+                             const deepmd::InputNlist& gpu_inlist,
+                             int* array_int,
+                             uint_64* array_longlong,
+                             const int max_nbor_size,
+                             const FPTYPE* avg,
+                             const FPTYPE* std,
+                             const int nloc,
+                             const int nall,
+                             const float rcut,
+                             const float rcut_smth,
+                             const std::vector<int> sec) {
+  const int nnei = sec.back();
+  const int ndescrpt = nnei * 1;
+
+  // Clear all three outputs before any kernel writes to them.
+  const size_t em_bytes = sizeof(FPTYPE) * int_64(nloc) * ndescrpt;
+  const size_t rij_bytes = sizeof(FPTYPE) * int_64(nloc) * nnei * 3;
+  DPErrcheck(cudaMemset(em, 0, em_bytes));
+  DPErrcheck(cudaMemset(em_deriv, 0, em_bytes * 3));
+  DPErrcheck(cudaMemset(rij, 0, rij_bytes));
+
+  format_nbor_list_gpu_cuda(nlist, coord, type, gpu_inlist, array_int,
+                            array_longlong, max_nbor_size, nloc, nall, rcut,
+                            sec);
+  nborErrcheck(cudaGetLastError());
+  nborErrcheck(cudaDeviceSynchronize());
+
+  compute_env_mat_r<FPTYPE, TPB><<<nloc, TPB>>>(
+      em, em_deriv, rij, coord, avg, std, type, nlist, nnei, rcut_smth, rcut);
+  DPErrcheck(cudaGetLastError());
+  DPErrcheck(cudaDeviceSynchronize());
+}
+}  // namespace deepmd
+
+// Runs the GPU environment-matrix computation one frame (sample) at a time.
+//
+// For each of the `nsamples` frames this function:
+//   1. offsets the output pointers (p_em, p_em_deriv, p_rij, p_nlist) and the
+//      input pointers (p_coord, p_box, p_type) to the frame's slice;
+//   2. calls _prepare_coord_nlist_gpu to prepare coordinates and the input
+//      neighbor list (temporary tensors it creates are kept alive in
+//      `tensor_list` until the end of the iteration);
+//   3. allocates integer scratch space (a Paddle tensor) and key scratch space
+//      (malloc_device_memory, nloc * GPU_MAX_NBOR_SIZE * 2 keys);
+//   4. calls deepmd::prod_env_mat_a_gpu_cuda with sec_a;
+//   5. optionally remaps the neighbor list through idx_mapping when
+//      b_nlist_map is set;
+//   6. frees firstneigh and the key scratch space.
+//
+// p_box is dereferenced on the host path of _prepare_coord_nlist_gpu
+// (NOTE(review): the caller passes a CPU copy of the box tensor - confirm).
+// The incoming values of `array_int` and `array_longlong` are ignored; both
+// are reassigned every frame. `rcut_a`, `sec_r` are not used here.
+//
+// Frame offsets are computed in 64 bits: products such as
+// ff * nloc * ndescrpt * 3 exceed the range of int for large systems or many
+// frames.
+template <typename data_t>
+void prod_env_mat_a_cuda_forward_kernel(int nsamples,
+                                        int nloc,
+                                        int ndescrpt,
+                                        int nnei,
+                                        int nall,
+                                        int mem_cpy,
+                                        int mem_nnei,
+                                        int max_nbor_size,
+                                        int nei_mode,
+                                        float rcut_a,
+                                        float rcut_r,
+                                        float rcut_r_smth,
+                                        int max_cpy_trial,
+                                        int max_nnei_trial,
+                                        bool b_nlist_map,
+                                        const std::vector<int>& sec_a,
+                                        const std::vector<int>& sec_r,
+                                        deepmd::InputNlist gpu_inlist,
+                                        int* nbor_list_dev,
+                                        int* array_int,
+                                        unsigned long long* array_longlong,
+                                        data_t* p_em,
+                                        data_t* p_em_deriv,
+                                        data_t* p_rij,
+                                        int* p_nlist,
+                                        const data_t* p_coord,
+                                        const data_t* p_box,
+                                        const data_t* avg,
+                                        const data_t* std,
+                                        const int* p_type,
+                                        const paddle::Tensor& mesh_tensor) {
+  // Per-frame element counts, widened so that `ff * stride` cannot overflow.
+  const int64_t em_stride = static_cast<int64_t>(nloc) * ndescrpt;
+  const int64_t nlist_stride = static_cast<int64_t>(nloc) * nnei;
+  const int64_t coord_stride = static_cast<int64_t>(nall) * 3;
+
+  for (int ff = 0; ff < nsamples; ++ff) {
+    data_t* em = p_em + ff * em_stride;
+    data_t* em_deriv = p_em_deriv + ff * em_stride * 3;
+    data_t* rij = p_rij + ff * nlist_stride * 3;
+    int* nlist = p_nlist + ff * nlist_stride;
+    const data_t* coord = p_coord + ff * coord_stride;
+    const data_t* box = p_box + ff * 9;
+    const int* type = p_type + static_cast<int64_t>(ff) * nall;
+
+    int* idx_mapping = NULL;
+    int *ilist = NULL, *numneigh = NULL;
+    int** firstneigh = NULL;
+    deepmd::malloc_device_memory(firstneigh, nloc);
+    int* jlist = NULL;
+    // Set by _prepare_coord_nlist_gpu when it needs copied coordinates/types;
+    // initialized so they never hold indeterminate values.
+    data_t* coord_cpy = NULL;
+    int* type_cpy = NULL;
+    int frame_nall = nall;
+    int mesh_tensor_size = static_cast<int>(mesh_tensor.size());
+    // Keeps temporary tensors created during preparation alive for this frame.
+    std::vector<paddle::Tensor> tensor_list;
+    // Adapted from deepmd-kit-tf/source/op/prod_env_mat_multi_device.cc
+    _prepare_coord_nlist_gpu<data_t>(
+        &tensor_list, &coord, coord_cpy, &type, type_cpy, idx_mapping,
+        gpu_inlist, ilist, numneigh, firstneigh, jlist, nbor_list_dev,
+        frame_nall, mem_cpy, mem_nnei, max_nbor_size, box,
+        mesh_tensor.data<int>(), mesh_tensor_size, nloc, nei_mode, rcut_r,
+        max_cpy_trial, max_nnei_trial);
+
+    // allocate temp memory, temp memory must not be used after this operation!
+    std::vector<int> int_temp_shape{int(sec_a.size()) +
+                                    nloc * int(sec_a.size()) + nloc};
+    auto int_temp = paddle::empty(int_temp_shape, paddle::DataType::INT32,
+                                  paddle::GPUPlace());
+
+    array_int = int_temp.data<int>();
+
+    deepmd::malloc_device_memory(array_longlong, nloc * GPU_MAX_NBOR_SIZE * 2);
+
+    // launch the gpu(nv) compute function
+    deepmd::prod_env_mat_a_gpu_cuda(em, em_deriv, rij, nlist, coord, type,
+                                    gpu_inlist, array_int, array_longlong,
+                                    max_nbor_size, avg, std, nloc, frame_nall,
+                                    rcut_r, rcut_r_smth, sec_a);
+    if (b_nlist_map) _map_nlist_gpu(nlist, idx_mapping, nloc, nnei);
+    deepmd::delete_device_memory(firstneigh);
+    deepmd::delete_device_memory(array_longlong);
+    array_longlong = NULL;
+  }
+}
+
+// void cum_sum(std::vector<int>& sec, const std::vector<int>& n_sel) {
+//   sec.resize(n_sel.size() + 1);
+//   sec[0] = 0;
+//   for (int ii = 1; ii < sec.size(); ++ii) {
+//     sec[ii] = sec[ii - 1] + n_sel[ii - 1];
+//   }
+// }
+
+// Paddle custom-op entry point: GPU forward pass of the environment matrix.
+//
+// Inputs:
+//   coord_tensor  - coordinates; shape()[0] is the number of frames and its
+//                   dtype/place are used for the floating-point outputs.
+//   atype_tensor  - atom types (int).
+//   box_tensor    - box data; copied to the CPU before use.
+//   mesh_tensor   - its first dimension selects the neighbor mode:
+//                   16 -> mode 3 (lammps neighbor list),
+//                    6 -> mode 1 (manual copied pbc, neighbor list remapped),
+//                    0 -> mode -1 (no pbc); anything else throws.
+//   t_avg_tensor, t_std_tensor - normalization data.
+//   natoms_tensor - must live on the CPU; [0] is nloc, [1] is nall.
+//   rcut_a, rcut_r, rcut_r_smth - cutoff parameters forwarded to the kernel.
+//   sel_a, sel_r  - per-type neighbor counts; their cumulative sums give the
+//                   section offsets.
+//
+// Returns {descrpt, descrpt_deriv, rij, nlist} with shapes
+// {nsamples, nloc * ndescrpt}, {nsamples, nloc * ndescrpt * 3},
+// {nsamples, nloc * nnei * 3} and {nsamples, nloc * nnei} (INT32).
+std::vector<paddle::Tensor> ProdEnvMatACUDAForward(
+    const paddle::Tensor& coord_tensor,
+    const paddle::Tensor& atype_tensor,
+    const paddle::Tensor& box_tensor,
+    const paddle::Tensor& mesh_tensor,
+    const paddle::Tensor& t_avg_tensor,
+    const paddle::Tensor& t_std_tensor,
+    const paddle::Tensor& natoms_tensor,
+    float rcut_a,
+    float rcut_r,
+    float rcut_r_smth,
+    std::vector<int> sel_a,
+    std::vector<int> sel_r) {
+  // Section offsets and the sizes derived from them.
+  std::vector<int> sec_a;
+  std::vector<int> sec_r;
+  deepmd::cum_sum(sec_a, sel_a);
+  deepmd::cum_sum(sec_r, sel_r);
+  int nnei_a = sec_a.back();
+  int nnei_r = sec_r.back();
+  int ndescrpt_a = nnei_a * 4;
+  int ndescrpt_r = nnei_r * 1;
+  int ndescrpt = ndescrpt_a + ndescrpt_r;
+  int nnei = nnei_a + nnei_r;
+
+  // Initial buffer sizes and retry limits handed to the kernel driver.
+  int max_nbor_size = 1024;
+  int max_cpy_trial = 100;
+  int mem_cpy = 256;
+  int max_nnei_trial = 100;
+  int mem_nnei = 256;
+
+  // Scratch handles; the kernel driver assigns them per frame.
+  int* array_int = NULL;
+  unsigned long long* array_longlong = NULL;
+  deepmd::InputNlist gpu_inlist;
+  int* nbor_list_dev = NULL;
+
+  CHECK_INPUT_ON_CPU(natoms_tensor);
+  auto natoms = natoms_tensor.data<int>();
+  int nloc = natoms[0];
+  int nall = natoms[1];
+  int nsamples = coord_tensor.shape()[0];
+
+  // Pick the neighbor mode from the length of the mesh tensor.
+  int nei_mode = 0;
+  bool b_nlist_map = false;
+  switch (mesh_tensor.shape()[0]) {
+    case 16:
+      // lammps neighbor list
+      nei_mode = 3;
+      break;
+    case 6:
+      // manual copied pbc
+      assert(nloc == nall);
+      nei_mode = 1;
+      b_nlist_map = true;
+      break;
+    case 0:
+      // no pbc
+      assert(nloc == nall);
+      nei_mode = -1;
+      break;
+    default:
+      PD_THROW("invalid mesh tensor");
+      break;
+  }
+
+  // create output tensors
+  auto descrpt_tensor = paddle::empty(
+      {nsamples, nloc * ndescrpt}, coord_tensor.dtype(), coord_tensor.place());
+  auto descrpt_deriv_tensor =
+      paddle::empty({nsamples, nloc * ndescrpt * 3}, coord_tensor.dtype(),
+                    coord_tensor.place());
+  auto rij_tensor = paddle::empty({nsamples, nloc * nnei * 3},
+                                  coord_tensor.dtype(), coord_tensor.place());
+  auto nlist_tensor = paddle::empty(
+      {nsamples, nloc * nnei}, paddle::DataType::INT32, coord_tensor.place());
+
+  // loop over samples; the CPU copy of the box tensor is a temporary that
+  // lives until the kernel-driver call below has returned.
+  PD_DISPATCH_FLOATING_TYPES(
+      coord_tensor.type(), "prod_env_mat_a_cuda_forward_kernel", ([&] {
+        prod_env_mat_a_cuda_forward_kernel<data_t>(
+            nsamples, nloc, ndescrpt, nnei, nall, mem_cpy, mem_nnei,
+            max_nbor_size, nei_mode, rcut_a, rcut_r, rcut_r_smth, max_cpy_trial,
+            max_nnei_trial, b_nlist_map, sec_a, sec_r, gpu_inlist,
+            nbor_list_dev, array_int, array_longlong,
+            descrpt_tensor.data<data_t>(), descrpt_deriv_tensor.data<data_t>(),
+            rij_tensor.data<data_t>(), nlist_tensor.data<int>(),
+            coord_tensor.data<data_t>(),
+            box_tensor.copy_to(paddle::CPUPlace(), false).data<data_t>(),
+            t_avg_tensor.data<data_t>(), t_std_tensor.data<data_t>(),
+            atype_tensor.data<int>(), mesh_tensor);
+      }));
+  return {descrpt_tensor, descrpt_deriv_tensor, rij_tensor, nlist_tensor};
+}
+
+// Copies coord into a scratch buffer, runs deepmd::normalize_coord_gpu on it
+// and retries deepmd::copy_coord_gpu, doubling mem_cpy after each failure.
+// Every tensor created here is appended to tensor_list (presumably to keep the
+// returned raw pointers alive). Returns 1 if a trial succeeded, else 0.
+template <typename FPTYPE>
+static int _norm_copy_coord_gpu(std::vector<paddle::Tensor>* tensor_list,
+                                FPTYPE*& coord_cpy,
+                                int*& type_cpy,
+                                int*& idx_mapping,
+                                int& nall,
+                                int& mem_cpy,
+                                const FPTYPE* coord,
+                                const FPTYPE* box,
+                                const int* type,
+                                const int& nloc,
+                                const int& max_cpy_trial,
+                                const float& rcut_r) {
+  // Scratch copy of the input coordinates; normalization is applied to it.
+  std::vector<int64_t> FPTYPE_temp_shape{nall * 3};
+  paddle::Tensor tmp_coord_tensor;
+  if (std::is_same<FPTYPE, float>::value) {
+    tmp_coord_tensor = paddle::empty(
+        FPTYPE_temp_shape, paddle::DataType::FLOAT32, paddle::GPUPlace());
+  } else if (std::is_same<FPTYPE, double>::value) {
+    tmp_coord_tensor = paddle::empty(
+        FPTYPE_temp_shape, paddle::DataType::FLOAT64, paddle::GPUPlace());
+  } else {
+    PD_THROW("invalid data type");
+  }
+  FPTYPE* tmp_coord = tmp_coord_tensor.data<FPTYPE>();
+  tensor_list->push_back(tmp_coord_tensor);
+  // Check the copy's status instead of silently normalizing garbage on error.
+  PD_CHECK(cudaMemcpy(tmp_coord, coord, sizeof(FPTYPE) * nall * 3,
+                      cudaMemcpyDeviceToDevice) == cudaSuccess,
+           "failed to copy coord into the temporary device buffer");
+
+  // Box data and the cell decomposition are computed on the host first.
+  deepmd::Region<FPTYPE> region;
+  deepmd::init_region_cpu(region, box);
+  FPTYPE box_info[18];
+  std::copy(region.boxt, region.boxt + 9, box_info);
+  std::copy(region.rec_boxt, region.rec_boxt + 9, box_info + 9);
+  int cell_info[23];
+  deepmd::compute_cell_info(cell_info, rcut_r, region);
+  const int loc_cellnum = cell_info[21];
+  const int total_cellnum = cell_info[22];
+  // Device buffer for the 18 box values: boxt followed by rec_boxt.
+  std::vector<int64_t> double_temp_shape{18};
+  paddle::Tensor double_temp_tensor = paddle::empty(
+      double_temp_shape, tmp_coord_tensor.dtype(), paddle::GPUPlace());
+  FPTYPE* box_info_dev = double_temp_tensor.data<FPTYPE>();
+  tensor_list->push_back(double_temp_tensor);
+  // Device ints: 23 cell_info entries, then the int_data_dev region.
+  std::vector<int64_t> int_temp_shape{
+      23 + nloc * 3 + loc_cellnum + total_cellnum * 3 + total_cellnum * 3 +
+      loc_cellnum + 1 + total_cellnum + 1 + nloc};
+  paddle::Tensor int_temp_tensor = paddle::empty(
+      int_temp_shape, paddle::DataType::INT32, paddle::GPUPlace());
+  int* cell_info_dev = int_temp_tensor.data<int>();
+  int* int_data_dev = cell_info_dev + 23;
+  tensor_list->push_back(int_temp_tensor);
+  deepmd::memcpy_host_to_device(box_info_dev, box_info, 18);
+  deepmd::memcpy_host_to_device(cell_info_dev, cell_info, 23);
+
+  // region_dev borrows the device buffers; its own pointers are restored below.
+  deepmd::Region<FPTYPE> region_dev;
+  FPTYPE* new_boxt = region_dev.boxt;
+  FPTYPE* new_rec_boxt = region_dev.rec_boxt;
+  region_dev.boxt = box_info_dev;
+  region_dev.rec_boxt = box_info_dev + 9;
+  deepmd::normalize_coord_gpu(tmp_coord, nall, region_dev);
+  // Retry with a doubled mem_cpy until copy_coord_gpu returns 0.
+  int tt;
+  paddle::Tensor cpy_temp_tensor = paddle::Tensor(paddle::PlaceType::kGPU);
+  paddle::Tensor t_temp_tensor = paddle::Tensor(paddle::PlaceType::kGPU);
+  for (tt = 0; tt < max_cpy_trial; ++tt) {
+    std::vector<int64_t> cpy_temp_shape{mem_cpy * 3};
+    std::vector<int64_t> t_temp_shape{mem_cpy * 2};
+    cpy_temp_tensor.reshape(cpy_temp_shape);
+    coord_cpy = cpy_temp_tensor.mutable_data<FPTYPE>(paddle::PlaceType::kGPU);
+    t_temp_tensor.reshape(t_temp_shape);
+    type_cpy = t_temp_tensor.mutable_data<int>(paddle::PlaceType::kGPU);
+    idx_mapping = type_cpy + mem_cpy;
+    int ret = deepmd::copy_coord_gpu(
+        coord_cpy, type_cpy, idx_mapping, &nall, int_data_dev, tmp_coord, type,
+        nloc, mem_cpy, loc_cellnum, total_cellnum, cell_info_dev, region_dev);
+    if (ret == 0) {
+      break;
+    } else {
+      mem_cpy *= 2;
+    }
+  }
+  tensor_list->push_back(cpy_temp_tensor);
+  tensor_list->push_back(t_temp_tensor);
+  region_dev.boxt = new_boxt;
+  region_dev.rec_boxt = new_rec_boxt;
+  return (tt != max_cpy_trial);
+}
+
+// Builds the neighbor list on the GPU with deepmd::build_nlist_gpu, doubling
+// the per-atom capacity mem_nnei after every failed attempt.
+// Returns 1 if an attempt succeeded within max_nnei_trial tries, else 0.
+template <typename FPTYPE>
+static int _build_nlist_gpu(std::vector<paddle::Tensor>* tensor_list,
+                            int*& ilist,
+                            int*& numneigh,
+                            int**& firstneigh,
+                            int*& jlist,
+                            int& max_nnei,
+                            int& mem_nnei,
+                            const FPTYPE* coord,
+                            const int& nloc,
+                            const int& new_nall,
+                            const int& max_nnei_trial,
+                            const float& rcut_r) {
+  // ilist and numneigh share one int buffer: [0, nloc) and [nloc, 2 * nloc).
+  paddle::Tensor index_buf =
+      paddle::empty(std::vector<int64_t>{nloc * 2}, paddle::DataType::INT32,
+                    paddle::GPUPlace());
+  ilist = index_buf.data<int>();
+  numneigh = ilist + nloc;
+  tensor_list->push_back(index_buf);
+
+  std::vector<int*> row_ptrs(nloc);
+  paddle::Tensor jlist_buf = paddle::Tensor(paddle::PlaceType::kGPU);
+  int trial = 0;
+  for (; trial < max_nnei_trial; ++trial) {
+    // 3 * nloc * mem_nnei ints: the jlist rows first, then the ind_data area.
+    jlist_buf.reshape(std::vector<int64_t>{3 * nloc * mem_nnei});
+    jlist = jlist_buf.mutable_data<int>(paddle::PlaceType::kGPU);
+    int* ind_area = jlist + nloc * mem_nnei;
+    // firstneigh[i] receives the device address of row i of jlist.
+    for (int atom = 0; atom < nloc; ++atom) {
+      row_ptrs[atom] = jlist + atom * mem_nnei;
+    }
+    deepmd::memcpy_host_to_device(firstneigh, row_ptrs);
+    deepmd::InputNlist inlist(nloc, ilist, numneigh, firstneigh);
+    if (deepmd::build_nlist_gpu(inlist, &max_nnei, ind_area, coord, nloc,
+                                new_nall, mem_nnei, rcut_r) == 0) {
+      break;
+    }
+    mem_nnei *= 2;
+  }
+  tensor_list->push_back(jlist_buf);
+  return trial != max_nnei_trial;
+}
+
+static void _map_nlist_gpu(int* nlist,  // wrapper around use_nlist_map
+                           const int* idx_mapping,
+                           const int& nloc,
+                           const int& nnei) {
+  deepmd::use_nlist_map(nlist, idx_mapping, nloc, nnei);  // plain pass-through
+}
+
+// Fills the neighbor-list outputs used by the GPU env-mat code path.
+//  - nei_mode != 3: the list is built here by _build_nlist_gpu; for
+//    nei_mode == 1 the coordinates are normalized and copied first.
+//  - nei_mode == 3: deepmd::env_mat_nbor_update is run on the mesh data.
+template <typename FPTYPE>
+static void _prepare_coord_nlist_gpu(std::vector<paddle::Tensor>* tensor_list,
+                                     FPTYPE const** coord,
+                                     FPTYPE*& coord_cpy,
+                                     int const** type,
+                                     int*& type_cpy,
+                                     int*& idx_mapping,
+                                     deepmd::InputNlist& inlist,
+                                     int*& ilist,
+                                     int*& numneigh,
+                                     int**& firstneigh,
+                                     int*& jlist,
+                                     int*& nbor_list_dev,
+                                     int& new_nall,
+                                     int& mem_cpy,
+                                     int& mem_nnei,
+                                     int& max_nbor_size,
+                                     const FPTYPE* box,
+                                     const int* mesh_tensor_data,
+                                     const int mesh_tensor_size,
+                                     const int& nloc,
+                                     const int& nei_mode,
+                                     const float& rcut_r,
+                                     const int& max_cpy_trial,
+                                     const int& max_nnei_trial) {
+  inlist.inum = nloc;
+  if (nei_mode != 3) {
+    // nei_mode == 1: redirect *coord / *type to the normalized, copied data.
+    if (nei_mode == 1) {
+      int copy_ok = _norm_copy_coord_gpu(
+          tensor_list, coord_cpy, type_cpy, idx_mapping, new_nall, mem_cpy,
+          *coord, box, *type, nloc, max_cpy_trial, rcut_r);
+      PD_CHECK(copy_ok, "cannot allocate mem for copied coords");
+      *coord = coord_cpy;
+      *type = type_cpy;
+    }
+    // Build the list; max_nbor_size is then bucketed to 1024, 2048 or 4096.
+    int build_ok = _build_nlist_gpu(tensor_list, ilist, numneigh, firstneigh,
+                                    jlist, max_nbor_size, mem_nnei, *coord,
+                                    nloc, new_nall, max_nnei_trial, rcut_r);
+    PD_CHECK(build_ok, "cannot allocate mem for nlist");
+    if (max_nbor_size <= 1024) {
+      max_nbor_size = 1024;
+    } else if (max_nbor_size <= 2048) {
+      max_nbor_size = 2048;
+    } else {
+      max_nbor_size = 4096;
+    }
+    inlist.ilist = ilist;
+    inlist.numneigh = numneigh;
+    inlist.firstneigh = firstneigh;
+  } else {
+    // Presumably an externally built list (the error text mentions lammps).
+    deepmd::InputNlist inlist_temp;
+    inlist_temp.inum = nloc;
+    deepmd::env_mat_nbor_update(inlist_temp, inlist, max_nbor_size,
+                                nbor_list_dev, mesh_tensor_data,
+                                mesh_tensor_size);
+    // Neighbor counts above GPU_MAX_NBOR_SIZE are rejected.
+    PD_CHECK((max_numneigh(inlist_temp) <= GPU_MAX_NBOR_SIZE),
+             "Assert failed, max neighbor size of atom(lammps) " +
+                 std::to_string(max_numneigh(inlist_temp)) +
+                 " is larger than " + std::to_string(GPU_MAX_NBOR_SIZE) +
+                 ", which currently is not supported by deepmd-kit.");
+  }
+}
diff --git a/source/lib/paddle_src/paddle_prod_force.cc b/source/lib/paddle_src/paddle_prod_force.cc
new file mode 100644
index 0000000000..f077d3e51c
--- /dev/null
+++ b/source/lib/paddle_src/paddle_prod_force.cc
@@ -0,0 +1,142 @@
+#include "paddle/extension.h"
+#include "prod_force.h"
+
+#define CHECK_INPUT(x) PD_CHECK(x.is_cpu(), #x " must be a CPU Tensor.")
+#define CHECK_INPUT_DIM(x, value) \
+  PD_CHECK(x.shape().size() == value, #x "'s dim should be " #value ".")
+
+template <typename data_t>
+void ProdForceSeAOpForwardCPUKernel(int nloc,
+                                    int nall,
+                                    int nframes,
+                                    int ndescrpt,
+                                    int nnei,
+                                    data_t* p_force,
+                                    const data_t* p_net_deriv,
+                                    const data_t* p_in_deriv,
+                                    const int* p_nlist) {
+  // One deepmd::prod_force_a_cpu call per frame, on that frame's slices.
+  const int net_len = nloc * ndescrpt;  // net_deriv elements per frame
+  for (int frame = 0; frame < nframes; ++frame) {
+    deepmd::prod_force_a_cpu(
+        p_force + frame * nall * 3, p_net_deriv + frame * net_len,
+        p_in_deriv + frame * net_len * 3, p_nlist + frame * nloc * nnei, nloc,
+        nall, nnei, 0);
+  }
+}
+
+// CPU forward of prod_force_se_a: validates the inputs and fills a
+// [nframes, 3 * nall] force tensor via ProdForceSeAOpForwardCPUKernel.
+// n_a_sel and n_r_sel are accepted for the op signature but not used here.
+std::vector<paddle::Tensor> ProdForceSeAOpCPUForward(
+    const paddle::Tensor& net_deriv_tensor,
+    const paddle::Tensor& in_deriv_tensor,
+    const paddle::Tensor& nlist_tensor,
+    const paddle::Tensor& natoms_tensor,
+    int n_a_sel,
+    int n_r_sel) {
+  CHECK_INPUT(net_deriv_tensor);
+  CHECK_INPUT(in_deriv_tensor);
+  CHECK_INPUT(nlist_tensor);
+  CHECK_INPUT(natoms_tensor);
+  CHECK_INPUT_DIM(net_deriv_tensor, 2);
+  CHECK_INPUT_DIM(in_deriv_tensor, 2);
+  CHECK_INPUT_DIM(nlist_tensor, 2);
+  CHECK_INPUT_DIM(natoms_tensor, 1);
+  PD_CHECK(natoms_tensor.shape()[0] >= 3,
+           "number of atoms should be larger than (or equal to) 3");
+
+  const int* natoms = natoms_tensor.data<int>();
+  int nloc = natoms[0];
+  int nall = natoms[1];
+  int nframes = net_deriv_tensor.shape()[0];
+  // Guard the divisions: nloc == 0 would otherwise be a division by zero.
+  int ndescrpt = nloc > 0 ? net_deriv_tensor.shape()[1] / nloc : 0;
+  int nnei = nloc > 0 ? nlist_tensor.shape()[1] / nloc : 0;
+
+  PD_CHECK(nframes == in_deriv_tensor.shape()[0],
+           "number of samples should match");
+  PD_CHECK(nframes == nlist_tensor.shape()[0],
+           "number of samples should match");
+  PD_CHECK(nloc * ndescrpt * 3 == in_deriv_tensor.shape()[1],
+           "number of descriptors should match");
+  // These used to be assert()s, which vanish under NDEBUG; a mismatch would
+  // let the kernel index past the end of its inputs, so always enforce them.
+  PD_CHECK(nloc * ndescrpt == net_deriv_tensor.shape()[1],
+           "net_deriv's second dim should be nloc * ndescrpt");
+  PD_CHECK(nloc * nnei == nlist_tensor.shape()[1],
+           "nlist's second dim should be nloc * nnei");
+  PD_CHECK(nnei * 4 == ndescrpt, "ndescrpt should be 4 * nnei");
+
+  std::vector<int64_t> force_shape{nframes, 3 * nall};
+  paddle::Tensor force_tensor = paddle::empty(
+      force_shape, net_deriv_tensor.dtype(), net_deriv_tensor.place());
+
+  PD_DISPATCH_FLOATING_TYPES(
+      net_deriv_tensor.type(), "prod_force_se_a_cpu_forward_kernel", ([&] {
+        ProdForceSeAOpForwardCPUKernel<data_t>(
+            nloc, nall, nframes, ndescrpt, nnei,
+            force_tensor.mutable_data<data_t>(),
+            net_deriv_tensor.data<data_t>(), in_deriv_tensor.data<data_t>(),
+            nlist_tensor.data<int>());
+      }));
+
+  return {force_tensor};
+}
+
+std::vector<paddle::Tensor> ProdForceSeAOpCUDAForward(  // defined in the .cu
+    const paddle::Tensor& net_deriv_tensor,
+    const paddle::Tensor& in_deriv_tensor,
+    const paddle::Tensor& nlist_tensor,
+    const paddle::Tensor& natoms_tensor,
+    int n_a_sel,
+    int n_r_sel);
+// Op entry point: picks the CUDA or CPU forward from net_deriv's device type.
+std::vector<paddle::Tensor> ProdForceSeAForward(
+    const paddle::Tensor& net_deriv_tensor,
+    const paddle::Tensor& in_deriv_tensor,
+    const paddle::Tensor& nlist_tensor,
+    const paddle::Tensor& natoms_tensor,
+    int n_a_sel,
+    int n_r_sel) {
+  if (net_deriv_tensor.is_gpu()) {  // any GPU id; GPUPlace() only matched #0
+    return ProdForceSeAOpCUDAForward(  // natoms is read on host: blocking copy
+        net_deriv_tensor, in_deriv_tensor, nlist_tensor,
+        natoms_tensor.copy_to(paddle::CPUPlace(), true), n_a_sel, n_r_sel);
+  } else if (net_deriv_tensor.is_cpu()) {
+    return ProdForceSeAOpCPUForward(net_deriv_tensor, in_deriv_tensor,
+                                    nlist_tensor, natoms_tensor, n_a_sel,
+                                    n_r_sel);
+  } else {
+    PD_THROW("No Such kernel for ProdForceSeAForward.");
+  }
+}
+
+std::vector<std::vector<int64_t>> ProdForceSeAInferShape(
+    std::vector<int64_t> net_deriv_shape,
+    std::vector<int64_t> in_deriv_shape,
+    std::vector<int64_t> nlist_shape,
+    std::vector<int64_t> natoms_shape,
+    const int& n_a_sel,
+    const int& n_r_sel) {
+  // nall is natoms[1], a tensor *value* that shape inference cannot see, so
+  // report 3 * nall as dynamic (-1) instead of the former hard-coded 192.
+  const std::vector<int64_t> force_shape{net_deriv_shape[0], -1};
+  return {force_shape};
+}
+
+std::vector<paddle::DataType> ProdForceSeAInferDtype(
+    paddle::DataType net_deriv_dtype,
+    paddle::DataType in_deriv_dtype,
+    paddle::DataType nlist_dtype,
+    paddle::DataType natoms_dtype) {
+  return {net_deriv_dtype};  // the single output takes net_deriv's dtype
+}
+
+PD_BUILD_OP(prod_force_se_a)  // forward op: 4 tensor inputs -> force
+    .Inputs({"net_deriv", "in_deriv", "nlist", "natoms"})
+    .Outputs({"force"})
+    .Attrs({"n_a_sel: int", "n_r_sel: int"})  // passed to the kernel as ints
+    .SetKernelFn(PD_KERNEL(ProdForceSeAForward))
+    .SetInferShapeFn(PD_INFER_SHAPE(ProdForceSeAInferShape))
+    .SetInferDtypeFn(PD_INFER_DTYPE(ProdForceSeAInferDtype));
diff --git a/source/lib/paddle_src/paddle_prod_force.cu b/source/lib/paddle_src/paddle_prod_force.cu
new file mode 100644
index 0000000000..a767ce6f3f
--- /dev/null
+++ b/source/lib/paddle_src/paddle_prod_force.cu
@@ -0,0 +1,260 @@
+#include "device.h"
+#include "gpu_cuda.h"
+#include "paddle/extension.h"
+#include "prod_force.h"
+
+#define CHECK_INPUT(x) PD_CHECK(x.is_gpu(), #x " must be a GPU Tensor.")
+#define CHECK_INPUT_ON_CPU(x) PD_CHECK(x.is_cpu(), #x " must be a CPU Tensor.")
+#define CHECK_INPUT_DIM(x, value) \
+  PD_CHECK(x.shape().size() == value, #x "'s dim should be " #value ".")
+
+// Block b: force[b][j] -= sum_i net_deriv[b][i] * in_deriv[b][i][j], j in 0..2
+template <typename FPTYPE, int THREADS_PER_BLOCK>
+__global__ void force_deriv_wrt_center_atom(FPTYPE* force,
+                                            const FPTYPE* net_deriv,
+                                            const FPTYPE* in_deriv,
+                                            const int ndescrpt) {
+  __shared__ FPTYPE data[THREADS_PER_BLOCK * 3];  // [axis][thread] partials
+  int_64 bid = blockIdx.x;
+  unsigned int tid = threadIdx.x;
+  for (int ii = tid; ii < THREADS_PER_BLOCK * 3; ii += THREADS_PER_BLOCK) {
+    data[ii] = 0.f;
+  }
+  for (int ii = tid; ii < ndescrpt; ii += THREADS_PER_BLOCK) {
+    for (int jj = 0; jj < 3; jj++) {
+      data[jj * THREADS_PER_BLOCK + tid] +=
+          net_deriv[bid * ndescrpt + ii] *
+          in_deriv[bid * ndescrpt * 3 + ii * 3 + jj];
+    }
+  }
+  __syncthreads();
+  // Tree reduction in shared memory (assumes THREADS_PER_BLOCK is a power of 2)
+  for (int ii = THREADS_PER_BLOCK >> 1; ii > 0; ii >>= 1) {
+    if (tid < ii) {
+      for (int jj = 0; jj < 3; jj++) {
+        data[jj * THREADS_PER_BLOCK + tid] +=
+            data[jj * THREADS_PER_BLOCK + tid + ii];
+      }
+    }
+    __syncthreads();
+  }
+  if (tid == 0) {  // thread 0 writes this block's result to global memory
+    force[bid * 3 + 0] -= data[THREADS_PER_BLOCK * 0];
+    force[bid * 3 + 1] -= data[THREADS_PER_BLOCK * 1];
+    force[bid * 3 + 2] -= data[THREADS_PER_BLOCK * 2];
+  }
+}
+
+// Thread (idx, idy, idz) atomically adds into force[nlist[idx][idy]][idz].
+template <typename FPTYPE>
+__global__ void force_deriv_wrt_neighbors_a(FPTYPE* force,
+                                            const FPTYPE* net_deriv,
+                                            const FPTYPE* in_deriv,
+                                            const int* nlist,
+                                            const int nloc,  // unused
+                                            const int nnei) {
+  const int_64 idx = blockIdx.x;
+  const unsigned int idy = blockIdx.y * blockDim.x + threadIdx.x;
+  const unsigned int idz = threadIdx.y;  // force component, 0..2
+  const int ndescrpt = nnei * 4;
+  if (idy >= nnei) {  // padding threads of the last y-block
+    return;
+  }
+  int j_idx = nlist[idx * nnei + idy];
+  if (j_idx < 0) {  // negative entries are skipped
+    return;
+  }
+  // Sum over the 4 descriptor values stored for neighbor slot idy.
+  FPTYPE force_tmp = 0.f;
+  for (int idw = 0; idw < 4; ++idw) {
+    force_tmp += net_deriv[idx * ndescrpt + idy * 4 + idw] *
+                 in_deriv[idx * ndescrpt * 3 + (idy * 4 + idw) * 3 + idz];
+  }
+  atomicAdd(force + j_idx * 3 + idz, force_tmp);
+}
+
+// Thread (idx, idy, idz) atomically adds into force[nlist[idx][idy]][idz].
+template <typename FPTYPE>
+__global__ void force_deriv_wrt_neighbors_r(FPTYPE* force,
+                                            const FPTYPE* net_deriv,
+                                            const FPTYPE* in_deriv,
+                                            const int* nlist,
+                                            const int nloc,  // unused
+                                            const int nnei) {
+  const int_64 idx = blockIdx.x;
+  const unsigned int idy = blockIdx.y * blockDim.x + threadIdx.x;
+  const unsigned int idz = threadIdx.y;  // force component, 0..2
+  const int ndescrpt = nnei * 1;
+  if (idy >= nnei) {  // padding threads of the last y-block
+    return;
+  }
+  int j_idx = nlist[idx * nnei + idy];
+  if (j_idx < 0) {  // negative entries are skipped
+    return;
+  }
+  // One descriptor value per neighbor slot, so a single product is added.
+  atomicAdd(force + j_idx * 3 + idz,
+            net_deriv[idx * ndescrpt + idy] *
+                in_deriv[idx * ndescrpt * 3 + idy * 3 + idz]);
+}
+
+namespace deepmd {
+// se_a: zeroes force, then launches the center-atom and neighbor kernels.
+template <typename FPTYPE>
+void prod_force_a_gpu_cuda(FPTYPE* force,
+                           const FPTYPE* net_deriv,
+                           const FPTYPE* in_deriv,
+                           const int* nlist,
+                           const int nloc,
+                           const int nall,
+                           const int nnei) {
+  const int ndescrpt = nnei * 4;
+  DPErrcheck(cudaMemset(force, 0, sizeof(FPTYPE) * nall * 3));
+  if (nloc <= 0 || nnei <= 0) return;  // a zero-sized grid cannot be launched
+  force_deriv_wrt_center_atom<FPTYPE, TPB>
+      <<<nloc, TPB>>>(force, net_deriv, in_deriv, ndescrpt);
+  DPErrcheck(cudaGetLastError());
+  DPErrcheck(cudaDeviceSynchronize());
+  const int LEN = 64;
+  const int nblock = (nnei + LEN - 1) / LEN;
+  dim3 block_grid(nloc, nblock);
+  dim3 thread_grid(LEN, 3);
+  force_deriv_wrt_neighbors_a<<<block_grid, thread_grid>>>(
+      force, net_deriv, in_deriv, nlist, nloc, nnei);
+  DPErrcheck(cudaGetLastError());
+  DPErrcheck(cudaDeviceSynchronize());
+}
+
+// se_r: same flow as above with one descriptor value per neighbor.
+template <typename FPTYPE>
+void prod_force_r_gpu_cuda(FPTYPE* force,
+                           const FPTYPE* net_deriv,
+                           const FPTYPE* in_deriv,
+                           const int* nlist,
+                           const int nloc,
+                           const int nall,
+                           const int nnei) {
+  const int ndescrpt = nnei * 1;
+  DPErrcheck(cudaMemset(force, 0, sizeof(FPTYPE) * nall * 3));
+  if (nloc <= 0 || nnei <= 0) return;  // a zero-sized grid cannot be launched
+  force_deriv_wrt_center_atom<FPTYPE, TPB>
+      <<<nloc, TPB>>>(force, net_deriv, in_deriv, ndescrpt);
+  DPErrcheck(cudaGetLastError());
+  DPErrcheck(cudaDeviceSynchronize());
+  const int LEN = 64;
+  const int nblock = (nnei + LEN - 1) / LEN;
+  dim3 block_grid(nloc, nblock);
+  dim3 thread_grid(LEN, 3);
+  force_deriv_wrt_neighbors_r<<<block_grid, thread_grid>>>(
+      force, net_deriv, in_deriv, nlist, nloc, nnei);
+  DPErrcheck(cudaGetLastError());
+  DPErrcheck(cudaDeviceSynchronize());
+}
+
+template void prod_force_a_gpu_cuda<float>(float* force,
+                                           const float* net_deriv,
+                                           const float* in_deriv,
+                                           const int* nlist,
+                                           const int nloc,
+                                           const int nall,
+                                           const int nnei);
+template void prod_force_a_gpu_cuda<double>(double* force,
+                                            const double* net_deriv,
+                                            const double* in_deriv,
+                                            const int* nlist,
+                                            const int nloc,
+                                            const int nall,
+                                            const int nnei);
+template void prod_force_r_gpu_cuda<float>(float* force,
+                                           const float* net_deriv,
+                                           const float* in_deriv,
+                                           const int* nlist,
+                                           const int nloc,
+                                           const int nall,
+                                           const int nnei);
+template void prod_force_r_gpu_cuda<double>(double* force,
+                                            const double* net_deriv,
+                                            const double* in_deriv,
+                                            const int* nlist,
+                                            const int nloc,
+                                            const int nall,
+                                            const int nnei);
+}  // namespace deepmd
+
+template <typename data_t>
+void ProdForceSeAOpForwardCUDAKernel(int nloc,
+                                     int nall,
+                                     int nframes,
+                                     int ndescrpt,
+                                     int nnei,
+                                     data_t* p_force,
+                                     const data_t* p_net_deriv,
+                                     const data_t* p_in_deriv,
+                                     const int* p_nlist) {
+  // One deepmd::prod_force_a_gpu_cuda call per frame, on that frame's slices.
+  const int net_len = nloc * ndescrpt;  // net_deriv elements per frame
+  for (int frame = 0; frame < nframes; ++frame) {
+    deepmd::prod_force_a_gpu_cuda(
+        p_force + frame * nall * 3, p_net_deriv + frame * net_len,
+        p_in_deriv + frame * net_len * 3, p_nlist + frame * nloc * nnei, nloc,
+        nall, nnei);
+  }
+}
+
+// CUDA forward of prod_force_se_a. net_deriv, in_deriv and nlist must be GPU
+// tensors; natoms must already be on the CPU because it is read on the host.
+// n_a_sel and n_r_sel are accepted for the op signature but not used here.
+std::vector<paddle::Tensor> ProdForceSeAOpCUDAForward(
+    const paddle::Tensor& net_deriv_tensor,
+    const paddle::Tensor& in_deriv_tensor,
+    const paddle::Tensor& nlist_tensor,
+    const paddle::Tensor& natoms_tensor,
+    int n_a_sel,
+    int n_r_sel) {
+  CHECK_INPUT(net_deriv_tensor);
+  CHECK_INPUT(in_deriv_tensor);
+  CHECK_INPUT(nlist_tensor);
+  CHECK_INPUT_DIM(net_deriv_tensor, 2);
+  CHECK_INPUT_DIM(in_deriv_tensor, 2);
+  CHECK_INPUT_DIM(nlist_tensor, 2);
+  CHECK_INPUT_DIM(natoms_tensor, 1);
+  CHECK_INPUT_ON_CPU(natoms_tensor);
+  PD_CHECK(natoms_tensor.shape()[0] >= 3,
+           "number of atoms should be larger than (or equal to) 3");
+
+  const int* natoms = natoms_tensor.data<int>();
+  int nloc = natoms[0];
+  int nall = natoms[1];
+  int nframes = net_deriv_tensor.shape()[0];
+  // Guard the divisions: nloc == 0 would otherwise be a division by zero.
+  int ndescrpt = nloc > 0 ? net_deriv_tensor.shape()[1] / nloc : 0;
+  int nnei = nloc > 0 ? nlist_tensor.shape()[1] / nloc : 0;
+
+  PD_CHECK(nframes == in_deriv_tensor.shape()[0],
+           "number of samples should match");
+  PD_CHECK(nframes == nlist_tensor.shape()[0],
+           "number of samples should match");
+  PD_CHECK(nloc * ndescrpt * 3 == in_deriv_tensor.shape()[1],
+           "number of descriptors should match");
+  // These used to be assert()s, which vanish under NDEBUG; a mismatch would
+  // let the kernels index past the end of their inputs, so always enforce.
+  PD_CHECK(nloc * ndescrpt == net_deriv_tensor.shape()[1],
+           "net_deriv's second dim should be nloc * ndescrpt");
+  PD_CHECK(nloc * nnei == nlist_tensor.shape()[1],
+           "nlist's second dim should be nloc * nnei");
+  PD_CHECK(nnei * 4 == ndescrpt, "ndescrpt should be 4 * nnei");
+
+  std::vector<int64_t> force_shape{nframes, 3 * nall};
+  paddle::Tensor force_tensor = paddle::empty(
+      force_shape, net_deriv_tensor.dtype(), net_deriv_tensor.place());
+
+  PD_DISPATCH_FLOATING_TYPES(
+      net_deriv_tensor.type(), "prod_force_se_a_gpu_forward_kernel", ([&] {
+        ProdForceSeAOpForwardCUDAKernel<data_t>(
+            nloc, nall, nframes, ndescrpt, nnei, force_tensor.data<data_t>(),
+            net_deriv_tensor.data<data_t>(), in_deriv_tensor.data<data_t>(),
+            nlist_tensor.data<int>());
+      }));
+
+  return {force_tensor};
+}
diff --git a/source/lib/paddle_src/paddle_prod_force_grad.cc b/source/lib/paddle_src/paddle_prod_force_grad.cc
new file mode 100644
index 0000000000..bd6509c017
--- /dev/null
+++ b/source/lib/paddle_src/paddle_prod_force_grad.cc
@@ -0,0 +1,141 @@
+#include "paddle/extension.h"
+#include "prod_force_grad.h"
+
+#define CHECK_INPUT_READY(x) \
+  PD_CHECK(x.initialized(), #x " must be initialized before usage.")
+#define CHECK_INPUT_DIM(x, value) \
+  PD_CHECK(x.shape().size() == value, #x "'s dim should be " #value ".")
+#define CHECK_INPUT(x) PD_CHECK(x.is_gpu(), #x " must be a GPU Tensor.")
+
+// Backward helper for prod_force_se_a on the CPU.
+//
+// Calls deepmd::prod_force_grad_a_cpu once per frame with pointers offset to
+// that frame's slice of each flattened buffer:
+//   grad_net : nloc * ndescrpt values per frame (output)
+//   grad     : nloc * 3 values per frame
+//   in_deriv : nloc * ndescrpt * 3 values per frame
+//   nlist    : nloc * nnei entries per frame
+// Frames are assumed to be laid out back to back in every buffer.
+// net_deriv is accepted but not read. The frame loop is serial; an OpenMP
+// variant existed only as disabled code.
+template <typename data_t>
+void ProdForceSeAOpCPUBackwardKernel(int nloc,
+                                     int nframes,
+                                     int ndescrpt,
+                                     int nnei,
+                                     const data_t* grad,
+                                     const data_t* net_deriv,
+                                     const data_t* in_deriv,
+                                     const int* nlist,
+                                     data_t* grad_net) {
+  // Per-frame extents of the flattened buffers.
+  const int grad_net_stride = nloc * ndescrpt;
+  const int grad_stride = nloc * 3;
+  const int in_deriv_stride = grad_net_stride * 3;
+  const int nlist_stride = nloc * nnei;
+
+  for (int frame = 0; frame < nframes; ++frame) {
+    data_t* frame_grad_net = grad_net + frame * grad_net_stride;
+    const data_t* frame_grad = grad + frame * grad_stride;
+    const data_t* frame_in_deriv = in_deriv + frame * in_deriv_stride;
+    const int* frame_nlist = nlist + frame * nlist_stride;
+
+    deepmd::prod_force_grad_a_cpu(frame_grad_net, frame_grad, frame_in_deriv,
+                                  frame_nlist, nloc, nnei);
+  }
+}
+
+// CPU backward of prod_force_se_a: validates shapes and returns the gradient
+// w.r.t. net_deriv as a [nframes, nloc * ndescrpt] tensor with grad's dtype
+// and place.
+std::vector<paddle::Tensor> ProdForceSeAOpCPUBackward(
+    const paddle::Tensor& grad_tensor,
+    const paddle::Tensor& net_deriv_tensor,
+    const paddle::Tensor& in_deriv_tensor,
+    const paddle::Tensor& nlist_tensor,
+    const paddle::Tensor& natoms_tensor,
+    int n_a_sel,
+    int n_r_sel) {
+  CHECK_INPUT_READY(grad_tensor);
+  CHECK_INPUT_READY(net_deriv_tensor);
+  CHECK_INPUT_READY(in_deriv_tensor);
+  CHECK_INPUT_READY(nlist_tensor);
+  CHECK_INPUT_READY(natoms_tensor);
+
+  auto grad_shape = grad_tensor.shape();
+  auto net_deriv_shape = net_deriv_tensor.shape();
+  auto in_deriv_shape = in_deriv_tensor.shape();
+  auto nlist_shape = nlist_tensor.shape();
+  auto natoms_shape = natoms_tensor.shape();
+  CHECK_INPUT_DIM(grad_tensor, 2);
+  CHECK_INPUT_DIM(net_deriv_tensor, 2);
+  CHECK_INPUT_DIM(in_deriv_tensor, 2);
+  CHECK_INPUT_DIM(nlist_tensor, 2);
+  CHECK_INPUT_DIM(natoms_tensor, 1);
+  PD_CHECK(natoms_shape[0] >= 3,
+           "number of atoms should be larger than (or equal to) 3");
+
+  const int* natoms = natoms_tensor.data<int>();
+  int nframes = net_deriv_shape[0];
+  int nloc = natoms[0];
+  // nloc == 0 must not reach the divisions; the checks below then report it.
+  int ndescrpt = nloc > 0 ? net_deriv_shape[1] / nloc : 0;
+  int nnei = nloc > 0 ? nlist_shape[1] / nloc : 0;
+
+  PD_CHECK(nframes == grad_shape[0], "number of frames should match");
+  PD_CHECK(nframes == in_deriv_shape[0], "number of samples should match");
+  PD_CHECK(nframes == nlist_shape[0], "number of samples should match");
+  PD_CHECK((nloc * 3) == grad_shape[1],
+           "input grad shape should be 3 x natoms");
+  PD_CHECK(nloc * ndescrpt * 3 == in_deriv_shape[1],
+           "number of descriptors should match");
+  PD_CHECK(nnei == (n_a_sel + n_r_sel), "number of neighbors should match");
+
+  std::vector<int64_t> grad_net_shape{nframes, (int64_t)nloc * ndescrpt};
+  paddle::Tensor grad_net_tensor =
+      paddle::empty(grad_net_shape, grad_tensor.dtype(), grad_tensor.place());
+  PD_DISPATCH_FLOATING_TYPES(
+      grad_tensor.type(), "prod_force_se_a_cpu_backward_kernel", ([&] {
+        ProdForceSeAOpCPUBackwardKernel<data_t>(
+            nloc, nframes, ndescrpt, nnei, grad_tensor.data<data_t>(),
+            net_deriv_tensor.data<data_t>(), in_deriv_tensor.data<data_t>(),
+            nlist_tensor.data<int>(), grad_net_tensor.data<data_t>());
+      }));
+  return {grad_net_tensor};
+}
+
+std::vector<paddle::Tensor> ProdForceSeAOpCUDABackward(  // defined in the .cu
+    const paddle::Tensor& force_grad_tensor,
+    const paddle::Tensor& net_deriv_tensor,
+    const paddle::Tensor& in_deriv_tensor,
+    const paddle::Tensor& nlist_tensor,
+    const paddle::Tensor& natoms_tensor,
+    int n_a_sel,
+    int n_r_sel);
+// Grad-op entry point: picks the CUDA or CPU backward from net_deriv's device.
+std::vector<paddle::Tensor> ProdForceSeABackward(
+    const paddle::Tensor& force_grad_tensor,
+    const paddle::Tensor& net_deriv_tensor,
+    const paddle::Tensor& in_deriv_tensor,
+    const paddle::Tensor& nlist_tensor,
+    const paddle::Tensor& natoms_tensor,
+    int n_a_sel,
+    int n_r_sel) {
+  if (net_deriv_tensor.is_gpu()) {  // any GPU id; GPUPlace() only matched #0
+    return ProdForceSeAOpCUDABackward(  // natoms is read on host: blocking copy
+        force_grad_tensor, net_deriv_tensor, in_deriv_tensor, nlist_tensor,
+        natoms_tensor.copy_to(paddle::CPUPlace(), true), n_a_sel, n_r_sel);
+  } else if (net_deriv_tensor.is_cpu()) {
+    return ProdForceSeAOpCPUBackward(force_grad_tensor, net_deriv_tensor,
+                                     in_deriv_tensor, nlist_tensor,
+                                     natoms_tensor, n_a_sel, n_r_sel);
+  } else {
+    PD_THROW("No Such kernel for ProdForceSeABackward.");
+  }
+}
+
+PD_BUILD_GRAD_OP(prod_force_se_a)  // backward: d(force) -> d(net_deriv)
+    .Inputs({paddle::Grad("force"), "net_deriv", "in_deriv", "nlist", "natoms"})
+    .Outputs({paddle::Grad("net_deriv")})
+    .Attrs({"n_a_sel: int", "n_r_sel: int"})  // same attrs as the forward op
+    .SetKernelFn(PD_KERNEL(ProdForceSeABackward));
diff --git a/source/lib/paddle_src/paddle_prod_force_grad.cu b/source/lib/paddle_src/paddle_prod_force_grad.cu
new file mode 100644
index 0000000000..0504f115e6
--- /dev/null
+++ b/source/lib/paddle_src/paddle_prod_force_grad.cu
@@ -0,0 +1,258 @@
+#include "device.h"
+#include "gpu_cuda.h"
+#include "paddle/extension.h"
+#include "prod_force_grad.h"
+
+// PD_CHECK guard: tensor x must have exactly `value` dimensions.
+#define CHECK_INPUT_DIM(x, value) \
+  PD_CHECK(x.shape().size() == value, #x "'s dim should be " #value ".")
+// PD_CHECK guard: tensor x must be on a GPU.
+#define CHECK_INPUT(x) PD_CHECK(x.is_gpu(), #x " must be a GPU Tensor.")
+// PD_CHECK guard: tensor x must be on the CPU.
+#define CHECK_INPUT_ON_CPU(x) PD_CHECK(x.is_cpu(), #x " must be a CPU Tensor.")
+
+// Device helper: dot product of the two 3-component vectors that start at
+// arr1 and arr2 (elements 0..2 of each are read).
+template <typename FPTYPE>
+__device__ inline FPTYPE dev_dot(const FPTYPE* arr1, const FPTYPE* arr2) {
+  return arr1[0] * arr2[0] + arr1[1] * arr2[1] + arr1[2] * arr2[2];
+}
+
+// Kernel: center-atom contribution to grad_net.
+//
+// For atom `blockIdx.x` and descriptor element
+// `blockIdx.y * blockDim.x + threadIdx.x` (only when < ndescrpt) it subtracts
+// dot(grad[atom], env_deriv[atom][element]) from grad_net[atom][element];
+// both operands of the dot product are 3-component vectors.
+//
+// grad_net:  read-modify-written at [atom * ndescrpt + element].
+// grad:      read at [atom * 3 + component].
+// env_deriv: read at [atom * ndescrpt * 3 + element * 3 + component].
+// NOTE(review): threads 0..2 stage grad_one, so the launch is assumed to use
+// blockDim.x >= 3 (the launchers in this file pass TPB) — confirm TPB.
+template <typename FPTYPE>
+__global__ void force_grad_wrt_center_atom(FPTYPE* grad_net,
+                                           const FPTYPE* grad,
+                                           const FPTYPE* env_deriv,
+                                           const int ndescrpt) {
+  // The three grad components of this block's atom, shared by all threads.
+  __shared__ FPTYPE grad_one[3];
+  int_64 center_idx = blockIdx.x;
+  unsigned int tid = threadIdx.x;
+  if (tid < 3) {
+    grad_one[tid] = grad[center_idx * 3 + tid];
+  }
+  // Make grad_one visible to the whole block before it is used.
+  __syncthreads();
+  unsigned int descrpt_idx = blockIdx.y * blockDim.x + tid;
+  if (descrpt_idx < ndescrpt) {
+    grad_net[center_idx * ndescrpt + descrpt_idx] -= dev_dot(
+        grad_one, env_deriv + center_idx * ndescrpt * 3 + descrpt_idx * 3);
+  }
+}
+
+// Kernel: neighbor contribution to grad_net with 4 descriptor components per
+// neighbor slot.
+//
+// Thread mapping as written: idx (x dimension) is the atom, idy (blockIdx.y)
+// is the neighbor slot, idw (threadIdx.y) is the component within the slot.
+// idw is not bounds-checked here; the launcher in this file uses
+// blockDim.y == 4.
+// For a valid neighbor j it adds dot(grad[j], env_deriv[idx][idy][idw]) to
+// grad_net[idx][idy][idw].
+template <typename FPTYPE>
+__global__ void force_grad_wrt_neighbors_a(FPTYPE* grad_net,
+                                           const FPTYPE* grad,
+                                           const FPTYPE* env_deriv,
+                                           const int* nlist,
+                                           const int nloc,
+                                           const int nnei) {
+  // idy -> nnei
+  const int_64 idx = blockIdx.x * blockDim.x + threadIdx.x;
+  const unsigned int idy = blockIdx.y;
+  const unsigned int idw = threadIdx.y;
+  if (idx >= nloc) {
+    return;
+  }
+  int j_idx = nlist[idx * nnei + idy];
+  // A negative entry marks an unused neighbor slot; nothing to accumulate.
+  if (j_idx < 0) {
+    return;
+  }
+  // Indices at or beyond nloc are folded back into [0, nloc).
+  // NOTE(review): presumably maps non-local atoms onto local ones — confirm.
+  if (j_idx >= nloc) j_idx = j_idx % nloc;
+  grad_net[idx * nnei * 4 + idy * 4 + idw] += dev_dot(
+      grad + j_idx * 3, env_deriv + idx * nnei * 4 * 3 + idy * 4 * 3 + idw * 3);
+}
+
+// Kernel: neighbor contribution to grad_net with 1 descriptor component per
+// neighbor slot.
+//
+// Thread mapping as written: idx (x dimension) is the atom and idy
+// (blockIdx.y) is the neighbor slot. For a valid neighbor j it adds
+// dot(grad[j], env_deriv[idx][idy]) to grad_net[idx][idy].
+template <typename FPTYPE>
+__global__ void force_grad_wrt_neighbors_r(FPTYPE* grad_net,
+                                           const FPTYPE* grad,
+                                           const FPTYPE* env_deriv,
+                                           const int* nlist,
+                                           const int nloc,
+                                           const int nnei) {
+  // idy -> nnei
+  const int_64 idx = blockIdx.x * blockDim.x + threadIdx.x;
+  const unsigned int idy = blockIdx.y;
+  if (idx >= nloc) {
+    return;
+  }
+  int j_idx = nlist[idx * nnei + idy];
+  // A negative entry marks an unused neighbor slot; nothing to accumulate.
+  if (j_idx < 0) {
+    return;
+  }
+  // Indices at or beyond nloc are folded back into [0, nloc).
+  // NOTE(review): presumably maps non-local atoms onto local ones — confirm.
+  if (j_idx >= nloc) j_idx = j_idx % nloc;
+  grad_net[idx * nnei + idy] +=
+      dev_dot(grad + j_idx * 3, env_deriv + idx * nnei * 3 + idy * 3);
+}
+
+namespace deepmd {
+// Host launcher: computes grad_net for one frame with 4 descriptor components
+// per neighbor slot (ndescrpt = nnei * 4).
+//
+// Steps: zero grad_net with cudaMemset, run force_grad_wrt_center_atom, then
+// run force_grad_wrt_neighbors_a. Each launch is followed by an error check
+// and a device-wide synchronize. grad_net is handed to cudaMemset and all
+// pointers are handed to kernels, so they are expected to be device pointers.
+template <typename FPTYPE>
+void prod_force_grad_a_gpu_cuda(FPTYPE* grad_net,
+                                const FPTYPE* grad,
+                                const FPTYPE* env_deriv,
+                                const int* nlist,
+                                const int nloc,
+                                const int nnei) {
+  const int ndescrpt = nnei * 4;
+  DPErrcheck(cudaMemset(grad_net, 0, sizeof(FPTYPE) * nloc * ndescrpt));
+  // One block row per atom; enough TPB-wide blocks in y to cover ndescrpt.
+  const int nblock = (ndescrpt + TPB - 1) / TPB;
+  dim3 block_grid(nloc, nblock);
+  dim3 thread_grid(TPB, 1);
+  force_grad_wrt_center_atom<<<block_grid, thread_grid>>>(grad_net, grad,
+                                                          env_deriv, ndescrpt);
+  DPErrcheck(cudaGetLastError());
+  DPErrcheck(cudaDeviceSynchronize());
+
+  // x covers atoms in chunks of LEN, grid y is the neighbor slot, and
+  // threadIdx.y (0..3) is the component within the slot.
+  const int LEN = 128;
+  const int nblock_ = (nloc + LEN - 1) / LEN;
+  dim3 block_grid_(nblock_, nnei);
+  dim3 thread_grid_(LEN, 4);
+  force_grad_wrt_neighbors_a<<<block_grid_, thread_grid_>>>(
+      grad_net, grad, env_deriv, nlist, nloc, nnei);
+  DPErrcheck(cudaGetLastError());
+  DPErrcheck(cudaDeviceSynchronize());
+}
+
+// Host launcher: same sequence as prod_force_grad_a_gpu_cuda, but with a
+// single descriptor component per neighbor slot (ndescrpt = nnei) and the
+// force_grad_wrt_neighbors_r kernel for the neighbor step.
+template <typename FPTYPE>
+void prod_force_grad_r_gpu_cuda(FPTYPE* grad_net,
+                                const FPTYPE* grad,
+                                const FPTYPE* env_deriv,
+                                const int* nlist,
+                                const int nloc,
+                                const int nnei) {
+  const int ndescrpt = nnei * 1;
+  DPErrcheck(cudaMemset(grad_net, 0, sizeof(FPTYPE) * nloc * ndescrpt));
+  // One block row per atom; enough TPB-wide blocks in y to cover ndescrpt.
+  const int nblock = (ndescrpt + TPB - 1) / TPB;
+  dim3 block_grid(nloc, nblock);
+  dim3 thread_grid(TPB, 1);
+  force_grad_wrt_center_atom<<<block_grid, thread_grid>>>(grad_net, grad,
+                                                          env_deriv, ndescrpt);
+  DPErrcheck(cudaGetLastError());
+  DPErrcheck(cudaDeviceSynchronize());
+
+  // x covers atoms in chunks of LEN; grid y is the neighbor slot.
+  const int LEN = 128;
+  const int nblock_ = (nloc + LEN - 1) / LEN;
+  dim3 block_grid_(nblock_, nnei);
+  dim3 thread_grid_(LEN, 1);
+  force_grad_wrt_neighbors_r<<<block_grid_, thread_grid_>>>(
+      grad_net, grad, env_deriv, nlist, nloc, nnei);
+  DPErrcheck(cudaGetLastError());
+  DPErrcheck(cudaDeviceSynchronize());
+}
+
+// Explicit instantiations of both launchers for float and double.
+template void prod_force_grad_a_gpu_cuda<float>(float* grad_net,
+                                                const float* grad,
+                                                const float* env_deriv,
+                                                const int* nlist,
+                                                const int nloc,
+                                                const int nnei);
+template void prod_force_grad_a_gpu_cuda<double>(double* grad_net,
+                                                 const double* grad,
+                                                 const double* env_deriv,
+                                                 const int* nlist,
+                                                 const int nloc,
+                                                 const int nnei);
+template void prod_force_grad_r_gpu_cuda<float>(float* grad_net,
+                                                const float* grad,
+                                                const float* env_deriv,
+                                                const int* nlist,
+                                                const int nloc,
+                                                const int nnei);
+template void prod_force_grad_r_gpu_cuda<double>(double* grad_net,
+                                                 const double* grad,
+                                                 const double* env_deriv,
+                                                 const int* nlist,
+                                                 const int nloc,
+                                                 const int nnei);
+}  // namespace deepmd
+
+// Per-frame driver for the CUDA backward pass of prod_force_se_a.
+//
+// The batch buffers are flattened frame after frame; for every frame the
+// matching slices are handed to deepmd::prod_force_grad_a_gpu_cuda.
+// Per-frame slice sizes: grad_net nloc * ndescrpt, grad nloc * 3,
+// in_deriv nloc * ndescrpt * 3, nlist nloc * nnei.
+// p_net_deriv is accepted but not read.
+template <typename data_t>
+void ProdForceSeAOpCUDABackwardKernel(int nloc,
+                                      int nframes,
+                                      int ndescrpt,
+                                      int nnei,
+                                      const data_t* p_grad,
+                                      const data_t* p_net_deriv,
+                                      const data_t* p_in_deriv,
+                                      const int* p_nlist,
+                                      data_t* p_grad_net) {
+  // Slice strides, formed in 64-bit arithmetic like the original offsets.
+  const int_64 grad_net_stride = static_cast<int_64>(nloc) * ndescrpt;
+  const int_64 grad_stride = static_cast<int_64>(nloc) * 3;
+  const int_64 in_deriv_stride = grad_net_stride * 3;
+  const int_64 nlist_stride = static_cast<int_64>(nloc) * nnei;
+
+  for (int_64 frame = 0; frame < nframes; ++frame) {
+    data_t* frame_grad_net = p_grad_net + frame * grad_net_stride;
+    const data_t* frame_grad = p_grad + frame * grad_stride;
+    const data_t* frame_in_deriv = p_in_deriv + frame * in_deriv_stride;
+    const int* frame_nlist = p_nlist + frame * nlist_stride;
+    deepmd::prod_force_grad_a_gpu_cuda(frame_grad_net, frame_grad,
+                                       frame_in_deriv, frame_nlist, nloc, nnei);
+  }
+}
+
+// CUDA implementation of the prod_force_se_a backward pass.
+//
+// Validates the inputs, allocates the output with the dtype and place of
+// force_grad_tensor and runs the per-frame CUDA driver.
+//
+// force_grad_tensor: GPU, (nframes, nloc * 3), gradient w.r.t. force.
+// net_deriv_tensor:  GPU, (nframes, nloc * ndescrpt).
+// in_deriv_tensor:   GPU, (nframes, nloc * ndescrpt * 3).
+// nlist_tensor:      GPU, (nframes, nloc * nnei), int.
+// natoms_tensor:     CPU, 1-D int with at least 3 entries; entry 0 is read
+//                    as nloc.
+// n_a_sel, n_r_sel:  must add up to nnei.
+// Returns {grad_net} with shape (nframes, nloc * ndescrpt).
+std::vector<paddle::Tensor> ProdForceSeAOpCUDABackward(
+    const paddle::Tensor& force_grad_tensor,
+    const paddle::Tensor& net_deriv_tensor,
+    const paddle::Tensor& in_deriv_tensor,
+    const paddle::Tensor& nlist_tensor,
+    const paddle::Tensor& natoms_tensor,
+    int n_a_sel,
+    int n_r_sel) {
+  // Raw pointers of these tensors are handed to CUDA kernels below, so they
+  // must live on a GPU; natoms is dereferenced on the host.
+  CHECK_INPUT(force_grad_tensor);
+  CHECK_INPUT(net_deriv_tensor);
+  CHECK_INPUT(in_deriv_tensor);
+  CHECK_INPUT(nlist_tensor);
+  CHECK_INPUT_ON_CPU(natoms_tensor);
+
+  CHECK_INPUT_DIM(force_grad_tensor, 2);
+  CHECK_INPUT_DIM(net_deriv_tensor, 2);
+  CHECK_INPUT_DIM(in_deriv_tensor, 2);
+  CHECK_INPUT_DIM(nlist_tensor, 2);
+  CHECK_INPUT_DIM(natoms_tensor, 1);
+
+  auto grad_shape = force_grad_tensor.shape();
+  auto net_deriv_shape = net_deriv_tensor.shape();
+  auto in_deriv_shape = in_deriv_tensor.shape();
+  auto nlist_shape = nlist_tensor.shape();
+  auto natoms_shape = natoms_tensor.shape();
+
+  PD_CHECK(natoms_shape[0] >= 3,
+           "number of atoms should be larger than (or equal to) 3");
+
+  const int* natoms = natoms_tensor.data<int>();
+  int nframes = net_deriv_shape[0];
+  int nloc = natoms[0];
+  // nloc is used as a divisor right below.
+  PD_CHECK(nloc > 0, "number of local atoms should be positive");
+  int ndescrpt = net_deriv_shape[1] / nloc;
+  int nnei = nlist_shape[1] / nloc;
+
+  PD_CHECK(nframes == grad_shape[0], "number of frames should match");
+  PD_CHECK(nframes == in_deriv_shape[0], "number of samples should match");
+  PD_CHECK(nframes == nlist_shape[0], "number of samples should match");
+  PD_CHECK(nloc * 3 == grad_shape[1], "input grad shape should be 3 x natoms");
+  PD_CHECK(nloc * ndescrpt * 3 == in_deriv_shape[1],
+           "number of descriptors should match");
+  PD_CHECK(nnei == (n_a_sel + n_r_sel), "number of neighbors should match");
+
+  // Widen before multiplying so the product is formed in 64 bits.
+  std::vector<int64_t> grad_net_shape{nframes,
+                                      static_cast<int64_t>(nloc) * ndescrpt};
+  paddle::Tensor grad_net_tensor = paddle::empty(
+      grad_net_shape, force_grad_tensor.dtype(), force_grad_tensor.place());
+
+  PD_DISPATCH_FLOATING_TYPES(
+      force_grad_tensor.type(), "prod_force_se_a_cuda_backward_kernel", ([&] {
+        ProdForceSeAOpCUDABackwardKernel<data_t>(
+            nloc, nframes, ndescrpt, nnei, force_grad_tensor.data<data_t>(),
+            net_deriv_tensor.data<data_t>(), in_deriv_tensor.data<data_t>(),
+            nlist_tensor.data<int>(), grad_net_tensor.data<data_t>());
+      }));
+  return {grad_net_tensor};
+}
+
+// std::vector<paddle::Tensor> ProdForceSeABackward(
+//     const paddle::Tensor& force_grad_tensor,
+//     const paddle::Tensor& net_deriv_tensor,
+//     const paddle::Tensor& in_deriv_tensor,
+//     const paddle::Tensor& nlist_tensor,
+//     const paddle::Tensor& natoms_tensor,
+//     int n_a_sel,
+//     int n_r_sel) {
+//   if (net_deriv_tensor.place() == paddle::GPUPlace()) {
+//     return ProdForceSeAOpCUDABackward(force_grad_tensor, net_deriv_tensor,
+//                                       in_deriv_tensor, nlist_tensor,
+//                                       natoms_tensor, n_a_sel, n_r_sel);
+//   }
+//   else if (net_deriv_tensor.place() == paddle::CPUPlace()) {
+//     return ProdForceSeAOpCPUBackward(force_grad_tensor, net_deriv_tensor,
+//                                      in_deriv_tensor, nlist_tensor,
+//                                      natoms_tensor, n_a_sel, n_r_sel);
+//   } else {
+//     PD_THROW("No Such kernel for ProdForceSeABackward.");
+//   }
+// }
+
+// PD_BUILD_GRAD_OP(prod_force_se_a)
+//     .Inputs({paddle::Grad("force"), "net_deriv", "in_deriv", "nlist",
+//     "natoms"}) .Outputs({paddle::Grad("net_deriv")}) .Attrs({"n_a_sel: int",
+//     "n_r_sel: int"}) .SetKernelFn(PD_KERNEL(ProdForceSeABackward));
diff --git a/source/lib/paddle_src/paddle_prod_virial.cc b/source/lib/paddle_src/paddle_prod_virial.cc
new file mode 100644
index 0000000000..6b4cbe324f
--- /dev/null
+++ b/source/lib/paddle_src/paddle_prod_virial.cc
@@ -0,0 +1,155 @@
+#include "paddle/extension.h"
+#include "prod_virial.h"
+
+// PD_CHECK guard: tensor x must be on the CPU.
+#define CHECK_INPUT(x) PD_CHECK(x.is_cpu(), #x " must be a CPU Tensor.")
+// PD_CHECK guard: tensor x must have exactly `value` dimensions.
+#define CHECK_INPUT_DIM(x, value) \
+  PD_CHECK(x.shape().size() == value, #x "'s dim should be " #value ".")
+// PD_CHECK guard: tensor x must be initialized.
+#define CHECK_INPUT_READY(x) \
+  PD_CHECK(x.initialized(), #x " must be initialized before usage.")
+
+// Per-frame driver for the CPU forward pass of prod_virial_se_a.
+//
+// The batch buffers are flattened frame after frame; for every frame the
+// matching slices are handed to deepmd::prod_virial_a_cpu.
+// Per-frame slice sizes: virial 9, atom_virial nall * 9,
+// net_deriv nloc * ndescrpt, in_deriv nloc * ndescrpt * 3,
+// rij nloc * nnei * 3, nlist nloc * nnei.
+template <typename data_t>
+void ProdVirialSeAOpForwardCPUKernel(int nloc,
+                                     int nall,
+                                     int ndescrpt,
+                                     int nnei,
+                                     int nframes,
+                                     data_t* p_virial,
+                                     data_t* p_atom_virial,
+                                     const data_t* p_net_deriv,
+                                     const data_t* p_in_deriv,
+                                     const data_t* p_rij,
+                                     const int* p_nlist) {
+  // A 64-bit frame index makes every offset below evaluate in 64-bit
+  // arithmetic, so large nframes * nloc * ndescrpt * 3 cannot overflow int.
+  for (int64_t kk = 0; kk < nframes; ++kk) {
+    data_t* virial = p_virial + kk * 9;
+    data_t* atom_virial = p_atom_virial + kk * nall * 9;
+    const data_t* net_deriv = p_net_deriv + kk * nloc * ndescrpt;
+    const data_t* in_deriv = p_in_deriv + kk * nloc * ndescrpt * 3;
+    const data_t* rij = p_rij + kk * nloc * nnei * 3;
+    const int* nlist = p_nlist + kk * nloc * nnei;
+    deepmd::prod_virial_a_cpu(virial, atom_virial, net_deriv, in_deriv, rij,
+                              nlist, nloc, nall, nnei);
+  }
+}
+
+// CPU implementation of the prod_virial_se_a forward pass.
+//
+// All inputs must be CPU tensors. Shapes enforced by the checks below:
+//   net_deriv (nframes, nloc * ndescrpt), in_deriv (nframes, nloc * ndescrpt * 3),
+//   rij (nframes, nloc * nnei * 3), nlist 2-D with nframes rows,
+//   natoms 1-D with at least 3 entries (entries 0 and 1 are read as nloc, nall).
+// n_a_sel and n_r_sel are accepted but not used here.
+// Returns {virial (nframes, 9), atom_virial (nframes, 9 * nall)} with the
+// dtype and place of net_deriv_tensor.
+std::vector<paddle::Tensor> ProdVirialSeAOpCPUForward(
+    const paddle::Tensor& net_deriv_tensor,
+    const paddle::Tensor& in_deriv_tensor,
+    const paddle::Tensor& rij_tensor,
+    const paddle::Tensor& nlist_tensor,
+    const paddle::Tensor& natoms_tensor,
+    int n_a_sel,
+    int n_r_sel) {
+  CHECK_INPUT(net_deriv_tensor);
+  CHECK_INPUT(in_deriv_tensor);
+  CHECK_INPUT(rij_tensor);
+  CHECK_INPUT(nlist_tensor);
+  CHECK_INPUT(natoms_tensor);
+
+  CHECK_INPUT_DIM(net_deriv_tensor, 2);
+  CHECK_INPUT_DIM(in_deriv_tensor, 2);
+  CHECK_INPUT_DIM(rij_tensor, 2);
+  CHECK_INPUT_DIM(nlist_tensor, 2);
+  CHECK_INPUT_DIM(natoms_tensor, 1);
+
+  PD_CHECK(natoms_tensor.shape()[0] >= 3,
+           "number of atoms should be larger than (or equal to) 3");
+  const int* natoms = natoms_tensor.data<int>();
+  int nloc = natoms[0];
+  int nall = natoms[1];
+  // NOTE(review): nloc is used as a divisor without a zero check.
+  int nnei = nlist_tensor.shape()[1] / nloc;
+  int nframes = net_deriv_tensor.shape()[0];
+  int ndescrpt = net_deriv_tensor.shape()[1] / nloc;
+
+  PD_CHECK(nframes == in_deriv_tensor.shape()[0],
+           "number of samples should match");
+  PD_CHECK(nframes == rij_tensor.shape()[0], "number of samples should match");
+  PD_CHECK(nframes == nlist_tensor.shape()[0],
+           "number of samples should match");
+  PD_CHECK(nloc * ndescrpt * 3 == in_deriv_tensor.shape()[1],
+           "number of descriptors should match");
+  PD_CHECK((nloc * nnei * 3) == rij_tensor.shape()[1],
+           "dim of rij should be nnei * 3");
+
+  // Outputs share dtype and place with net_deriv.
+  std::vector<int64_t> virial_shape{nframes, 9};
+  std::vector<int64_t> atom_virial_shape{nframes, 9 * nall};
+  paddle::Tensor virial_tensor = paddle::empty(
+      virial_shape, net_deriv_tensor.dtype(), net_deriv_tensor.place());
+  paddle::Tensor atom_virial_tensor = paddle::empty(
+      atom_virial_shape, net_deriv_tensor.dtype(), net_deriv_tensor.place());
+
+  // Instantiate the per-frame driver for the floating type of net_deriv.
+  PD_DISPATCH_FLOATING_TYPES(
+      net_deriv_tensor.type(), "prod_virial_se_a_cpu_forward_kernel", ([&] {
+        ProdVirialSeAOpForwardCPUKernel<data_t>(
+            nloc, nall, ndescrpt, nnei, nframes, virial_tensor.data<data_t>(),
+            atom_virial_tensor.data<data_t>(), net_deriv_tensor.data<data_t>(),
+            in_deriv_tensor.data<data_t>(), rij_tensor.data<data_t>(),
+            nlist_tensor.data<int>());
+      }));
+
+  return {virial_tensor, atom_virial_tensor};
+}
+
+// Forward declaration of the CUDA forward implementation of prod_virial_se_a
+// (defined in paddle_prod_virial.cu). That implementation requires
+// natoms_tensor to be a CPU tensor.
+std::vector<paddle::Tensor> ProdVirialSeAOpCUDAForward(
+    const paddle::Tensor& net_deriv_tensor,
+    const paddle::Tensor& in_deriv_tensor,
+    const paddle::Tensor& rij_tensor,
+    const paddle::Tensor& nlist_tensor,
+    const paddle::Tensor& natoms_tensor,
+    int n_a_sel,
+    int n_r_sel);
+
+// Forward entry point of the prod_virial_se_a custom op.
+//
+// Dispatches to the CUDA or the CPU implementation depending on where
+// net_deriv_tensor lives and returns {virial, atom_virial} from it.
+// Throws via PD_THROW when the tensor is on neither a GPU nor the CPU.
+//
+// Both implementations dereference natoms on the host immediately, so natoms
+// is brought to the CPU with a blocking copy (second argument = true); a
+// non-blocking copy gives no guarantee that the data has arrived when it is
+// read.
+std::vector<paddle::Tensor> ProdVirialSeAForward(
+    const paddle::Tensor& net_deriv_tensor,
+    const paddle::Tensor& in_deriv_tensor,
+    const paddle::Tensor& rij_tensor,
+    const paddle::Tensor& nlist_tensor,
+    const paddle::Tensor& natoms_tensor,
+    int n_a_sel,
+    int n_r_sel) {
+  if (net_deriv_tensor.is_gpu()) {
+    return ProdVirialSeAOpCUDAForward(
+        net_deriv_tensor, in_deriv_tensor, rij_tensor, nlist_tensor,
+        natoms_tensor.copy_to(paddle::CPUPlace(), true), n_a_sel, n_r_sel);
+  } else if (net_deriv_tensor.is_cpu()) {
+    return ProdVirialSeAOpCPUForward(
+        net_deriv_tensor, in_deriv_tensor, rij_tensor, nlist_tensor,
+        natoms_tensor.copy_to(paddle::CPUPlace(), true), n_a_sel, n_r_sel);
+  } else {
+    PD_THROW("Unsupported device type for ProdVirialSeAForward");
+  }
+}
+
+// Shape inference for prod_virial_se_a.
+//
+// Returns {virial shape, atom_virial shape} = {(nframes, 9), (nframes, 9 * nall)}
+// where nframes is the first dimension of net_deriv.
+// NOTE(review): only shapes are available here, not the contents of natoms,
+// so nall (natoms[1]) is hard-coded to 192. The inferred atom_virial shape is
+// therefore only right for systems with 192 atoms.
+std::vector<std::vector<int64_t>> ProdVirialSeAInferShape(
+    std::vector<int64_t> net_deriv_shape,
+    std::vector<int64_t> in_deriv_shape,
+    std::vector<int64_t> rij_shape,
+    std::vector<int64_t> nlist_shape,
+    std::vector<int64_t> natoms_shape,
+    const int& n_a_sel,
+    const int& n_r_sel) {
+  const int64_t assumed_nall = 192;  // stand-in for natoms[1]
+  const int64_t batch = net_deriv_shape[0];
+
+  std::vector<std::vector<int64_t>> out_shapes;
+  out_shapes.reserve(2);
+  out_shapes.push_back(std::vector<int64_t>{batch, 9});
+  out_shapes.push_back(std::vector<int64_t>{batch, 9 * assumed_nall});
+  return out_shapes;
+}
+
+// Dtype inference for prod_virial_se_a: both outputs (virial, atom_virial)
+// take the dtype of net_deriv; the other input dtypes are ignored.
+std::vector<paddle::DataType> ProdVirialSeAInferDtype(
+    paddle::DataType net_deriv_dtype,
+    paddle::DataType in_deriv_dtype,
+    paddle::DataType rij_dtype,
+    paddle::DataType nlist_dtype,
+    paddle::DataType natoms_dtype) {
+  std::vector<paddle::DataType> out_dtypes(2, net_deriv_dtype);
+  return out_dtypes;
+}
+
+// Registers the prod_virial_se_a custom op: five tensor inputs, two integer
+// attributes, outputs "virial" and "atom_virial"; the kernel, shape inference
+// and dtype inference functions are the ones defined in this file.
+PD_BUILD_OP(prod_virial_se_a)
+    .Inputs({"net_deriv", "in_deriv", "rij", "nlist", "natoms"})
+    .Outputs({"virial", "atom_virial"})
+    .Attrs({"n_a_sel: int", "n_r_sel: int"})
+    .SetKernelFn(PD_KERNEL(ProdVirialSeAForward))
+    .SetInferShapeFn(PD_INFER_SHAPE(ProdVirialSeAInferShape))
+    .SetInferDtypeFn(PD_INFER_DTYPE(ProdVirialSeAInferDtype));
diff --git a/source/lib/paddle_src/paddle_prod_virial.cu b/source/lib/paddle_src/paddle_prod_virial.cu
new file mode 100644
index 0000000000..d64bcd7063
--- /dev/null
+++ b/source/lib/paddle_src/paddle_prod_virial.cu
@@ -0,0 +1,255 @@
+#include <cub/block/block_load.cuh>
+#include <cub/block/block_radix_sort.cuh>
+#include <cub/block/block_store.cuh>
+
+#include "device.h"
+#include "gpu_cuda.h"
+#include "paddle/extension.h"
+#include "prod_virial.h"
+
+// PD_CHECK guard: tensor x must be on a GPU.
+#define CHECK_INPUT(x) PD_CHECK(x.is_gpu(), #x " must be a GPU Tensor.")
+// PD_CHECK guard: tensor x must be on the CPU.
+#define CHECK_INPUT_ON_CPU(x) PD_CHECK(x.is_cpu(), #x " must be a CPU Tensor.")
+// PD_CHECK guard: tensor x must have exactly `value` dimensions.
+#define CHECK_INPUT_DIM(x, value) \
+  PD_CHECK(x.shape().size() == value, #x "'s dim should be " #value ".")
+// PD_CHECK guard: tensor x must be initialized.
+#define CHECK_INPUT_READY(x) \
+  PD_CHECK(x.initialized(), #x " must be initialized before usage.")
+
+// Kernel: sums the per-atom virial over atoms into the 9-component virial.
+//
+// Block `bid` handles component bid: each thread accumulates
+// atom_virial[ii * 9 + bid] for ii = tid, tid + THREADS_PER_BLOCK, ... < nall
+// into its shared-memory slot, then a halving reduction leaves the total in
+// data[0], which thread 0 stores to virial[bid].
+// NOTE(review): the reduction assumes blockDim.x == THREADS_PER_BLOCK and a
+// power-of-two block size; the launchers in this file pass TPB for both —
+// confirm TPB is a power of two.
+template <typename FPTYPE, int THREADS_PER_BLOCK>
+__global__ void atom_virial_reduction(FPTYPE* virial,
+                                      const FPTYPE* atom_virial,
+                                      const int nall) {
+  unsigned int bid = blockIdx.x;
+  unsigned int tid = threadIdx.x;
+  __shared__ FPTYPE data[THREADS_PER_BLOCK];
+  data[tid] = (FPTYPE)0.;
+  // Strided partial sum over atoms for component bid.
+  for (int ii = tid; ii < nall; ii += THREADS_PER_BLOCK) {
+    data[tid] += atom_virial[ii * 9 + bid];
+  }
+  __syncthreads();
+  // do reduction in shared memory
+  for (int ii = THREADS_PER_BLOCK >> 1; ii > 0; ii >>= 1) {
+    if (tid < ii) {
+      data[tid] += data[tid + ii];
+    }
+    __syncthreads();
+  }
+  // write result for this block to global memory
+  if (tid == 0) virial[bid] = data[0];
+}
+
+// Kernel: accumulates per-atom virial contributions with 4 descriptor
+// components per neighbor slot.
+//
+// For atom idx, neighbor slot idy and virial component idz it sums over the
+// 4 components idw:
+//   net_deriv[idx][idy][idw] * rij[idx][idy][idz % 3] *
+//   in_deriv[idx][idy][idw][idz / 3]
+// and atomically adds the sum to atom_virial[j][idz], where j is the nlist
+// entry of that slot. Slots with a negative nlist entry are skipped.
+// The `virial` and `nloc` parameters are not read in this kernel.
+// idz is not bounds-checked; the launcher in this file uses blockDim.y == 9.
+template <typename FPTYPE>
+__global__ void virial_deriv_wrt_neighbors_a(FPTYPE* virial,
+                                             FPTYPE* atom_virial,
+                                             const FPTYPE* net_deriv,
+                                             const FPTYPE* in_deriv,
+                                             const FPTYPE* rij,
+                                             const int* nlist,
+                                             const int nloc,
+                                             const int nnei) {
+  // idx -> nloc
+  // idy -> nnei
+  // idz = dd0 * 3 + dd1
+  // dd0 = idz / 3
+  // dd1 = idz % 3
+  const int_64 idx = blockIdx.x;
+  const unsigned int idy = blockIdx.y * blockDim.x + threadIdx.x;
+  const unsigned int idz = threadIdx.y;
+  const int ndescrpt = nnei * 4;
+  if (idy >= nnei) {
+    return;
+  }
+  int j_idx = nlist[idx * nnei + idy];
+  if (j_idx < 0) {
+    return;
+  }
+  // Disabled earlier variant that accumulated straight into `virial`:
+  // atomicAdd(
+  //    virial + idz,
+  //    net_deriv[idx * ndescrpt + idy * 4 + idw] * rij[idx * nnei * 3 + idy * 3
+  //    + idz / 3] * in_deriv[idx * ndescrpt * 3 + (idy * 4 + idw) * 3 + idz %
+  //    3]);
+  FPTYPE virial_tmp = (FPTYPE)0.;
+  for (int idw = 0; idw < 4; ++idw) {
+    virial_tmp += net_deriv[idx * ndescrpt + idy * 4 + idw] *
+                  rij[idx * nnei * 3 + idy * 3 + idz % 3] *
+                  in_deriv[idx * ndescrpt * 3 + (idy * 4 + idw) * 3 + idz / 3];
+  }
+  // Several (idx, idy) pairs can share the same neighbor j, hence the atomic.
+  atomicAdd(atom_virial + j_idx * 9 + idz, virial_tmp);
+}
+
+// Kernel: accumulates per-atom virial contributions with 1 descriptor
+// component per neighbor slot.
+//
+// For atom idx, neighbor slot idy and virial component idz it atomically adds
+//   net_deriv[idx][idy] * rij[idx][idy][idz % 3] * in_deriv[idx][idy][idz / 3]
+// to atom_virial[j][idz], where j is the nlist entry of that slot. Slots with
+// a negative nlist entry are skipped.
+// The `virial` and `nloc` parameters are not read in this kernel.
+// idz is not bounds-checked; the launcher in this file uses blockDim.y == 9.
+template <typename FPTYPE>
+__global__ void virial_deriv_wrt_neighbors_r(FPTYPE* virial,
+                                             FPTYPE* atom_virial,
+                                             const FPTYPE* net_deriv,
+                                             const FPTYPE* in_deriv,
+                                             const FPTYPE* rij,
+                                             const int* nlist,
+                                             const int nloc,
+                                             const int nnei) {
+  // idx -> nloc
+  // idy -> nnei
+  // idz = dd0 * 3 + dd1
+  // dd0 = idz / 3
+  // dd1 = idz % 3
+  const int_64 idx = blockIdx.x;
+  const unsigned int idy = blockIdx.y * blockDim.x + threadIdx.x;
+  const unsigned int idz = threadIdx.y;
+  const int ndescrpt = nnei * 1;
+
+  if (idy >= nnei) {
+    return;
+  }
+  int j_idx = nlist[idx * nnei + idy];
+  if (j_idx < 0) {
+    return;
+  }
+  // Disabled earlier variant that accumulated straight into `virial`:
+  // atomicAdd(
+  //    virial + idz,
+  //    net_deriv[idx * ndescrpt + idy * 4 + idw] * rij[idx * nnei * 3 + idy * 3
+  //    + idz / 3] * in_deriv[idx * ndescrpt * 3 + (idy * 4 + idw) * 3 + idz %
+  //    3]);
+  // Several (idx, idy) pairs can share the same neighbor j, hence the atomic.
+  atomicAdd(atom_virial + j_idx * 9 + idz,
+            net_deriv[idx * ndescrpt + idy] *
+                rij[idx * nnei * 3 + idy * 3 + idz % 3] *
+                in_deriv[idx * ndescrpt * 3 + idy * 3 + idz / 3]);
+}
+
+namespace deepmd {
+// Host launcher: computes virial (9 values) and atom_virial (9 * nall
+// values) for one frame with 4 descriptor components per neighbor slot.
+//
+// Steps: zero both outputs with cudaMemset, accumulate atom_virial with
+// virial_deriv_wrt_neighbors_a, then reduce atom_virial over atoms into
+// virial with atom_virial_reduction. Each launch is followed by an error
+// check and a device-wide synchronize.
+template <typename FPTYPE>
+void prod_virial_a_gpu_cuda(FPTYPE* virial,
+                            FPTYPE* atom_virial,
+                            const FPTYPE* net_deriv,
+                            const FPTYPE* in_deriv,
+                            const FPTYPE* rij,
+                            const int* nlist,
+                            const int nloc,
+                            const int nall,
+                            const int nnei) {
+  DPErrcheck(cudaMemset(virial, 0, sizeof(FPTYPE) * 9));
+  DPErrcheck(cudaMemset(atom_virial, 0, sizeof(FPTYPE) * 9 * nall));
+
+  // Grid: one block row per atom, nblock blocks of LEN neighbor slots each.
+  // Block: LEN neighbor slots x 9 virial components.
+  const int LEN = 16;
+  int nblock = (nnei + LEN - 1) / LEN;
+  dim3 block_grid(nloc, nblock);
+  dim3 thread_grid(LEN, 9);
+  // compute virial of a frame
+  virial_deriv_wrt_neighbors_a<<<block_grid, thread_grid>>>(
+      virial, atom_virial, net_deriv, in_deriv, rij, nlist, nloc, nnei);
+  DPErrcheck(cudaGetLastError());
+  DPErrcheck(cudaDeviceSynchronize());
+  // reduction atom_virial to virial
+  atom_virial_reduction<FPTYPE, TPB><<<9, TPB>>>(virial, atom_virial, nall);
+  DPErrcheck(cudaGetLastError());
+  DPErrcheck(cudaDeviceSynchronize());
+}
+
+// Host launcher: same sequence as prod_virial_a_gpu_cuda, but uses the
+// virial_deriv_wrt_neighbors_r kernel (1 descriptor component per neighbor
+// slot) for the accumulation step.
+template <typename FPTYPE>
+void prod_virial_r_gpu_cuda(FPTYPE* virial,
+                            FPTYPE* atom_virial,
+                            const FPTYPE* net_deriv,
+                            const FPTYPE* in_deriv,
+                            const FPTYPE* rij,
+                            const int* nlist,
+                            const int nloc,
+                            const int nall,
+                            const int nnei) {
+  DPErrcheck(cudaMemset(virial, 0, sizeof(FPTYPE) * 9));
+  DPErrcheck(cudaMemset(atom_virial, 0, sizeof(FPTYPE) * 9 * nall));
+
+  // Grid: one block row per atom, nblock blocks of LEN neighbor slots each.
+  // Block: LEN neighbor slots x 9 virial components.
+  const int LEN = 16;
+  int nblock = (nnei + LEN - 1) / LEN;
+  dim3 block_grid(nloc, nblock);
+  dim3 thread_grid(LEN, 9);
+  // compute virial of a frame
+  virial_deriv_wrt_neighbors_r<<<block_grid, thread_grid>>>(
+      virial, atom_virial, net_deriv, in_deriv, rij, nlist, nloc, nnei);
+  DPErrcheck(cudaGetLastError());
+  DPErrcheck(cudaDeviceSynchronize());
+  // reduction atom_virial to virial
+  atom_virial_reduction<FPTYPE, TPB><<<9, TPB>>>(virial, atom_virial, nall);
+  DPErrcheck(cudaGetLastError());
+  DPErrcheck(cudaDeviceSynchronize());
+}
+}  // namespace deepmd
+
+// Per-frame driver for the CUDA forward pass of prod_virial_se_a.
+//
+// The batch buffers are flattened frame after frame; for every frame the
+// matching slices are handed to deepmd::prod_virial_a_gpu_cuda.
+// Per-frame slice sizes: virial 9, atom_virial nall * 9,
+// net_deriv nloc * ndescrpt, in_deriv nloc * ndescrpt * 3,
+// rij nloc * nnei * 3, nlist nloc * nnei.
+template <typename data_t>
+void ProdVirialSeAOpForwardCUDAKernel(int nloc,
+                                      int nall,
+                                      int ndescrpt,
+                                      int nnei,
+                                      int nframes,
+                                      data_t* p_virial,
+                                      data_t* p_atom_virial,
+                                      const data_t* p_net_deriv,
+                                      const data_t* p_in_deriv,
+                                      const data_t* p_rij,
+                                      const int* p_nlist) {
+  // A 64-bit frame index (as used by the kernels in this file) makes every
+  // offset below evaluate in 64-bit arithmetic, so large
+  // nframes * nloc * ndescrpt * 3 cannot overflow int.
+  for (int_64 kk = 0; kk < nframes; ++kk) {
+    data_t* virial = p_virial + kk * 9;
+    data_t* atom_virial = p_atom_virial + kk * nall * 9;
+    const data_t* net_deriv = p_net_deriv + kk * nloc * ndescrpt;
+    const data_t* in_deriv = p_in_deriv + kk * nloc * ndescrpt * 3;
+    const data_t* rij = p_rij + kk * nloc * nnei * 3;
+    const int* nlist = p_nlist + kk * nloc * nnei;
+    deepmd::prod_virial_a_gpu_cuda(virial, atom_virial, net_deriv, in_deriv,
+                                   rij, nlist, nloc, nall, nnei);
+  }
+}
+
+// CUDA implementation of the prod_virial_se_a forward pass.
+//
+// net_deriv_tensor: GPU, (nframes, nloc * ndescrpt).
+// in_deriv_tensor:  GPU, (nframes, nloc * ndescrpt * 3).
+// rij_tensor:       GPU, (nframes, nloc * nnei * 3).
+// nlist_tensor:     GPU, 2-D with nframes rows, int.
+// natoms_tensor:    CPU, 1-D int with at least 3 entries; entries 0 and 1
+//                   are read as nloc and nall.
+// n_a_sel, n_r_sel: accepted but not used here.
+// Returns {virial (nframes, 9), atom_virial (nframes, 9 * nall)}, allocated
+// with the dtype and place of net_deriv_tensor.
+std::vector<paddle::Tensor> ProdVirialSeAOpCUDAForward(
+    const paddle::Tensor& net_deriv_tensor,
+    const paddle::Tensor& in_deriv_tensor,
+    const paddle::Tensor& rij_tensor,
+    const paddle::Tensor& nlist_tensor,
+    const paddle::Tensor& natoms_tensor,
+    int n_a_sel,
+    int n_r_sel) {
+  CHECK_INPUT(net_deriv_tensor);
+  CHECK_INPUT(in_deriv_tensor);
+  CHECK_INPUT(rij_tensor);
+  CHECK_INPUT(nlist_tensor);
+  // natoms is dereferenced on the host below, so it has to arrive on CPU.
+  // TODO: for now the Python side must pass natoms on CPU; copying it from
+  // GPU with copy_to was observed to yield wrong data behind the returned
+  // pointer.
+  CHECK_INPUT_ON_CPU(natoms_tensor);
+
+  CHECK_INPUT_DIM(net_deriv_tensor, 2);
+  CHECK_INPUT_DIM(in_deriv_tensor, 2);
+  CHECK_INPUT_DIM(rij_tensor, 2);
+  CHECK_INPUT_DIM(nlist_tensor, 2);
+  CHECK_INPUT_DIM(natoms_tensor, 1);
+
+  PD_CHECK(natoms_tensor.shape()[0] >= 3,
+           "number of atoms should be larger than (or equal to) 3");
+  const int* natoms = natoms_tensor.data<int>();
+  int nloc = natoms[0];
+  int nall = natoms[1];
+  // nloc is used as a divisor right below.
+  PD_CHECK(nloc > 0, "number of local atoms should be positive");
+  int nnei = nlist_tensor.shape()[1] / nloc;
+  int nframes = net_deriv_tensor.shape()[0];
+  int ndescrpt = net_deriv_tensor.shape()[1] / nloc;
+  PD_CHECK(nframes == in_deriv_tensor.shape()[0],
+           "number of samples should match");
+  PD_CHECK(nframes == rij_tensor.shape()[0], "number of samples should match");
+  PD_CHECK(nframes == nlist_tensor.shape()[0],
+           "number of samples should match");
+  PD_CHECK(nloc * ndescrpt * 3 == in_deriv_tensor.shape()[1],
+           "number of descriptors should match");
+  PD_CHECK((nloc * nnei * 3) == rij_tensor.shape()[1],
+           "dim of rij should be nnei * 3");
+
+  // Allocate the outputs the same way the other ops in this library do:
+  // paddle::empty with the dtype and place of the input, instead of the
+  // legacy Tensor(PlaceType, shape) + mutable_data<T>() path. This also keeps
+  // the outputs on the same device as net_deriv.
+  std::vector<int64_t> virial_shape{nframes, 9};
+  std::vector<int64_t> atom_virial_shape{nframes,
+                                         static_cast<int64_t>(9) * nall};
+  paddle::Tensor virial_tensor = paddle::empty(
+      virial_shape, net_deriv_tensor.dtype(), net_deriv_tensor.place());
+  paddle::Tensor atom_virial_tensor = paddle::empty(
+      atom_virial_shape, net_deriv_tensor.dtype(), net_deriv_tensor.place());
+
+  PD_DISPATCH_FLOATING_TYPES(
+      net_deriv_tensor.type(), "prod_virial_se_a_cuda_forward_kernel", ([&] {
+        ProdVirialSeAOpForwardCUDAKernel<data_t>(
+            nloc, nall, ndescrpt, nnei, nframes, virial_tensor.data<data_t>(),
+            atom_virial_tensor.data<data_t>(), net_deriv_tensor.data<data_t>(),
+            in_deriv_tensor.data<data_t>(), rij_tensor.data<data_t>(),
+            nlist_tensor.data<int>());
+      }));
+
+  return {virial_tensor, atom_virial_tensor};
+}
diff --git a/source/lib/paddle_src/paddle_prod_virial_grad.cc b/source/lib/paddle_src/paddle_prod_virial_grad.cc
new file mode 100644
index 0000000000..39b580bf89
--- /dev/null
+++ b/source/lib/paddle_src/paddle_prod_virial_grad.cc
@@ -0,0 +1,148 @@
+#include "paddle/extension.h"
+#include "prod_virial_grad.h"
+
+// PD_CHECK guard: tensor x must be on the CPU. (The message now matches the
+// is_cpu() condition; it used to say "GPU Tensor".)
+#define CHECK_INPUT(x) PD_CHECK(x.is_cpu(), #x " must be a CPU Tensor.")
+// PD_CHECK guard: tensor x must have exactly `value` dimensions.
+#define CHECK_INPUT_DIM(x, value) \
+  PD_CHECK(x.shape().size() == value, #x "'s dim should be " #value ".")
+// PD_CHECK guard: tensor x must be initialized.
+#define CHECK_INPUT_READY(x) \
+  PD_CHECK(x.initialized(), #x " must be initialized before usage.")
+
+// Per-frame driver for the CPU backward pass of prod_virial_se_a.
+//
+// The batch buffers are flattened frame after frame; for every frame the
+// matching slices are handed to deepmd::prod_virial_grad_a_cpu. Frames are
+// processed one after another.
+// Per-frame slice sizes: grad_net nloc * ndescrpt, grad 9,
+// in_deriv nloc * ndescrpt * 3, rij nloc * nnei * 3, nlist nloc * nnei.
+// p_net_deriv is accepted but not read.
+template <typename data_t>
+void ProdVirialSeAOpCPUBackwardKernel(int nloc,
+                                      int nframes,
+                                      int ndescrpt,
+                                      int nnei,
+                                      const data_t* p_grad,
+                                      const data_t* p_net_deriv,
+                                      const data_t* p_in_deriv,
+                                      const data_t* p_rij,
+                                      const int* p_nlist,
+                                      data_t* p_grad_net) {
+  // A 64-bit frame index makes every offset below evaluate in 64-bit
+  // arithmetic, so large nframes * nloc * ndescrpt * 3 cannot overflow int.
+  for (int64_t kk = 0; kk < nframes; ++kk) {
+    data_t* grad_net = p_grad_net + kk * nloc * ndescrpt;
+    const data_t* grad = p_grad + kk * 9;
+    const data_t* in_deriv = p_in_deriv + kk * nloc * ndescrpt * 3;
+    const data_t* rij = p_rij + kk * nloc * nnei * 3;
+    const int* nlist = p_nlist + kk * nloc * nnei;
+    deepmd::prod_virial_grad_a_cpu(grad_net, grad, in_deriv, rij, nlist, nloc,
+                                   nnei);
+  }
+}
+
+// CPU implementation of the prod_virial_se_a backward pass.
+//
+// grad_tensor:      (nframes, 9), gradient w.r.t. virial.
+// net_deriv_tensor: (nframes, nloc * ndescrpt).
+// in_deriv_tensor:  (nframes, nloc * ndescrpt * 3).
+// rij_tensor:       (nframes, nloc * nnei * 3).
+// nlist_tensor:     (nframes, nloc * nnei), int.
+// natoms_tensor:    1-D int with at least 3 entries; entry 0 is read as nloc.
+// n_a_sel, n_r_sel: must add up to nnei.
+// Returns {grad_net} with shape (nframes, nloc * ndescrpt), allocated with
+// the dtype and place of grad_tensor.
+std::vector<paddle::Tensor> ProdVirialSeAOpCPUBackward(
+    const paddle::Tensor& grad_tensor,
+    const paddle::Tensor& net_deriv_tensor,
+    const paddle::Tensor& in_deriv_tensor,
+    const paddle::Tensor& rij_tensor,
+    const paddle::Tensor& nlist_tensor,
+    const paddle::Tensor& natoms_tensor,
+    int n_a_sel,
+    int n_r_sel) {
+  CHECK_INPUT_READY(grad_tensor);
+  CHECK_INPUT_READY(net_deriv_tensor);
+  CHECK_INPUT_READY(in_deriv_tensor);
+  CHECK_INPUT_READY(rij_tensor);
+  CHECK_INPUT_READY(nlist_tensor);
+  CHECK_INPUT_READY(natoms_tensor);
+
+  auto grad_shape = grad_tensor.shape();
+  auto net_deriv_shape = net_deriv_tensor.shape();
+  auto in_deriv_shape = in_deriv_tensor.shape();
+  auto rij_shape = rij_tensor.shape();
+  auto nlist_shape = nlist_tensor.shape();
+
+  CHECK_INPUT_DIM(grad_tensor, 2);
+  CHECK_INPUT_DIM(net_deriv_tensor, 2);
+  CHECK_INPUT_DIM(in_deriv_tensor, 2);
+  CHECK_INPUT_DIM(rij_tensor, 2);
+  CHECK_INPUT_DIM(nlist_tensor, 2);
+  CHECK_INPUT_DIM(natoms_tensor, 1);
+
+  PD_CHECK(natoms_tensor.shape()[0] >= 3,
+           "number of atoms should be larger than (or equal to) 3");
+
+  const int* natoms = natoms_tensor.data<int>();
+
+  int nframes = net_deriv_shape[0];
+  int nloc = natoms[0];
+  // nloc is used as a divisor right below.
+  PD_CHECK(nloc > 0, "number of local atoms should be positive");
+  int ndescrpt = net_deriv_shape[1] / nloc;
+  int nnei = nlist_shape[1] / nloc;
+
+  PD_CHECK(nframes == grad_shape[0], "number of frames should match");
+  PD_CHECK(nframes == in_deriv_shape[0], "number of samples should match");
+  PD_CHECK(nframes == rij_shape[0], "number of frames should match");
+  PD_CHECK(nframes == nlist_shape[0], "number of samples should match");
+  // The incoming gradient is w.r.t. the 9-component virial.
+  PD_CHECK(9 == grad_shape[1], "input grad shape should be 9");
+  PD_CHECK(nloc * ndescrpt * 3 == in_deriv_shape[1],
+           "number of descriptors should match");
+  PD_CHECK(nloc * nnei * 3 == rij_shape[1], "dim of rij should be  nnei * 3");
+  PD_CHECK(nnei == (n_a_sel + n_r_sel), "number of neighbors should match");
+
+  std::vector<int64_t> grad_net_shape{nframes, (int64_t)nloc * ndescrpt};
+  paddle::Tensor grad_net_tensor =
+      paddle::empty(grad_net_shape, grad_tensor.dtype(), grad_tensor.place());
+
+  // The dispatch label names this op (virial), not the force op.
+  PD_DISPATCH_FLOATING_TYPES(
+      grad_tensor.type(), "prod_virial_se_a_cpu_backward_kernel", ([&] {
+        ProdVirialSeAOpCPUBackwardKernel<data_t>(
+            nloc, nframes, ndescrpt, nnei, grad_tensor.data<data_t>(),
+            net_deriv_tensor.data<data_t>(), in_deriv_tensor.data<data_t>(),
+            rij_tensor.data<data_t>(), nlist_tensor.data<int>(),
+            grad_net_tensor.data<data_t>());
+      }));
+  return {grad_net_tensor};
+}
+
+// Forward declaration of the CUDA backward implementation of
+// prod_virial_se_a; the definition is not in this file.
+// NOTE(review): presumably defined in paddle_prod_virial_grad.cu — confirm.
+std::vector<paddle::Tensor> ProdVirialSeAOpCUDABackward(
+    const paddle::Tensor& virial_grad_tensor,
+    const paddle::Tensor& net_deriv_tensor,
+    const paddle::Tensor& in_deriv_tensor,
+    const paddle::Tensor& rij_tensor,
+    const paddle::Tensor& nlist_tensor,
+    const paddle::Tensor& natoms_tensor,
+    int n_a_sel,
+    int n_r_sel);
+
+// Backward entry point of the prod_virial_se_a custom op.
+//
+// Dispatches to the CUDA or the CPU implementation depending on where
+// virial_grad_tensor lives and returns that implementation's result
+// (a single-element vector holding the gradient w.r.t. net_deriv).
+// Throws via PD_THROW when the tensor is on neither a GPU nor the CPU.
+std::vector<paddle::Tensor> ProdVirialSeABackward(
+    const paddle::Tensor& virial_grad_tensor,
+    const paddle::Tensor& net_deriv_tensor,
+    const paddle::Tensor& in_deriv_tensor,
+    const paddle::Tensor& rij_tensor,
+    const paddle::Tensor& nlist_tensor,
+    const paddle::Tensor& natoms_tensor,
+    int n_a_sel,
+    int n_r_sel) {
+  if (virial_grad_tensor.is_gpu()) {
+    // natoms is handed over as a CPU copy so it can be read on the host; the
+    // copy is blocking (second argument = true) so the data is complete
+    // before it is read.
+    return ProdVirialSeAOpCUDABackward(
+        virial_grad_tensor, net_deriv_tensor, in_deriv_tensor, rij_tensor,
+        nlist_tensor, natoms_tensor.copy_to(paddle::CPUPlace(), true), n_a_sel,
+        n_r_sel);
+  } else if (virial_grad_tensor.is_cpu()) {
+    return ProdVirialSeAOpCPUBackward(virial_grad_tensor, net_deriv_tensor,
+                                      in_deriv_tensor, rij_tensor, nlist_tensor,
+                                      natoms_tensor, n_a_sel, n_r_sel);
+  } else {
+    PD_THROW("Unsupported device type for ProdVirialSeABackward");
+  }
+}
+
+// Registers the gradient op of prod_virial_se_a: it takes the gradient
+// w.r.t. "virial" together with the forward inputs, and produces the
+// gradient w.r.t. "net_deriv" by calling ProdVirialSeABackward.
+PD_BUILD_GRAD_OP(prod_virial_se_a)
+    .Inputs({paddle::Grad("virial"), "net_deriv", "in_deriv", "rij", "nlist",
+             "natoms"})
+    .Outputs({paddle::Grad("net_deriv")})
+    .Attrs({"n_a_sel: int", "n_r_sel: int"})
+    .SetKernelFn(PD_KERNEL(ProdVirialSeABackward));
diff --git a/source/lib/paddle_src/paddle_prod_virial_grad.cu b/source/lib/paddle_src/paddle_prod_virial_grad.cu
new file mode 100644
index 0000000000..be0602e30e
--- /dev/null
+++ b/source/lib/paddle_src/paddle_prod_virial_grad.cu
@@ -0,0 +1,191 @@
+#include <cub/block/block_load.cuh>
+#include <cub/block/block_radix_sort.cuh>
+#include <cub/block/block_store.cuh>
+
+#include "device.h"
+#include "gpu_cuda.h"
+#include "paddle/extension.h"
+#include "prod_virial.h"
+
+#define CHECK_INPUT(x) PD_CHECK(x.is_gpu(), #x " must be a GPU Tensor.")
+#define CHECK_INPUT_CPU(x) PD_CHECK(x.is_cpu(), #x " must be a CPU Tensor.")
+#define CHECK_INPUT_DIM(x, value) \
+  PD_CHECK(x.shape().size() == value, #x "'s dim should be " #value ".")
+#define CHECK_INPUT_READY(x) \
+  PD_CHECK(x.initialized(), #x " must be initialized before usage.")
+
+// Dot product of two 9-element arrays, accumulated in index order 0..8.
+template <typename FPTYPE>
+__device__ inline FPTYPE dev_dot9(const FPTYPE* arr1, const FPTYPE* arr2) {
+  FPTYPE acc = (FPTYPE)0.0;
+  const FPTYPE* const stop = arr1 + 9;
+  for (; arr1 != stop; ++arr1, ++arr2) acc += (*arr1) * (*arr2);
+  return acc;
+}
+
+// Kernel with one thread per (atom, neighbor slot, component 0..3), where
+// atom = blockIdx.x * blockDim.x + threadIdx.x, slot = blockIdx.y and
+// component = threadIdx.y. Each thread multiplies 3 env_deriv values with
+// 3 rij values pairwise (9 products), dots them with grad[0..8] and adds the
+// result to grad_net[atom * ndescrpt + slot * 4 + component].
+template <typename FPTYPE>
+__global__ void virial_grad_wrt_neighbors_a(FPTYPE* grad_net,
+                                            const FPTYPE* grad,
+                                            const FPTYPE* env_deriv,
+                                            const FPTYPE* rij,
+                                            const int* nlist,
+                                            const int nloc,
+                                            const int nnei) {
+  const unsigned int lane = threadIdx.x;
+  const int_64 atom = blockIdx.x * blockDim.x + lane;
+  const unsigned int slot = blockIdx.y;
+  const unsigned int comp = threadIdx.y;
+  const int ndescrpt = nnei * 4;
+  __shared__ FPTYPE grad_cache[9];  // block-wide copy of grad[0..8]
+  if (lane < 9) {
+    grad_cache[lane] = grad[lane];
+  }
+  __syncthreads();  // barrier sits before the early exits below
+  // Skip threads past the last atom and slots holding a negative index.
+  if (atom >= nloc || nlist[atom * nnei + slot] < 0) {
+    return;
+  }
+  const FPTYPE* rij_row = rij + (atom * nnei * 3 + slot * 3);
+  const FPTYPE* env_row =
+      env_deriv + (atom * ndescrpt * 3 + slot * 4 * 3 + comp * 3);
+  FPTYPE outer[9];  // outer[3 * a + b] = rij_row[b] * env_row[a]
+  for (int k = 0; k < 9; ++k) {
+    outer[k] = rij_row[k % 3] * env_row[k / 3];
+  }
+  grad_net[atom * ndescrpt + slot * 4 + comp] -=
+      (FPTYPE)-1.0 * dev_dot9(grad_cache, outer);
+}
+
+namespace deepmd {
+// Fills grad_net (nloc * nnei * 4 values) for one frame: zeroes it, launches
+// virial_grad_wrt_neighbors_a with 128 x 4 threads per block on a
+// ceil(nloc / 128) x nnei grid, then waits for the device to finish.
+// Failures of the CUDA calls are handed to DPErrcheck.
+template <typename FPTYPE>
+void prod_virial_grad_a_gpu_cuda(FPTYPE* grad_net,
+                                 const FPTYPE* grad,
+                                 const FPTYPE* env_deriv,
+                                 const FPTYPE* rij,
+                                 const int* nlist,
+                                 const int nloc,
+                                 const int nnei) {
+  // A zero-sized grid is an invalid launch configuration, and with no atoms
+  // or no neighbor slots there is nothing to write, so bail out up front.
+  if (nloc <= 0 || nnei <= 0) {
+    return;
+  }
+  const int ndescrpt = nnei * 4;
+  DPErrcheck(cudaMemset(grad_net, 0, sizeof(FPTYPE) * nloc * ndescrpt));
+  const int LEN = 128;
+  const int nblock = (nloc + LEN - 1) / LEN;
+  dim3 block_grid(nblock, nnei);
+  dim3 thread_grid(LEN, 4);
+  virial_grad_wrt_neighbors_a<<<block_grid, thread_grid>>>(
+      grad_net, grad, env_deriv, rij, nlist, nloc, nnei);
+  DPErrcheck(cudaGetLastError());
+  DPErrcheck(cudaDeviceSynchronize());
+}
+
+// Explicit instantiations for float and double.
+template void prod_virial_grad_a_gpu_cuda<float>(
+    float*, const float*, const float*, const float*, const int*, int, int);
+template void prod_virial_grad_a_gpu_cuda<double>(
+    double*, const double*, const double*, const double*, const int*, int, int);
+}  // namespace deepmd
+
+// Host-side loop over frames for the virial backward pass (the "Force" in the
+// name notwithstanding, it drives deepmd::prod_virial_grad_a_gpu_cuda).
+// Frame kk uses each buffer advanced by kk * stride, with strides
+// grad_net: nloc * ndescrpt, virial_grad: 9, in_deriv: nloc * ndescrpt * 3,
+// rij: nloc * nnei * 3, nlist: nloc * nnei. net_deriv is not read.
+template <typename data_t>
+void ProdForceSeAOpGPUBackwardKernel(int nloc,
+                                     int nframes,
+                                     int ndescrpt,
+                                     int nnei,
+                                     const data_t* virial_grad,
+                                     const data_t* net_deriv,
+                                     const data_t* in_deriv,
+                                     const data_t* rij,
+                                     const int* nlist,
+                                     data_t* grad_net) {
+  for (int_64 frame = 0; frame < nframes; ++frame) {
+    deepmd::prod_virial_grad_a_gpu_cuda(grad_net + frame * nloc * ndescrpt,
+                                        virial_grad + frame * 9,
+                                        in_deriv + frame * nloc * ndescrpt * 3,
+                                        rij + frame * nloc * nnei * 3,
+                                        nlist + frame * nloc * nnei,
+                                        nloc,
+                                        nnei);
+  }
+}
+
+// CUDA backward of prod_virial_se_a. Validates the inputs, then returns
+// {grad_net} of shape [nframes, nloc * ndescrpt] with virial_grad's dtype and
+// place. natoms_tensor must be a CPU tensor; nloc is taken from natoms[0].
+std::vector<paddle::Tensor> ProdVirialSeAOpCUDABackward(
+    const paddle::Tensor& virial_grad_tensor,
+    const paddle::Tensor& net_deriv_tensor,
+    const paddle::Tensor& in_deriv_tensor,
+    const paddle::Tensor& rij_tensor,
+    const paddle::Tensor& nlist_tensor,
+    const paddle::Tensor& natoms_tensor,
+    int n_a_sel,
+    int n_r_sel) {
+  CHECK_INPUT_READY(virial_grad_tensor);
+  CHECK_INPUT_READY(net_deriv_tensor);
+  CHECK_INPUT_READY(in_deriv_tensor);
+  CHECK_INPUT_READY(rij_tensor);
+  CHECK_INPUT_READY(nlist_tensor);
+  CHECK_INPUT_READY(natoms_tensor);
+
+  auto grad_shape = virial_grad_tensor.shape();
+  auto net_deriv_shape = net_deriv_tensor.shape();
+  auto in_deriv_shape = in_deriv_tensor.shape();
+  auto rij_shape = rij_tensor.shape();
+  auto nlist_shape = nlist_tensor.shape();
+  auto natoms_shape = natoms_tensor.shape();
+
+  CHECK_INPUT_DIM(virial_grad_tensor, 2);
+  CHECK_INPUT_DIM(net_deriv_tensor, 2);
+  CHECK_INPUT_DIM(in_deriv_tensor, 2);
+  CHECK_INPUT_DIM(rij_tensor, 2);
+  CHECK_INPUT_DIM(nlist_tensor, 2);
+  CHECK_INPUT_DIM(natoms_tensor, 1);
+  PD_CHECK(natoms_shape[0] >= 3,
+           "number of atoms should be larger than (or equal to) 3");
+  CHECK_INPUT_CPU(natoms_tensor);
+  const int* natoms = natoms_tensor.data<int>();
+  int nframes = net_deriv_shape[0];
+  int nloc = natoms[0];
+  // nloc is a divisor below; reject nloc <= 0 instead of dividing by zero.
+  PD_CHECK(nloc > 0, "number of local atoms should be positive");
+  int ndescrpt = net_deriv_shape[1] / nloc;
+  int nnei = nlist_shape[1] / nloc;
+  PD_CHECK(nframes == grad_shape[0], "number of frames should match");
+  PD_CHECK(nframes == in_deriv_shape[0], "number of samples should match");
+  PD_CHECK(nframes == rij_shape[0], "number of frames should match");
+  PD_CHECK(nframes == nlist_shape[0], "number of samples should match");
+  PD_CHECK(9 == grad_shape[1], "virial grad should have 9 values per frame");
+  PD_CHECK(nloc * ndescrpt * 3 == in_deriv_shape[1],
+           "number of descriptors should match");
+  PD_CHECK(nloc * nnei * 3 == rij_shape[1], "dim of rij should be  nnei * 3");
+  PD_CHECK(nnei == (n_a_sel + n_r_sel), "number of neighbors should match");
+  std::vector<int64_t> grad_net_shape{nframes, nloc * ndescrpt};
+  paddle::Tensor grad_net_tensor = paddle::empty(
+      grad_net_shape, virial_grad_tensor.dtype(), virial_grad_tensor.place());
+  PD_DISPATCH_FLOATING_TYPES(
+      virial_grad_tensor.type(), "prod_virial_se_a_cuda_backward_kernel", ([&] {
+        ProdForceSeAOpGPUBackwardKernel<data_t>(
+            nloc, nframes, ndescrpt, nnei, virial_grad_tensor.data<data_t>(),
+            net_deriv_tensor.data<data_t>(), in_deriv_tensor.data<data_t>(),
+            rij_tensor.data<data_t>(), nlist_tensor.data<int>(),
+            grad_net_tensor.data<data_t>());
+      }));
+  return {grad_net_tensor};
+}
diff --git a/source/lib/src/prod_env_mat.cc b/source/lib/src/prod_env_mat.cc
index af82a09c2e..4dc38dd4ef 100644
--- a/source/lib/src/prod_env_mat.cc
+++ b/source/lib/src/prod_env_mat.cc
@@ -269,7 +269,7 @@ void deepmd::env_mat_nbor_update(InputNlist &inlist,
   memcpy(&inlist.numneigh, 8 + mesh_host, sizeof(int *));
   memcpy(&inlist.firstneigh, 12 + mesh_host, sizeof(int **));
   const int ago = mesh_host[0];
-  if (ago == 0 || gpu_inlist.inum < inlist.inum) {
+  if (ago == 0 || gpu_inlist.inum < inlist.inum || !gpu_inlist.ilist) {
     const int inum = inlist.inum;
     if (gpu_inlist.inum < inum) {
       delete_device_memory(gpu_inlist.ilist);
@@ -279,6 +279,15 @@ void deepmd::env_mat_nbor_update(InputNlist &inlist,
       malloc_device_memory(gpu_inlist.numneigh, inum);
       malloc_device_memory(gpu_inlist.firstneigh, inum);
     }
+    if (!gpu_inlist.ilist) {
+      malloc_device_memory(gpu_inlist.ilist, inum);
+    }
+    if (!gpu_inlist.numneigh) {
+      malloc_device_memory(gpu_inlist.numneigh, inum);
+    }
+    if (!gpu_inlist.firstneigh) {
+      malloc_device_memory(gpu_inlist.firstneigh, inum);
+    }
     memcpy_host_to_device(gpu_inlist.ilist, inlist.ilist, inum);
     memcpy_host_to_device(gpu_inlist.numneigh, inlist.numneigh, inum);
     int _max_nbor_size = max_numneigh(inlist);
diff --git a/source/lmp/pair_deepmd.cpp b/source/lmp/pair_deepmd.cpp
index ccf95c0984..29a43422c0 100644
--- a/source/lmp/pair_deepmd.cpp
+++ b/source/lmp/pair_deepmd.cpp
@@ -833,7 +833,9 @@ void PairDeepMD::settings(int narg, char **arg) {
   numb_models = models.size();
   if (numb_models == 1) {
     try {
-      deep_pot.init(arg[0], get_node_rank(), get_file_content(arg[0]));
+      auto ptr = strstr(arg[0], ".pb");
+      deep_pot.init(arg[0], get_node_rank(),
+                    (ptr != NULL) ? get_file_content(arg[0]) : "");
     } catch (deepmd_compat::deepmd_exception &e) {
       error->one(FLERR, e.what());
     }