diff --git a/.github/labeler.yml b/.github/labeler.yml index 195d2cd217..b0a85679de 100644 --- a/.github/labeler.yml +++ b/.github/labeler.yml @@ -1,15 +1,39 @@ Python: -- deepmd/**/* -- deepmd_cli/**/* -- source/tests/**/* -Docs: doc/**/* -Examples: examples/**/* -Core: source/lib/**/* -CUDA: source/lib/src/gpu/**/* -ROCM: source/lib/src/gpu/**/* -OP: source/op/**/* -C++: source/api_cc/**/* -C: source/api_c/**/* -LAMMPS: source/lmp/**/* -Gromacs: source/gmx/**/* -i-Pi: source/ipi/**/* +- changed-files: + - any-glob-to-any-file: + - deepmd/**/* + - deepmd_utils/**/* + - source/tests/**/* +Docs: +- changed-files: + - any-glob-to-any-file: doc/**/* +Examples: +- changed-files: + - any-glob-to-any-file: examples/**/* +Core: +- changed-files: + - any-glob-to-any-file: source/lib/**/* +CUDA: +- changed-files: + - any-glob-to-any-file: source/lib/src/gpu/**/* +ROCM: +- changed-files: + - any-glob-to-any-file: source/lib/src/gpu/**/* +OP: +- changed-files: + - any-glob-to-any-file: source/op/**/* +C++: +- changed-files: + - any-glob-to-any-file: source/api_cc/**/* +C: +- changed-files: + - any-glob-to-any-file: source/api_c/**/* +LAMMPS: +- changed-files: + - any-glob-to-any-file: source/lmp/**/* +Gromacs: +- changed-files: + - any-glob-to-any-file: source/gmx/**/* +i-Pi: +- changed-files: + - any-glob-to-any-file: source/ipi/**/* diff --git a/.github/release.yml b/.github/release.yml new file mode 100644 index 0000000000..382e5db00e --- /dev/null +++ b/.github/release.yml @@ -0,0 +1,34 @@ +changelog: + exclude: + authors: + - app/pre-commit-ci + - app/dependabot + categories: + - title: Breaking Changes + labels: + - "breaking change" + - title: New Features + labels: + - "new feature" + - title: Enhancement + labels: + - enhancement + - title: Documentation + labels: + # automatically added + - Docs + # for docs outside the doc directory + - "other docs" + exclude: + labels: + - build + - bug + - title: Build and release + labels: + - build + - title: Bug fixings + labels: + - bug + - title: Other Changes + labels: + - "*" diff --git a/.github/workflows/build_cc.yml b/.github/workflows/build_cc.yml index 964a11ce37..f029517d80 100644 --- a/.github/workflows/build_cc.yml +++ b/.github/workflows/build_cc.yml @@ -21,7 +21,7 @@ jobs: dp_variant: clang steps: - uses: actions/checkout@v4 - - uses: actions/setup-python@v4 + - uses: actions/setup-python@v5 with: python-version: '3.11' cache: 'pip' @@ -37,7 +37,7 @@ jobs: wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.0-1_all.deb \ && sudo dpkg -i cuda-keyring_1.0-1_all.deb \ && sudo apt-get update \ - && sudo apt-get -y install cuda-cudart-dev-12-0 cuda-nvcc-12-0 + && sudo apt-get -y install cuda-cudart-dev-12-2 cuda-nvcc-12-2 if: matrix.variant == 'cuda120' env: DEBIAN_FRONTEND: noninteractive diff --git a/.github/workflows/build_wheel.yml b/.github/workflows/build_wheel.yml index 84c8ac4b74..23076e9bf5 100644 --- a/.github/workflows/build_wheel.yml +++ b/.github/workflows/build_wheel.yml @@ -33,6 +33,13 @@ jobs: python: 311 platform_id: manylinux_x86_64 dp_variant: cuda + cuda_version: 12.2 + - os: ubuntu-latest + python: 311 + platform_id: manylinux_x86_64 + dp_variant: cuda + cuda_version: 11.8 + dp_pkg_name: deepmd-kit-cu11 # macos-x86-64 - os: macos-latest python: 311 @@ -68,8 +75,11 @@ jobs: CIBW_ARCHS: all CIBW_BUILD: cp${{ matrix.python }}-${{ matrix.platform_id }} DP_VARIANT: ${{ matrix.dp_variant }} - - uses: actions/upload-artifact@v3 + CUDA_VERSION: ${{ matrix.cuda_version }} + DP_PKG_NAME: 
${{ matrix.dp_pkg_name }} + - uses: actions/upload-artifact@v4 with: + name: cibw-cp${{ matrix.python }}-${{ matrix.platform_id }}-cu${{ matrix.cuda_version }}-${{ strategy.job-index }} path: ./wheelhouse/*.whl build_sdist: name: Build source distribution @@ -78,7 +88,7 @@ jobs: - uses: actions/checkout@v4 with: fetch-depth: 0 - - uses: actions/setup-python@v4 + - uses: actions/setup-python@v5 name: Install Python with: python-version: '3.11' @@ -87,8 +97,9 @@ jobs: - name: Build sdist run: python -m build --sdist - - uses: actions/upload-artifact@v3 + - uses: actions/upload-artifact@v4 with: + name: cibw-sdist path: dist/*.tar.gz upload_pypi: @@ -99,22 +110,31 @@ jobs: id-token: write if: github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/v') steps: - - uses: actions/download-artifact@v3 + - uses: actions/download-artifact@v4 with: - name: artifact + pattern: cibw-* path: dist + merge-multiple: true - uses: pypa/gh-action-pypi-publish@release/v1 build_docker: # use the already built wheels to build docker needs: [build_wheels] runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + include: + - variant: "" + cuda_version: "12" + - variant: "_cu11" + cuda_version: "11" steps: - uses: actions/checkout@v4 - - uses: actions/download-artifact@v3 + - uses: actions/download-artifact@v4 with: - name: artifact path: source/install/docker/dist + merge-multiple: true - name: Log in to the Container registry uses: docker/login-action@343f7c4344506bcbf9b4de18042ae17996df046d with: @@ -124,27 +144,30 @@ jobs: - name: Extract metadata (tags, labels) for Docker id: meta - uses: docker/metadata-action@96383f45573cb7f253c731d3b3ab81c87ef81934 + uses: docker/metadata-action@dbef88086f6cef02e264edb7dbf63250c17cef6c with: images: ghcr.io/deepmodeling/deepmd-kit - name: Build and push Docker image - uses: docker/build-push-action@0565240e2d4ab88bba5387d719585280857ece09 + uses: docker/build-push-action@4a13e500e55cf31b7a5d59a38ab2040ab0f42f56 with: context: source/install/docker - push: ${{ github.repository_owner == 'deepmodeling' && github.event_name == 'push' }} - tags: ${{ steps.meta.outputs.tags }} + push: ${{ github.repository_owner == 'deepmodeling' && github.event_name == 'push' && github.actor != 'dependabot[bot]' }} + tags: ${{ steps.meta.outputs.tags }}${{ matrix.variant }} labels: ${{ steps.meta.outputs.labels }} + build-args: | + VARIANT=${{ matrix.variant }} + CUDA_VERSION=${{ matrix.cuda_version }} build_pypi_index: needs: [build_wheels, build_sdist] runs-on: ubuntu-latest steps: - - uses: actions/download-artifact@v3 + - uses: actions/download-artifact@v4 with: - name: artifact path: dist/packages - - uses: actions/setup-python@v4 + merge-multiple: true + - uses: actions/setup-python@v5 name: Install Python with: python-version: '3.11' @@ -153,7 +176,7 @@ jobs: ls dist/packages > package_list.txt dumb-pypi --output-dir dist --packages-url ../../packages --package-list package_list.txt --title "DeePMD-kit Developed Packages" - name: Upload Pages artifact - uses: actions/upload-pages-artifact@v2 + uses: actions/upload-pages-artifact@v3 with: path: dist deploy_pypi_index: @@ -169,11 +192,11 @@ jobs: steps: - name: Deploy to GitHub Pages id: deployment - uses: actions/deploy-pages@v2 + uses: actions/deploy-pages@v4 pass: name: Pass testing build wheels - needs: [build_wheels, build_sdist] + needs: [build_wheels, build_sdist, build_docker, build_pypi_index] runs-on: ubuntu-latest if: always() steps: diff --git a/.github/workflows/codeql.yml 
b/.github/workflows/codeql.yml new file mode 100644 index 0000000000..a9a162432c --- /dev/null +++ b/.github/workflows/codeql.yml @@ -0,0 +1,58 @@ +name: "CodeQL" + +on: + push: + pull_request: + schedule: + - cron: '45 2 * * 2' + +jobs: + analyze: + name: Analyze + runs-on: ${{ (matrix.language == 'swift' && 'macos-latest') || 'ubuntu-latest' }} + timeout-minutes: ${{ (matrix.language == 'swift' && 120) || 360 }} + permissions: + actions: read + contents: read + security-events: write + + strategy: + fail-fast: false + matrix: + language: [ 'c-cpp', 'javascript-typescript', 'python' ] + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: '3.11' + cache: 'pip' + if: matrix.language == 'c-cpp' + - name: "Setup dependencies" + if: matrix.language == 'c-cpp' + run: | + wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.0-1_all.deb \ + && sudo dpkg -i cuda-keyring_1.0-1_all.deb \ + && sudo apt-get update \ + && sudo apt-get -y install cuda-cudart-dev-12-2 cuda-nvcc-12-2 + python -m pip install tensorflow + env: + DEBIAN_FRONTEND: noninteractive + # Initializes the CodeQL tools for scanning. + - name: Initialize CodeQL + uses: github/codeql-action/init@v3 + with: + languages: ${{ matrix.language }} + queries: security-extended,security-and-quality + - name: "Run, Build Application using script" + run: source/install/build_cc.sh + env: + DP_VARIANT: cuda + DOWNLOAD_TENSORFLOW: "FALSE" + if: matrix.language == 'c-cpp' + + - name: Perform CodeQL Analysis + uses: github/codeql-action/analyze@v3 + with: + category: "/language:${{matrix.language}}" diff --git a/.github/workflows/labeler.yml b/.github/workflows/labeler.yml index 2c8ba30ba1..877c780f1f 100644 --- a/.github/workflows/labeler.yml +++ b/.github/workflows/labeler.yml @@ -9,6 +9,6 @@ jobs: pull-requests: write runs-on: ubuntu-latest steps: - - uses: actions/labeler@v4 + - uses: actions/labeler@v5 with: repo-token: "${{ secrets.GITHUB_TOKEN }}" diff --git a/.github/workflows/package_c.yml b/.github/workflows/package_c.yml index ada205be00..5594c79181 100644 --- a/.github/workflows/package_c.yml +++ b/.github/workflows/package_c.yml @@ -8,23 +8,37 @@ jobs: build_c: name: Build C library runs-on: ubuntu-22.04 + strategy: + matrix: + include: + - tensorflow_build_version: "2.15" + tensorflow_version: "" + filename: libdeepmd_c.tar.gz + - tensorflow_build_version: "2.14" + tensorflow_version: ">=2.5.0rc0,<2.15" + filename: libdeepmd_c_cu11.tar.gz steps: - uses: actions/checkout@v4 - name: Package C library run: ./source/install/docker_package_c.sh + env: + TENSORFLOW_VERSION: ${{ matrix.tensorflow_version }} + TENSORFLOW_BUILD_VERSION: ${{ matrix.tensorflow_build_version }} + - run: cp libdeepmd_c.tar.gz ${{ matrix.filename }} + if: matrix.filename != 'libdeepmd_c.tar.gz' # for download and debug - name: Upload artifact - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: - name: libdeepmd_c - path: ./libdeepmd_c.tar.gz + name: libdeepmd_c-${{ strategy.job-index }}-${{ matrix.filename }} + path: ${{ matrix.filename }} - name: Test C library run: ./source/install/docker_test_package_c.sh - name: Release uses: softprops/action-gh-release@v1 if: startsWith(github.ref, 'refs/tags/') with: - files: libdeepmd_c.tar.gz + files: ${{ matrix.filename }} test_c: name: Test building from C library needs: [build_c] @@ -32,9 +46,10 @@ jobs: steps: - uses: actions/checkout@v4 - name: Download artifact - uses: 
actions/download-artifact@v3 + uses: actions/download-artifact@v4 with: - name: libdeepmd_c + pattern: libdeepmd_c-* + merge-multiple: true - run: tar -vxzf ./libdeepmd_c.tar.gz - name: Test C library run: ./source/install/build_from_c.sh diff --git a/.github/workflows/test_cc.yml b/.github/workflows/test_cc.yml index a98afa7a94..ef6fade8e5 100644 --- a/.github/workflows/test_cc.yml +++ b/.github/workflows/test_cc.yml @@ -8,7 +8,7 @@ jobs: runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 - - uses: actions/setup-python@v4 + - uses: actions/setup-python@v5 with: python-version: '3.11' cache: 'pip' diff --git a/.github/workflows/test_cuda.yml b/.github/workflows/test_cuda.yml index ca72a32277..e74c0abde2 100644 --- a/.github/workflows/test_cuda.yml +++ b/.github/workflows/test_cuda.yml @@ -11,14 +11,14 @@ jobs: runs-on: nvidia # https://github.com/deepmodeling/deepmd-kit/pull/2884#issuecomment-1744216845 container: - image: nvidia/cuda:11.8.0-cudnn8-devel-ubuntu22.04 + image: nvidia/cuda:12.2.0-devel-ubuntu22.04 options: --gpus all if: github.repository_owner == 'deepmodeling' && github.event.label.name == 'Test CUDA' || github.event_name == 'workflow_dispatch' steps: - name: Make sudo and git work run: apt-get update && apt-get install -y sudo git - uses: actions/checkout@v4 - - uses: actions/setup-python@v4 + - uses: actions/setup-python@v5 with: python-version: '3.11' # cache: 'pip' @@ -31,18 +31,19 @@ jobs: wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.0-1_all.deb \ && sudo dpkg -i cuda-keyring_1.0-1_all.deb \ && sudo apt-get update \ - && sudo apt-get -y install cuda-11-8 libcudnn8=8.9.5.*-1+cuda11.8 + && sudo apt-get -y install cuda-12-2 libcudnn8=8.9.5.*-1+cuda12.2 if: false # skip as we use nvidia image - name: Set PyPI mirror for Aliyun cloud machine run: python -m pip config --user set global.index-url https://mirrors.aliyun.com/pypi/simple/ - run: python -m pip install -U "pip>=21.3.1,!=23.0.0" - - run: python -m pip install -v -e .[gpu,test,lmp,cu11] "ase @ https://gitlab.com/ase/ase/-/archive/8c5aa5fd6448c5cfb517a014dccf2b214a9dfa8f/ase-8c5aa5fd6448c5cfb517a014dccf2b214a9dfa8f.tar.gz" + - run: python -m pip install "tensorflow>=2.15.0rc0" + - run: python -m pip install -v -e .[gpu,test,lmp,cu12] "ase @ https://gitlab.com/ase/ase/-/archive/8c5aa5fd6448c5cfb517a014dccf2b214a9dfa8f/ase-8c5aa5fd6448c5cfb517a014dccf2b214a9dfa8f.tar.gz" env: DP_BUILD_TESTING: 1 DP_VARIANT: cuda - CUDA_PATH: /usr/local/cuda-11.8 + CUDA_PATH: /usr/local/cuda-12.2 - run: dp --version - - run: python -m pytest -s --cov=deepmd --cov=deepmd_cli source/tests --durations=0 + - run: python -m pytest -s --cov=deepmd --cov=deepmd_utils source/tests --durations=0 - run: source/install/test_cc_local.sh env: OMP_NUM_THREADS: 1 @@ -52,7 +53,7 @@ jobs: CMAKE_GENERATOR: Ninja DP_VARIANT: cuda DP_USE_MPICH2: 1 - CUDA_PATH: /usr/local/cuda-11.8 + CUDA_PATH: /usr/local/cuda-12.2 - run: | export LD_LIBRARY_PATH=$GITHUB_WORKSPACE/dp_test/lib:$CUDA_PATH/lib64:$LD_LIBRARY_PATH export PATH=$GITHUB_WORKSPACE/dp_test/bin:$PATH @@ -63,7 +64,7 @@ jobs: TF_INTRA_OP_PARALLELISM_THREADS: 1 TF_INTER_OP_PARALLELISM_THREADS: 1 LAMMPS_PLUGIN_PATH: ${{ github.workspace }}/dp_test/lib/deepmd_lmp - CUDA_PATH: /usr/local/cuda-11.8 + CUDA_PATH: /usr/local/cuda-12.2 - uses: codecov/codecov-action@v3 with: gcov: true diff --git a/.github/workflows/test_python.yml b/.github/workflows/test_python.yml index 0ac29a7d9b..1bd78bfae0 100644 --- a/.github/workflows/test_python.yml +++ 
b/.github/workflows/test_python.yml @@ -18,7 +18,7 @@ jobs: steps: - uses: actions/checkout@v4 - - uses: actions/setup-python@v4 + - uses: actions/setup-python@v5 with: python-version: ${{ matrix.python }} cache: 'pip' @@ -38,7 +38,7 @@ jobs: HOROVOD_WITH_TENSORFLOW: 1 HOROVOD_WITHOUT_GLOO: 1 - run: dp --version - - run: pytest --cov=deepmd --cov=deepmd_cli source/tests --durations=0 + - run: pytest --cov=deepmd --cov=deepmd_utils source/tests --durations=0 - uses: codecov/codecov-action@v3 with: gcov: true diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index e168af2c8d..d4e89f1129 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -23,23 +23,22 @@ repos: - id: check-toml # Python - repo: https://github.com/PyCQA/isort - rev: 5.12.0 + rev: 5.13.2 hooks: - id: isort files: \.py$ exclude: ^source/3rdparty - repo: https://github.com/astral-sh/ruff-pre-commit # Ruff version. - rev: v0.1.1 + rev: v0.1.13 hooks: - id: ruff args: ["--fix"] exclude: ^source/3rdparty -- repo: https://github.com/psf/black-pre-commit-mirror - rev: 23.10.0 - hooks: - - id: black-jupyter - exclude: ^source/3rdparty + types_or: [python, pyi, jupyter] + - id: ruff-format + exclude: ^source/3rdparty + types_or: [python, pyi, jupyter] # numpydoc - repo: https://github.com/Carreau/velin rev: 0.0.12 @@ -54,7 +53,7 @@ repos: - id: blacken-docs # C++ - repo: https://github.com/pre-commit/mirrors-clang-format - rev: v17.0.3 + rev: v17.0.6 hooks: - id: clang-format exclude: ^source/3rdparty|source/lib/src/gpu/cudart/.+\.inc @@ -65,7 +64,7 @@ repos: - id: csslint # Shell - repo: https://github.com/scop/pre-commit-shfmt - rev: v3.7.0-1 + rev: v3.7.0-4 hooks: - id: shfmt # CMake diff --git a/README.md b/README.md index 5914abe607..81fdead098 100644 --- a/README.md +++ b/README.md @@ -114,6 +114,7 @@ A full [document](doc/train/train-input-auto.rst) on options in the training inp - [Deep potential long-range](doc/model/dplr.md) - [Deep Potential - Range Correction (DPRc)](doc/model/dprc.md) - [Linear model](doc/model/linear.md) + - [Interpolation or combination with a pairwise potential](doc/model/pairtab.md) - [Training](doc/train/index.md) - [Training a model](doc/train/training.md) - [Advanced options](doc/train/training-advanced.md) @@ -134,8 +135,7 @@ A full [document](doc/train/train-input-auto.rst) on options in the training inp - [Node.js interface](doc/inference/nodejs.md) - [Integrate with third-party packages](doc/third-party/index.rst) - [Use deep potential with ASE](doc/third-party/ase.md) - - [Run MD with LAMMPS](doc/third-party/lammps.md) - - [LAMMPS commands](doc/third-party/lammps-command.md) + - [Run MD with LAMMPS](doc/third-party/lammps-command.md) - [Run path-integral MD with i-PI](doc/third-party/ipi.md) - [Run MD with GROMACS](doc/third-party/gromacs.md) - [Interfaces out of DeePMD-kit](doc/third-party/out-of-deepmd-kit.md) diff --git a/backend/dynamic_metadata.py b/backend/dynamic_metadata.py index 0502684f47..ab955c3cf8 100644 --- a/backend/dynamic_metadata.py +++ b/backend/dynamic_metadata.py @@ -27,7 +27,7 @@ def dynamic_metadata( _, _, find_libpython_requires, extra_scripts, tf_version = get_argument_from_env() if field == "scripts": return { - "dp": "deepmd_cli.main:main", + "dp": "deepmd_utils.main:main", **extra_scripts, } elif field == "optional-dependencies": @@ -44,7 +44,8 @@ def dynamic_metadata( "sphinx>=3.1.1", "sphinx_rtd_theme>=1.0.0rc1", "sphinx_markdown_tables", - "myst-nb", + "myst-nb>=1.0.0rc0", + "myst-parser>=0.19.2", "breathe", "exhale", "numpydoc", @@ 
-56,7 +57,7 @@ def dynamic_metadata( "sphinxcontrib-bibtex", ], "lmp": [ - "lammps~=2023.8.2.1.0", + "lammps~=2023.8.2.2.0", *find_libpython_requires, ], "ipi": [ diff --git a/backend/find_tensorflow.py b/backend/find_tensorflow.py index aa75d5ecb4..08a73f7252 100644 --- a/backend/find_tensorflow.py +++ b/backend/find_tensorflow.py @@ -87,6 +87,24 @@ def find_tensorflow() -> Tuple[Optional[str], List[str]]: # TypeError if submodule_search_locations are None # IndexError if submodule_search_locations is an empty list except (AttributeError, TypeError, IndexError): + if os.environ.get("CIBUILDWHEEL", "0") == "1": + cuda_version = os.environ.get("CUDA_VERSION", "12.2") + if cuda_version == "" or cuda_version in SpecifierSet(">=12,<13"): + # CUDA 12.2 + requires.extend( + [ + "tensorflow-cpu>=2.15.0rc0; platform_machine=='x86_64' and platform_system == 'Linux'", + ] + ) + elif cuda_version in SpecifierSet(">=11,<12"): + # CUDA 11.8 + requires.extend( + [ + "tensorflow-cpu>=2.5.0rc0,<2.15; platform_machine=='x86_64' and platform_system == 'Linux'", + ] + ) + else: + raise RuntimeError("Unsupported CUDA version") requires.extend(get_tf_requirement()["cpu"]) # setuptools will re-find tensorflow after installing setup_requires tf_install_dir = None @@ -114,9 +132,9 @@ def get_tf_requirement(tf_version: str = "") -> dict: extra_requires = [] extra_select = {} - if not (tf_version == "" or tf_version in SpecifierSet(">=2.12")): + if not (tf_version == "" or tf_version in SpecifierSet(">=2.12", prereleases=True)): extra_requires.append("protobuf<3.20") - if tf_version == "" or tf_version in SpecifierSet(">=1.15"): + if tf_version == "" or tf_version in SpecifierSet(">=1.15", prereleases=True): extra_select["mpi"] = [ "horovod", "mpi4py", @@ -129,6 +147,8 @@ def get_tf_requirement(tf_version: str = "") -> dict: "cpu": [ "tensorflow-cpu; platform_machine!='aarch64' and (platform_machine!='arm64' or platform_system != 'Darwin')", "tensorflow; platform_machine=='aarch64' or (platform_machine=='arm64' and platform_system == 'Darwin')", + # https://github.com/tensorflow/tensorflow/issues/61830 + "tensorflow-cpu<2.15; platform_system=='Windows'", *extra_requires, ], "gpu": [ @@ -138,9 +158,9 @@ def get_tf_requirement(tf_version: str = "") -> dict: ], **extra_select, } - elif tf_version in SpecifierSet("<1.15") or tf_version in SpecifierSet( - ">=2.0,<2.1" - ): + elif tf_version in SpecifierSet( + "<1.15", prereleases=True + ) or tf_version in SpecifierSet(">=2.0,<2.1", prereleases=True): return { "cpu": [ f"tensorflow=={tf_version}", diff --git a/codecov.yml b/codecov.yml index 24dd9e3a23..3654859423 100644 --- a/codecov.yml +++ b/codecov.yml @@ -20,7 +20,7 @@ component_management: name: Python paths: - deepmd/** - - deepmd_cli/** + - deepmd_utils/** - component_id: module_op name: OP paths: diff --git a/deepmd/__init__.py b/deepmd/__init__.py index b02817b6fc..0190bbc124 100644 --- a/deepmd/__init__.py +++ b/deepmd/__init__.py @@ -32,7 +32,7 @@ set_mkl() try: - from deepmd_cli._version import version as __version__ + from deepmd_utils._version import version as __version__ except ImportError: from .__about__ import ( __version__, diff --git a/deepmd/calculator.py b/deepmd/calculator.py index 8636ff30d2..b9c0a81006 100644 --- a/deepmd/calculator.py +++ b/deepmd/calculator.py @@ -45,6 +45,8 @@ class DP(Calculator): type_dict : Dict[str, int], optional mapping of element types and their numbers, best left None and the calculator will infer this information from model, by default None + neighbor_list : 
ase.neighborlist.NeighborList, optional + The neighbor list object. If None, then build the native neighbor list. Examples -------- @@ -83,10 +85,11 @@ def __init__( model: Union[str, "Path"], label: str = "DP", type_dict: Optional[Dict[str, int]] = None, + neighbor_list=None, **kwargs, ) -> None: Calculator.__init__(self, label=label, **kwargs) - self.dp = DeepPotential(str(Path(model).resolve())) + self.dp = DeepPotential(str(Path(model).resolve()), neighbor_list=neighbor_list) if type_dict: self.type_dict = type_dict else: diff --git a/deepmd/common.py b/deepmd/common.py index 472508bb08..54e3d0a6f8 100644 --- a/deepmd/common.py +++ b/deepmd/common.py @@ -1,53 +1,65 @@ # SPDX-License-Identifier: LGPL-3.0-or-later """Collection of functions and classes used throughout the whole package.""" -import json import warnings from functools import ( wraps, ) -from pathlib import ( - Path, -) from typing import ( TYPE_CHECKING, Any, Callable, - Dict, - List, - Optional, - TypeVar, Union, ) -import numpy as np import tensorflow -import yaml from tensorflow.python.framework import ( tensor_util, ) from deepmd.env import ( - GLOBAL_NP_FLOAT_PRECISION, GLOBAL_TF_FLOAT_PRECISION, op_module, tf, ) -from deepmd.utils.path import ( - DPPath, +from deepmd_utils.common import ( + add_data_requirement, + data_requirement, + expand_sys_str, + get_np_precision, + j_loader, + j_must_have, + make_default_mesh, + select_idx_map, ) if TYPE_CHECKING: - _DICT_VAL = TypeVar("_DICT_VAL") - _OBJ = TypeVar("_OBJ") - try: - from typing import Literal # python >3.6 - except ImportError: - from typing_extensions import Literal # type: ignore - _ACTIVATION = Literal[ - "relu", "relu6", "softplus", "sigmoid", "tanh", "gelu", "gelu_tf" - ] - _PRECISION = Literal["default", "float16", "float32", "float64"] + from deepmd_utils.common import ( + _ACTIVATION, + _PRECISION, + ) + +__all__ = [ + # from deepmd_utils.common + "data_requirement", + "add_data_requirement", + "select_idx_map", + "make_default_mesh", + "j_must_have", + "j_loader", + "expand_sys_str", + "get_np_precision", + # from self + "PRECISION_DICT", + "gelu", + "gelu_tf", + "ACTIVATION_FN_DICT", + "get_activation_func", + "get_precision", + "safe_cast_tensor", + "cast_precision", + "clear_session", +] # define constants PRECISION_DICT = { @@ -115,10 +127,6 @@ def gelu_wrapper(x): return (lambda x: gelu_wrapper(x))(x) -# TODO this is not a good way to do things. This is some global variable to which -# TODO anyone can write and there is no good way to keep track of the changes -data_requirement = {} - ACTIVATION_FN_DICT = { "relu": tf.nn.relu, "relu6": tf.nn.relu6, @@ -132,164 +140,6 @@ def gelu_wrapper(x): } -def add_data_requirement( - key: str, - ndof: int, - atomic: bool = False, - must: bool = False, - high_prec: bool = False, - type_sel: Optional[bool] = None, - repeat: int = 1, - default: float = 0.0, - dtype: Optional[np.dtype] = None, -): - """Specify data requirements for training. - - Parameters - ---------- - key : str - type of data stored in corresponding `*.npy` file e.g. `forces` or `energy` - ndof : int - number of the degrees of freedom, this is tied to `atomic` parameter e.g. 
forces - have `atomic=True` and `ndof=3` - atomic : bool, optional - specifies whwther the `ndof` keyworrd applies to per atom quantity or not, - by default False - must : bool, optional - specifi if the `*.npy` data file must exist, by default False - high_prec : bool, optional - if true load data to `np.float64` else `np.float32`, by default False - type_sel : bool, optional - select only certain type of atoms, by default None - repeat : int, optional - if specify repaeat data `repeat` times, by default 1 - default : float, optional, default=0. - default value of data - dtype : np.dtype, optional - the dtype of data, overwrites `high_prec` if provided - """ - data_requirement[key] = { - "ndof": ndof, - "atomic": atomic, - "must": must, - "high_prec": high_prec, - "type_sel": type_sel, - "repeat": repeat, - "default": default, - "dtype": dtype, - } - - -def select_idx_map(atom_types: np.ndarray, select_types: np.ndarray) -> np.ndarray: - """Build map of indices for element supplied element types from all atoms list. - - Parameters - ---------- - atom_types : np.ndarray - array specifing type for each atoms as integer - select_types : np.ndarray - types of atoms you want to find indices for - - Returns - ------- - np.ndarray - indices of types of atoms defined by `select_types` in `atom_types` array - - Warnings - -------- - `select_types` array will be sorted before finding indices in `atom_types` - """ - sort_select_types = np.sort(select_types) - idx_map = [] - for ii in sort_select_types: - idx_map.append(np.where(atom_types == ii)[0]) - return np.concatenate(idx_map) - - -def make_default_mesh(pbc: bool, mixed_type: bool) -> np.ndarray: - """Make mesh. - - Only the size of mesh matters, not the values: - * 6 for PBC, no mixed types - * 0 for no PBC, no mixed types - * 7 for PBC, mixed types - * 1 for no PBC, mixed types - - Parameters - ---------- - pbc : bool - if True, the mesh will be made for periodic boundary conditions - mixed_type : bool - if True, the mesh will be made for mixed types - - Returns - ------- - np.ndarray - mesh - """ - mesh_size = int(pbc) * 6 + int(mixed_type) - default_mesh = np.zeros(mesh_size, dtype=np.int32) - return default_mesh - - -# TODO maybe rename this to j_deprecated and only warn about deprecated keys, -# TODO if the deprecated_key argument is left empty function puppose is only custom -# TODO error since dict[key] already raises KeyError when the key is missing -def j_must_have( - jdata: Dict[str, "_DICT_VAL"], key: str, deprecated_key: List[str] = [] -) -> "_DICT_VAL": - """Assert that supplied dictionary conaines specified key. - - Returns - ------- - _DICT_VAL - value that was store unde supplied key - - Raises - ------ - RuntimeError - if the key is not present - """ - if key not in jdata.keys(): - for ii in deprecated_key: - if ii in jdata.keys(): - warnings.warn(f"the key {ii} is deprecated, please use {key} instead") - return jdata[ii] - else: - raise RuntimeError(f"json database must provide key {key}") - else: - return jdata[key] - - -def j_loader(filename: Union[str, Path]) -> Dict[str, Any]: - """Load yaml or json settings file. 
- - Parameters - ---------- - filename : Union[str, Path] - path to file - - Returns - ------- - Dict[str, Any] - loaded dictionary - - Raises - ------ - TypeError - if the supplied file is of unsupported type - """ - filepath = Path(filename) - if filepath.suffix.endswith("json"): - with filepath.open() as fp: - return json.load(fp) - elif filepath.suffix.endswith(("yml", "yaml")): - with filepath.open() as fp: - return yaml.safe_load(fp) - else: - raise TypeError("config file must be json, or yaml/yml") - - def get_activation_func( activation_fn: Union["_ACTIVATION", None], ) -> Union[Callable[[tf.Tensor], tf.Tensor], None]: @@ -340,57 +190,6 @@ def get_precision(precision: "_PRECISION") -> Any: return PRECISION_DICT[precision] -# TODO port completely to pathlib when all callers are ported -def expand_sys_str(root_dir: Union[str, Path]) -> List[str]: - """Recursively iterate over directories taking those that contain `type.raw` file. - - Parameters - ---------- - root_dir : Union[str, Path] - starting directory - - Returns - ------- - List[str] - list of string pointing to system directories - """ - root_dir = DPPath(root_dir) - matches = [str(d) for d in root_dir.rglob("*") if (d / "type.raw").is_file()] - if (root_dir / "type.raw").is_file(): - matches.append(str(root_dir)) - return matches - - -def get_np_precision(precision: "_PRECISION") -> np.dtype: - """Get numpy precision constant from string. - - Parameters - ---------- - precision : _PRECISION - string name of numpy constant or default - - Returns - ------- - np.dtype - numpy presicion constant - - Raises - ------ - RuntimeError - if string is invalid - """ - if precision == "default": - return GLOBAL_NP_FLOAT_PRECISION - elif precision == "float16": - return np.float16 - elif precision == "float32": - return np.float32 - elif precision == "float64": - return np.float64 - else: - raise RuntimeError(f"{precision} is not a valid precision") - - def safe_cast_tensor( input: tf.Tensor, from_precision: tf.DType, to_precision: tf.DType ) -> tf.Tensor: diff --git a/deepmd/descriptor/se_a.py b/deepmd/descriptor/se_a.py index 2de0b63245..721bb0d534 100644 --- a/deepmd/descriptor/se_a.py +++ b/deepmd/descriptor/se_a.py @@ -41,6 +41,8 @@ GraphWithoutTensorError, ) from deepmd.utils.graph import ( + get_extra_embedding_net_suffix, + get_extra_embedding_net_variables_from_graph_def, get_pattern_nodes_from_graph_def, get_tensor_by_name_from_graph, ) @@ -204,7 +206,7 @@ def __init__( self.type_one_side = type_one_side self.spin = spin self.stripped_type_embedding = stripped_type_embedding - self.extra_embeeding_net_variables = None + self.extra_embedding_net_variables = None self.layer_size = len(neuron) # extend sel_a for spin system @@ -470,11 +472,13 @@ def enable_compression( ) if self.stripped_type_embedding: + one_side_suffix = get_extra_embedding_net_suffix(type_one_side=True) + two_side_suffix = get_extra_embedding_net_suffix(type_one_side=False) ret_two_side = get_pattern_nodes_from_graph_def( - graph_def, f"filter_type_all{suffix}/.+_two_side_ebd" + graph_def, f"filter_type_all{suffix}/.+{two_side_suffix}" ) ret_one_side = get_pattern_nodes_from_graph_def( - graph_def, f"filter_type_all{suffix}/.+_one_side_ebd" + graph_def, f"filter_type_all{suffix}/.+{one_side_suffix}" ) if len(ret_two_side) == 0 and len(ret_one_side) == 0: raise RuntimeError( @@ -487,19 +491,19 @@ def enable_compression( elif len(ret_two_side) != 0: self.final_type_embedding = get_two_side_type_embedding(self, graph) self.matrix = 
get_extra_side_embedding_net_variable( - self, graph_def, "two_side", "matrix", suffix + self, graph_def, two_side_suffix, "matrix", suffix ) self.bias = get_extra_side_embedding_net_variable( - self, graph_def, "two_side", "bias", suffix + self, graph_def, two_side_suffix, "bias", suffix ) self.extra_embedding = make_data(self, self.final_type_embedding) else: self.final_type_embedding = get_type_embedding(self, graph) self.matrix = get_extra_side_embedding_net_variable( - self, graph_def, "one_side", "matrix", suffix + self, graph_def, one_side_suffix, "matrix", suffix ) self.bias = get_extra_side_embedding_net_variable( - self, graph_def, "one_side", "bias", suffix + self, graph_def, one_side_suffix, "bias", suffix ) self.extra_embedding = make_data(self, self.final_type_embedding) @@ -778,16 +782,16 @@ def _pass_filter( type_i = -1 if nvnmd_cfg.enable and nvnmd_cfg.quantize_descriptor: inputs_i = descrpt2r4(inputs_i, natoms) + self.atype_nloc = tf.reshape( + tf.slice(atype, [0, 0], [-1, natoms[0]]), [-1] + ) # when nloc != nall, pass nloc to mask if len(self.exclude_types): - atype_nloc = tf.reshape( - tf.slice(atype, [0, 0], [-1, natoms[0]]), [-1] - ) # when nloc != nall, pass nloc to mask mask = self.build_type_exclude_mask( self.exclude_types, self.ntypes, self.sel_a, self.ndescrpt, - atype_nloc, + self.atype_nloc, tf.shape(inputs_i)[0], ) inputs_i *= mask @@ -952,7 +956,7 @@ def _filter_lower( extra_embedding_index = self.nei_type_vec else: padding_ntypes = type_embedding.shape[0] - atype_expand = tf.reshape(self.atype, [-1, 1]) + atype_expand = tf.reshape(self.atype_nloc, [-1, 1]) idx_i = tf.tile(atype_expand * padding_ntypes, [1, self.nnei]) idx_j = tf.reshape(self.nei_type_vec, [-1, self.nnei]) idx = idx_i + idx_j @@ -961,20 +965,21 @@ def _filter_lower( if not self.compress: if self.type_one_side: - one_side_type_embedding_suffix = "_one_side_ebd" net_output = embedding_net( type_embedding, self.filter_neuron, self.filter_precision, activation_fn=activation_fn, resnet_dt=self.filter_resnet_dt, - name_suffix=one_side_type_embedding_suffix, + name_suffix=get_extra_embedding_net_suffix( + self.type_one_side + ), stddev=stddev, bavg=bavg, seed=self.seed, trainable=trainable, uniform_seed=self.uniform_seed, - initial_variables=self.extra_embeeding_net_variables, + initial_variables=self.extra_embedding_net_variables, mixed_prec=self.mixed_prec, ) net_output = tf.nn.embedding_lookup( @@ -997,27 +1002,21 @@ def _filter_lower( [-1, two_side_type_embedding.shape[-1]], ) - atype_expand = tf.reshape(self.atype, [-1, 1]) - idx_i = tf.tile(atype_expand * padding_ntypes, [1, self.nnei]) - idx_j = tf.reshape(self.nei_type_vec, [-1, self.nnei]) - idx = idx_i + idx_j - index_of_two_side = tf.reshape(idx, [-1]) - self.extra_embedding_index = index_of_two_side - - two_side_type_embedding_suffix = "_two_side_ebd" net_output = embedding_net( two_side_type_embedding, self.filter_neuron, self.filter_precision, activation_fn=activation_fn, resnet_dt=self.filter_resnet_dt, - name_suffix=two_side_type_embedding_suffix, + name_suffix=get_extra_embedding_net_suffix( + self.type_one_side + ), stddev=stddev, bavg=bavg, seed=self.seed, trainable=trainable, uniform_seed=self.uniform_seed, - initial_variables=self.extra_embeeding_net_variables, + initial_variables=self.extra_embedding_net_variables, mixed_prec=self.mixed_prec, ) net_output = tf.nn.embedding_lookup(net_output, idx) @@ -1327,6 +1326,15 @@ def init_variables( self.dstd = new_dstd if self.original_sel is None: self.original_sel = sel + if 
self.stripped_type_embedding: + self.extra_embedding_net_variables = ( + get_extra_embedding_net_variables_from_graph_def( + graph_def, + suffix, + get_extra_embedding_net_suffix(self.type_one_side), + self.layer_size, + ) + ) @property def explicit_ntypes(self) -> bool: diff --git a/deepmd/descriptor/se_a_mask.py b/deepmd/descriptor/se_a_mask.py index 780b34d294..cc2e6b4fc8 100644 --- a/deepmd/descriptor/se_a_mask.py +++ b/deepmd/descriptor/se_a_mask.py @@ -417,3 +417,16 @@ def prod_force_virial( atom_virial = tf.zeros([1, natoms[1], 9], dtype=force.dtype) return force, virial, atom_virial + + @classmethod + def update_sel(cls, global_jdata: dict, local_jdata: dict): + """Update the selection and perform neighbor statistics. + + Parameters + ---------- + global_jdata : dict + The global data, containing the training section + local_jdata : dict + The local data refer to the current class + """ + return local_jdata diff --git a/deepmd/descriptor/se_atten.py b/deepmd/descriptor/se_atten.py index 8e4c3c3ef6..1ceda23065 100644 --- a/deepmd/descriptor/se_atten.py +++ b/deepmd/descriptor/se_atten.py @@ -42,9 +42,10 @@ ) from deepmd.utils.graph import ( get_attention_layer_variables_from_graph_def, + get_extra_embedding_net_suffix, + get_extra_embedding_net_variables_from_graph_def, get_pattern_nodes_from_graph_def, get_tensor_by_name_from_graph, - get_tensor_by_type, ) from deepmd.utils.network import ( embedding_net, @@ -391,11 +392,12 @@ def enable_compression( raise RuntimeError("can not compress model when attention layer is not 0.") ret = get_pattern_nodes_from_graph_def( - graph_def, f"filter_type_all{suffix}/.+_two_side_ebd" + graph_def, + f"filter_type_all{suffix}/.+{get_extra_embedding_net_suffix(type_one_side=False)}", ) if len(ret) == 0: raise RuntimeError( - "can not find variables of embedding net `*_two_side_ebd` from graph_def, maybe it is not a compressible model." + f"can not find variables of embedding net `*{get_extra_embedding_net_suffix(type_one_side=False)}` from graph_def, maybe it is not a compressible model." 
) self.compress = True @@ -420,11 +422,12 @@ def enable_compression( ) self.final_type_embedding = get_two_side_type_embedding(self, graph) + type_side_suffix = get_extra_embedding_net_suffix(type_one_side=False) self.matrix = get_extra_side_embedding_net_variable( - self, graph_def, "two_side", "matrix", suffix + self, graph_def, type_side_suffix, "matrix", suffix ) self.bias = get_extra_side_embedding_net_variable( - self, graph_def, "two_side", "bias", suffix + self, graph_def, type_side_suffix, "bias", suffix ) self.two_embd = make_data(self, self.final_type_embedding) @@ -1125,14 +1128,15 @@ def _filter_lower( two_side_type_embedding, [-1, two_side_type_embedding.shape[-1]], ) - two_side_type_embedding_suffix = "_two_side_ebd" embedding_of_two_side_type_embedding = embedding_net( two_side_type_embedding, self.filter_neuron, self.filter_precision, activation_fn=activation_fn, resnet_dt=self.filter_resnet_dt, - name_suffix=two_side_type_embedding_suffix, + name_suffix=get_extra_embedding_net_suffix( + type_one_side=False + ), stddev=stddev, bavg=bavg, seed=self.seed, @@ -1292,18 +1296,6 @@ def init_variables( """ super().init_variables(graph=graph, graph_def=graph_def, suffix=suffix) - if self.stripped_type_embedding: - self.two_side_embeeding_net_variables = {} - for i in range(1, self.layer_size + 1): - matrix_pattern = f"filter_type_all{suffix}/matrix_{i}_two_side_ebd" - self.two_side_embeeding_net_variables[ - matrix_pattern - ] = self._get_two_embed_variables(graph_def, matrix_pattern) - bias_pattern = f"filter_type_all{suffix}/bias_{i}_two_side_ebd" - self.two_side_embeeding_net_variables[ - bias_pattern - ] = self._get_two_embed_variables(graph_def, bias_pattern) - self.attention_layer_variables = get_attention_layer_variables_from_graph_def( graph_def, suffix=suffix ) @@ -1322,18 +1314,15 @@ def init_variables( f"attention_layer_{i}{suffix}/layer_normalization_{i}/gamma" ] - def _get_two_embed_variables(self, graph_def, pattern: str): - node = get_pattern_nodes_from_graph_def(graph_def, pattern)[pattern] - dtype = tf.as_dtype(node.dtype).as_numpy_dtype - tensor_shape = tf.TensorShape(node.tensor_shape).as_list() - if (len(tensor_shape) != 1) or (tensor_shape[0] != 1): - tensor_value = np.frombuffer( - node.tensor_content, - dtype=tf.as_dtype(node.dtype).as_numpy_dtype, + if self.stripped_type_embedding: + self.two_side_embeeding_net_variables = ( + get_extra_embedding_net_variables_from_graph_def( + graph_def, + suffix, + get_extra_embedding_net_suffix(type_one_side=False), + self.layer_size, + ) ) - else: - tensor_value = get_tensor_by_type(node, dtype) - return np.reshape(tensor_value, tensor_shape) def build_type_exclude_mask( self, diff --git a/deepmd/entrypoints/doc.py b/deepmd/entrypoints/doc.py index 087eb10f73..cc28e52930 100644 --- a/deepmd/entrypoints/doc.py +++ b/deepmd/entrypoints/doc.py @@ -1,20 +1,6 @@ # SPDX-License-Identifier: LGPL-3.0-or-later -"""Module that prints train input arguments docstrings.""" - -from deepmd.utils.argcheck import ( - gen_doc, - gen_json, +from deepmd_utils.entrypoints.doc import ( + doc_train_input, ) __all__ = ["doc_train_input"] - - -def doc_train_input(*, out_type: str = "rst", **kwargs): - """Print out trining input arguments to console.""" - if out_type == "rst": - doc_str = gen_doc(make_anchor=True) - elif out_type == "json": - doc_str = gen_json() - else: - raise RuntimeError("Unsupported out type %s" % out_type) - print(doc_str) diff --git a/deepmd/entrypoints/gui.py b/deepmd/entrypoints/gui.py index 8b6b9e0a09..72de65f1c2 
100644 --- a/deepmd/entrypoints/gui.py +++ b/deepmd/entrypoints/gui.py @@ -1,31 +1,6 @@ # SPDX-License-Identifier: LGPL-3.0-or-later -"""DP-GUI entrypoint.""" +from deepmd_utils.entrypoints.gui import ( + start_dpgui, +) - -def start_dpgui(*, port: int, bind_all: bool, **kwargs): - """Host DP-GUI server. - - Parameters - ---------- - port : int - The port to serve DP-GUI on. - bind_all : bool - Serve on all public interfaces. This will expose your DP-GUI instance - to the network on both IPv4 and IPv6 (where available). - **kwargs - additional arguments - - Raises - ------ - ModuleNotFoundError - The dpgui package is not installed - """ - try: - from dpgui import ( - start_dpgui, - ) - except ModuleNotFoundError as e: - raise ModuleNotFoundError( - "To use DP-GUI, please install the dpgui package:\npip install dpgui" - ) from e - start_dpgui(port=port, bind_all=bind_all) +__all__ = ["start_dpgui"] diff --git a/deepmd/entrypoints/main.py b/deepmd/entrypoints/main.py index 782136b542..2c6ac26a7f 100644 --- a/deepmd/entrypoints/main.py +++ b/deepmd/entrypoints/main.py @@ -32,7 +32,7 @@ from deepmd.nvnmd.entrypoints.train import ( train_nvnmd, ) -from deepmd_cli.main import ( +from deepmd_utils.main import ( get_ll, main_parser, parse_args, diff --git a/deepmd/entrypoints/train.py b/deepmd/entrypoints/train.py index 9469b7df90..227aa13644 100755 --- a/deepmd/entrypoints/train.py +++ b/deepmd/entrypoints/train.py @@ -404,9 +404,7 @@ def get_nbor_stat(jdata, rcut, one_type: bool = False): None, ) tmp_data.get_batch() - assert ( - tmp_data.get_type_map() - ), f"In multi-task mode, 'type_map.raw' must be defined in data systems {systems}! " + assert tmp_data.get_type_map(), f"In multi-task mode, 'type_map.raw' must be defined in data systems {systems}! " if train_data is None: train_data = tmp_data else: diff --git a/deepmd/env.py b/deepmd/env.py index 9b7f86f0d5..f290dc0a90 100644 --- a/deepmd/env.py +++ b/deepmd/env.py @@ -28,6 +28,11 @@ ) import deepmd.lib +from deepmd_utils.env import ( + GLOBAL_ENER_FLOAT_PRECISION, + GLOBAL_NP_FLOAT_PRECISION, + global_float_prec, +) if TYPE_CHECKING: from types import ( @@ -475,24 +480,7 @@ def _get_package_constants( op_grads_module = get_module("op_grads") # FLOAT_PREC -dp_float_prec = os.environ.get("DP_INTERFACE_PREC", "high").lower() -if dp_float_prec in ("high", ""): - # default is high - GLOBAL_TF_FLOAT_PRECISION = tf.float64 - GLOBAL_NP_FLOAT_PRECISION = np.float64 - GLOBAL_ENER_FLOAT_PRECISION = np.float64 - global_float_prec = "double" -elif dp_float_prec == "low": - GLOBAL_TF_FLOAT_PRECISION = tf.float32 - GLOBAL_NP_FLOAT_PRECISION = np.float32 - GLOBAL_ENER_FLOAT_PRECISION = np.float64 - global_float_prec = "float" -else: - raise RuntimeError( - "Unsupported float precision option: %s. Supported: high," - "low. Please set precision with environmental variable " - "DP_INTERFACE_PREC." % dp_float_prec - ) +GLOBAL_TF_FLOAT_PRECISION = tf.dtypes.as_dtype(GLOBAL_NP_FLOAT_PRECISION) def global_cvt_2_tf_float(xx: tf.Tensor) -> tf.Tensor: diff --git a/deepmd/fit/ener.py b/deepmd/fit/ener.py index e74d4a7e6d..4c15e57124 100644 --- a/deepmd/fit/ener.py +++ b/deepmd/fit/ener.py @@ -514,6 +514,11 @@ def build( self.bias_atom_e[type_i] = self.bias_atom_e[type_i] self.bias_atom_e = self.bias_atom_e[:ntypes_atom] + if nvnmd_cfg.enable: + # fix the bug: CNN and QNN have different t_bias_atom_e. 
+ if "t_bias_atom_e" in nvnmd_cfg.weight.keys(): + self.bias_atom_e = nvnmd_cfg.weight["t_bias_atom_e"] + with tf.variable_scope("fitting_attr" + suffix, reuse=reuse): t_dfparam = tf.constant(self.numb_fparam, name="dfparam", dtype=tf.int32) t_daparam = tf.constant(self.numb_aparam, name="daparam", dtype=tf.int32) diff --git a/deepmd/fit/polar.py b/deepmd/fit/polar.py index 0a6f7d4242..8f6631866c 100644 --- a/deepmd/fit/polar.py +++ b/deepmd/fit/polar.py @@ -213,8 +213,9 @@ def compute_input_stats(self, all_stat, protection=1e-2): # add polar_bias polar_bias.append(all_stat["polarizability"][ss].reshape((1, 9))) - matrix, bias = np.concatenate(sys_matrix, axis=0), np.concatenate( - polar_bias, axis=0 + matrix, bias = ( + np.concatenate(sys_matrix, axis=0), + np.concatenate(polar_bias, axis=0), ) atom_polar, _, _, _ = np.linalg.lstsq(matrix, bias, rcond=None) for itype in range(len(self.sel_type)): diff --git a/deepmd/infer/__init__.py b/deepmd/infer/__init__.py index 14d75d0c44..c1071af35c 100644 --- a/deepmd/infer/__init__.py +++ b/deepmd/infer/__init__.py @@ -58,6 +58,7 @@ def DeepPotential( load_prefix: str = "load", default_tf_graph: bool = False, input_map: Optional[dict] = None, + neighbor_list=None, ) -> Union[DeepDipole, DeepGlobalPolar, DeepPolar, DeepPot, DeepDOS, DeepWFC]: """Factory function that will inialize appropriate potential read from `model_file`. @@ -71,6 +72,8 @@ def DeepPotential( If uses the default tf graph, otherwise build a new tf graph for evaluation input_map : dict, optional The input map for tf.import_graph_def. Only work with default tf graph + neighbor_list : ase.neighborlist.NeighborList, optional + The neighbor list object. If None, then build the native neighbor list. Returns ------- @@ -97,6 +100,7 @@ def DeepPotential( load_prefix=load_prefix, default_tf_graph=default_tf_graph, input_map=input_map, + neighbor_list=neighbor_list, ) elif model_type == "dos": dp = DeepDOS( @@ -111,6 +115,7 @@ def DeepPotential( load_prefix=load_prefix, default_tf_graph=default_tf_graph, input_map=input_map, + neighbor_list=neighbor_list, ) elif model_type == "polar": dp = DeepPolar( @@ -118,6 +123,7 @@ def DeepPotential( load_prefix=load_prefix, default_tf_graph=default_tf_graph, input_map=input_map, + neighbor_list=neighbor_list, ) elif model_type == "global_polar": dp = DeepGlobalPolar( @@ -125,6 +131,7 @@ def DeepPotential( load_prefix=load_prefix, default_tf_graph=default_tf_graph, input_map=input_map, + neighbor_list=neighbor_list, ) elif model_type == "wfc": dp = DeepWFC( diff --git a/deepmd/infer/deep_dipole.py b/deepmd/infer/deep_dipole.py index 6020118135..aba098a9f3 100644 --- a/deepmd/infer/deep_dipole.py +++ b/deepmd/infer/deep_dipole.py @@ -27,6 +27,8 @@ class DeepDipole(DeepTensor): If uses the default tf graph, otherwise build a new tf graph for evaluation input_map : dict, optional The input map for tf.import_graph_def. Only work with default tf graph + neighbor_list : ase.neighborlist.NeighborList, optional + The neighbor list object. If None, then build the native neighbor list. 
Warnings -------- @@ -41,6 +43,7 @@ def __init__( load_prefix: str = "load", default_tf_graph: bool = False, input_map: Optional[dict] = None, + neighbor_list=None, ) -> None: # use this in favor of dict update to move attribute from class to # instance namespace @@ -58,6 +61,7 @@ def __init__( load_prefix=load_prefix, default_tf_graph=default_tf_graph, input_map=input_map, + neighbor_list=neighbor_list, ) def get_dim_fparam(self) -> int: diff --git a/deepmd/infer/deep_eval.py b/deepmd/infer/deep_eval.py index 3f5dede1ad..0ca9f21a77 100644 --- a/deepmd/infer/deep_eval.py +++ b/deepmd/infer/deep_eval.py @@ -45,6 +45,9 @@ class DeepEval: as the initial batch size. input_map : dict, optional The input map for tf.import_graph_def. Only work with default tf graph + neighbor_list : ase.neighborlist.NewPrimitiveNeighborList, optional + The ASE neighbor list class to produce the neighbor list. If None, the + neighbor list will be built natively in the model. """ load_prefix: str # set by subclass @@ -56,6 +59,7 @@ def __init__( default_tf_graph: bool = False, auto_batch_size: Union[bool, int, AutoBatchSize] = False, input_map: Optional[dict] = None, + neighbor_list=None, ): self.graph = self._load_graph( model_file, @@ -86,6 +90,8 @@ def __init__( else: raise TypeError("auto_batch_size should be bool, int, or AutoBatchSize") + self.neighbor_list = neighbor_list + @property @lru_cache(maxsize=None) def model_type(self) -> str: @@ -360,3 +366,92 @@ def eval_typeebd(self) -> np.ndarray: t_typeebd = self._get_tensor("t_typeebd:0") [typeebd] = run_sess(self.sess, [t_typeebd], feed_dict={}) return typeebd + + def build_neighbor_list( + self, + coords: np.ndarray, + cell: Optional[np.ndarray], + atype: np.ndarray, + imap: np.ndarray, + neighbor_list, + ): + """Make the mesh with neighbor list for a single frame. + + Parameters + ---------- + coords : np.ndarray + The coordinates of atoms. Should be of shape [natoms, 3] + cell : Optional[np.ndarray] + The cell of the system. Should be of shape [3, 3] + atype : np.ndarray + The type of atoms. Should be of shape [natoms] + imap : np.ndarray + The index map of atoms. Should be of shape [natoms] + neighbor_list : ase.neighborlist.NewPrimitiveNeighborList + ASE neighbor list. The following method or attribute will be + used/set: bothways, self_interaction, update, build, first_neigh, + pair_second, offset_vec. + + Returns + ------- + natoms_vec : np.ndarray + The number of atoms. This tensor has the length of Ntypes + 2 + natoms[0]: nloc + natoms[1]: nall + natoms[i]: 2 <= i < Ntypes+2, number of type i atoms for nloc + coords : np.ndarray + The coordinates of atoms, including ghost atoms. Should be of + shape [nframes, nall, 3] + atype : np.ndarray + The type of atoms, including ghost atoms. Should be of shape [nall] + mesh : np.ndarray + The mesh in nei_mode=4. + imap : np.ndarray + The index map of atoms. Should be of shape [nall] + ghost_map : np.ndarray + The index map of ghost atoms. 
Should be of shape [nghost] + """ + pbc = np.repeat(cell is not None, 3) + cell = cell.reshape(3, 3) + positions = coords.reshape(-1, 3) + neighbor_list.bothways = True + neighbor_list.self_interaction = False + if neighbor_list.update(pbc, cell, positions): + neighbor_list.build(pbc, cell, positions) + first_neigh = neighbor_list.first_neigh.copy() + pair_second = neighbor_list.pair_second.copy() + offset_vec = neighbor_list.offset_vec.copy() + # get out-of-box neighbors + out_mask = np.any(offset_vec != 0, axis=1) + out_idx = pair_second[out_mask] + out_offset = offset_vec[out_mask] + out_coords = positions[out_idx] + out_offset.dot(cell) + atype = np.array(atype, dtype=int) + out_atype = atype[out_idx] + + nloc = positions.shape[0] + nghost = out_idx.size + all_coords = np.concatenate((positions, out_coords), axis=0) + all_atype = np.concatenate((atype, out_atype), axis=0) + # convert neighbor indexes + ghost_map = pair_second[out_mask] + pair_second[out_mask] = np.arange(nloc, nloc + nghost) + # get the mesh + mesh = np.zeros(16 + nloc * 2 + pair_second.size, dtype=int) + mesh[0] = nloc + # ilist + mesh[16 : 16 + nloc] = np.arange(nloc) + # numnei + mesh[16 + nloc : 16 + nloc * 2] = first_neigh[1:] - first_neigh[:-1] + # jlist + mesh[16 + nloc * 2 :] = pair_second + + # natoms_vec + natoms_vec = np.zeros(self.ntypes + 2).astype(int) + natoms_vec[0] = nloc + natoms_vec[1] = nloc + nghost + for ii in range(self.ntypes): + natoms_vec[ii + 2] = np.count_nonzero(atype == ii) + # imap append ghost atoms + imap = np.concatenate((imap, np.arange(nloc, nloc + nghost))) + return natoms_vec, all_coords, all_atype, mesh, imap, ghost_map diff --git a/deepmd/infer/deep_polar.py b/deepmd/infer/deep_polar.py index 118f8c98a7..c1f981ef86 100644 --- a/deepmd/infer/deep_polar.py +++ b/deepmd/infer/deep_polar.py @@ -30,6 +30,8 @@ class DeepPolar(DeepTensor): If uses the default tf graph, otherwise build a new tf graph for evaluation input_map : dict, optional The input map for tf.import_graph_def. Only work with default tf graph + neighbor_list : ase.neighborlist.NeighborList, optional + The neighbor list object. If None, then build the native neighbor list. Warnings -------- @@ -44,6 +46,7 @@ def __init__( load_prefix: str = "load", default_tf_graph: bool = False, input_map: Optional[dict] = None, + neighbor_list=None, ) -> None: # use this in favor of dict update to move attribute from class to # instance namespace @@ -61,6 +64,7 @@ def __init__( load_prefix=load_prefix, default_tf_graph=default_tf_graph, input_map=input_map, + neighbor_list=neighbor_list, ) def get_dim_fparam(self) -> int: @@ -83,10 +87,16 @@ class DeepGlobalPolar(DeepTensor): The prefix in the load computational graph default_tf_graph : bool If uses the default tf graph, otherwise build a new tf graph for evaluation + neighbor_list : ase.neighborlist.NeighborList, optional + The neighbor list object. If None, then build the native neighbor list. 
""" def __init__( - self, model_file: str, load_prefix: str = "load", default_tf_graph: bool = False + self, + model_file: str, + load_prefix: str = "load", + default_tf_graph: bool = False, + neighbor_list=None, ) -> None: self.tensors.update( { @@ -101,6 +111,7 @@ def __init__( model_file, load_prefix=load_prefix, default_tf_graph=default_tf_graph, + neighbor_list=None, ) def eval( diff --git a/deepmd/infer/deep_pot.py b/deepmd/infer/deep_pot.py index fc9a6a76ed..81cfdde7a8 100644 --- a/deepmd/infer/deep_pot.py +++ b/deepmd/infer/deep_pot.py @@ -51,6 +51,9 @@ class DeepPot(DeepEval): as the initial batch size. input_map : dict, optional The input map for tf.import_graph_def. Only work with default tf graph + neighbor_list : ase.neighborlist.NewPrimitiveNeighborList, optional + The ASE neighbor list class to produce the neighbor list. If None, the + neighbor list will be built natively in the model. Examples -------- @@ -78,6 +81,7 @@ def __init__( default_tf_graph: bool = False, auto_batch_size: Union[bool, int, AutoBatchSize] = True, input_map: Optional[dict] = None, + neighbor_list=None, ) -> None: # add these tensors on top of what is defined by DeepTensor Class # use this in favor of dict update to move attribute from class to @@ -112,6 +116,7 @@ def __init__( default_tf_graph=default_tf_graph, auto_batch_size=auto_batch_size, input_map=input_map, + neighbor_list=neighbor_list, ) # load optional tensors @@ -479,8 +484,30 @@ def _prepare_feed_dict( aparam = np.reshape(aparam, [nframes, natoms * fdim]) # make natoms_vec and default_mesh - natoms_vec = self.make_natoms_vec(atom_types, mixed_type=mixed_type) - assert natoms_vec[0] == natoms + if self.neighbor_list is None: + natoms_vec = self.make_natoms_vec(atom_types, mixed_type=mixed_type) + assert natoms_vec[0] == natoms + mesh = make_default_mesh(pbc, mixed_type) + ghost_map = None + else: + if nframes > 1: + raise NotImplementedError( + "neighbor_list does not support multiple frames" + ) + ( + natoms_vec, + coords, + atom_types, + mesh, + imap, + ghost_map, + ) = self.build_neighbor_list( + coords, + cells if cells is not None else None, + atom_types, + imap, + self.neighbor_list, + ) # evaluate feed_dict_test = {} @@ -501,12 +528,12 @@ def _prepare_feed_dict( raise RuntimeError if self.has_efield: feed_dict_test[self.t_efield] = np.reshape(efield, [-1]) - feed_dict_test[self.t_mesh] = make_default_mesh(pbc, mixed_type) + feed_dict_test[self.t_mesh] = mesh if self.has_fparam: feed_dict_test[self.t_fparam] = np.reshape(fparam, [-1]) if self.has_aparam: feed_dict_test[self.t_aparam] = np.reshape(aparam, [-1]) - return feed_dict_test, imap, natoms_vec + return feed_dict_test, imap, natoms_vec, ghost_map def _eval_inner( self, @@ -522,10 +549,13 @@ def _eval_inner( natoms, nframes = self._get_natoms_and_nframes( coords, atom_types, mixed_type=mixed_type ) - feed_dict_test, imap, natoms_vec = self._prepare_feed_dict( + feed_dict_test, imap, natoms_vec, ghost_map = self._prepare_feed_dict( coords, cells, atom_types, fparam, aparam, efield, mixed_type=mixed_type ) + nloc = natoms_vec[0] + nall = natoms_vec[1] + t_out = [self.t_energy, self.t_force, self.t_virial] if atomic: t_out += [self.t_ae, self.t_av] @@ -548,6 +578,13 @@ def _eval_inner( ) else: natoms_real = natoms + if ghost_map is not None: + # add the value of ghost atoms to real atoms + force = np.reshape(force, [nframes, -1, 3]) + np.add.at(force[0], ghost_map, force[0, nloc:]) + if atomic: + av = np.reshape(av, [nframes, -1, 9]) + np.add.at(av[0], ghost_map, av[0, nloc:]) # 
reverse map of the outputs force = self.reverse_map(np.reshape(force, [nframes, -1, 3]), imap) @@ -556,11 +593,15 @@ def _eval_inner( av = self.reverse_map(np.reshape(av, [nframes, -1, 9]), imap) energy = np.reshape(energy, [nframes, 1]) - force = np.reshape(force, [nframes, natoms, 3]) + force = np.reshape(force, [nframes, nall, 3]) + if nloc < nall: + force = force[:, :nloc, :] virial = np.reshape(virial, [nframes, 9]) if atomic: ae = np.reshape(ae, [nframes, natoms_real, 1]) - av = np.reshape(av, [nframes, natoms, 9]) + av = np.reshape(av, [nframes, nall, 9]) + if nloc < nall: + av = av[:, :nloc, :] return energy, force, virial, ae, av else: return energy, force, virial @@ -640,10 +681,11 @@ def _eval_descriptor_inner( natoms, nframes = self._get_natoms_and_nframes( coords, atom_types, mixed_type=mixed_type ) - feed_dict_test, imap, natoms_vec = self._prepare_feed_dict( + feed_dict_test, imap, natoms_vec, ghost_map = self._prepare_feed_dict( coords, cells, atom_types, fparam, aparam, efield, mixed_type=mixed_type ) (descriptor,) = run_sess( self.sess, [self.t_descriptor], feed_dict=feed_dict_test ) + imap = imap[:natoms] return self.reverse_map(np.reshape(descriptor, [nframes, natoms, -1]), imap) diff --git a/deepmd/infer/deep_tensor.py b/deepmd/infer/deep_tensor.py index 268523e959..a803eb0c6b 100644 --- a/deepmd/infer/deep_tensor.py +++ b/deepmd/infer/deep_tensor.py @@ -39,6 +39,8 @@ class DeepTensor(DeepEval): If uses the default tf graph, otherwise build a new tf graph for evaluation input_map : dict, optional The input map for tf.import_graph_def. Only work with default tf graph + neighbor_list : ase.neighborlist.NeighborList, optional + The neighbor list object. If None, then build the native neighbor list. """ tensors: ClassVar[Dict[str, str]] = { @@ -63,6 +65,7 @@ def __init__( load_prefix: str = "load", default_tf_graph: bool = False, input_map: Optional[dict] = None, + neighbor_list=None, ) -> None: """Constructor.""" DeepEval.__init__( @@ -71,6 +74,7 @@ def __init__( load_prefix=load_prefix, default_tf_graph=default_tf_graph, input_map=input_map, + neighbor_list=neighbor_list, ) # check model type model_type = self.tensors["t_tensor"][2:-2] @@ -209,8 +213,29 @@ def eval( ) # make natoms_vec and default_mesh - natoms_vec = self.make_natoms_vec(atom_types, mixed_type=mixed_type) - assert natoms_vec[0] == natoms + if self.neighbor_list is None: + natoms_vec = self.make_natoms_vec(atom_types, mixed_type=mixed_type) + assert natoms_vec[0] == natoms + mesh = make_default_mesh(pbc, mixed_type) + else: + if nframes > 1: + raise NotImplementedError( + "neighbor_list does not support multiple frames" + ) + ( + natoms_vec, + coords, + atom_types, + mesh, + imap, + _, + ) = self.build_neighbor_list( + coords, + cells if cells is not None else None, + atom_types, + imap, + self.neighbor_list, + ) # evaluate feed_dict_test = {} @@ -223,7 +248,7 @@ def eval( ) feed_dict_test[self.t_coord] = np.reshape(coords, [-1]) feed_dict_test[self.t_box] = np.reshape(cells, [-1]) - feed_dict_test[self.t_mesh] = make_default_mesh(pbc, mixed_type) + feed_dict_test[self.t_mesh] = mesh if atomic: assert ( @@ -333,8 +358,30 @@ def eval_full( ) # make natoms_vec and default_mesh - natoms_vec = self.make_natoms_vec(atom_types, mixed_type=mixed_type) - assert natoms_vec[0] == natoms + if self.neighbor_list is None: + natoms_vec = self.make_natoms_vec(atom_types, mixed_type=mixed_type) + assert natoms_vec[0] == natoms + mesh = make_default_mesh(pbc, mixed_type) + ghost_map = None + else: + if nframes > 1: + 
raise NotImplementedError( + "neighbor_list does not support multiple frames" + ) + ( + natoms_vec, + coords, + atom_types, + mesh, + imap, + ghost_map, + ) = self.build_neighbor_list( + coords, + cells if cells is not None else None, + atom_types, + imap, + self.neighbor_list, + ) # evaluate feed_dict_test = {} @@ -347,7 +394,7 @@ def eval_full( ) feed_dict_test[self.t_coord] = np.reshape(coords, [-1]) feed_dict_test[self.t_box] = np.reshape(cells, [-1]) - feed_dict_test[self.t_mesh] = make_default_mesh(pbc, mixed_type) + feed_dict_test[self.t_mesh] = mesh t_out = [self.t_global_tensor, self.t_force, self.t_virial] if atomic: @@ -361,21 +408,39 @@ def eval_full( at = v_out[3] # atom tensor av = v_out[4] # atom virial + nloc = natoms_vec[0] + nall = natoms_vec[1] + + if ghost_map is not None: + # add the value of ghost atoms to real atoms + force = np.reshape(force, [nframes * nout, -1, 3]) + # TODO: is there some way not to use for loop? + for ii in range(nframes * nout): + np.add.at(force[ii], ghost_map, force[ii, nloc:]) + if atomic: + av = np.reshape(av, [nframes * nout, -1, 9]) + for ii in range(nframes * nout): + np.add.at(av[ii], ghost_map, av[ii, nloc:]) + # please note here the shape are wrong! - force = self.reverse_map(np.reshape(force, [nframes * nout, natoms, 3]), imap) + force = self.reverse_map(np.reshape(force, [nframes * nout, nall, 3]), imap) if atomic: at = self.reverse_map( np.reshape(at, [nframes, len(sel_at), nout]), sel_imap ) - av = self.reverse_map(np.reshape(av, [nframes * nout, natoms, 9]), imap) + av = self.reverse_map(np.reshape(av, [nframes * nout, nall, 9]), imap) # make sure the shapes are correct here gt = np.reshape(gt, [nframes, nout]) - force = np.reshape(force, [nframes, nout, natoms, 3]) + force = np.reshape(force, [nframes, nout, nall, 3]) + if nloc < nall: + force = force[:, :, :nloc, :] virial = np.reshape(virial, [nframes, nout, 9]) if atomic: at = np.reshape(at, [nframes, len(sel_at), self.output_dim]) - av = np.reshape(av, [nframes, nout, natoms, 9]) + av = np.reshape(av, [nframes, nout, nall, 9]) + if nloc < nall: + av = av[:, :, :nloc, :] return gt, force, virial, at, av else: return gt, force, virial diff --git a/deepmd/loggers/__init__.py b/deepmd/loggers/__init__.py index 39aa76139d..71057e3056 100644 --- a/deepmd/loggers/__init__.py +++ b/deepmd/loggers/__init__.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: LGPL-3.0-or-later -"""Module taking care of logging duties.""" +"""Alias of deepmd_utils.loggers for backward compatibility.""" -from .loggers import ( +from deepmd_utils.loggers.loggers import ( set_log_handles, ) diff --git a/deepmd/loggers/loggers.py b/deepmd/loggers/loggers.py index 015581f6bd..74ca7de63e 100644 --- a/deepmd/loggers/loggers.py +++ b/deepmd/loggers/loggers.py @@ -1,277 +1,7 @@ # SPDX-License-Identifier: LGPL-3.0-or-later -"""Logger initialization for package.""" - -import logging -import os -from typing import ( - TYPE_CHECKING, - Optional, +"""Alias of deepmd_utils.loggers.loggers for backward compatibility.""" +from deepmd_utils.loggers.loggers import ( + set_log_handles, ) -if TYPE_CHECKING: - from pathlib import ( - Path, - ) - - from mpi4py import ( - MPI, - ) - - _MPI_APPEND_MODE = MPI.MODE_CREATE | MPI.MODE_APPEND - -logging.getLogger(__name__) - __all__ = ["set_log_handles"] - -# logger formater -FFORMATTER = logging.Formatter( - "[%(asctime)s] %(app_name)s %(levelname)-7s %(name)-45s %(message)s" -) -CFORMATTER = logging.Formatter( - # "%(app_name)s %(levelname)-7s |-> %(name)-45s %(message)s" - "%(app_name)s 
%(levelname)-7s %(message)s" -) -FFORMATTER_MPI = logging.Formatter( - "[%(asctime)s] %(app_name)s rank:%(rank)-2s %(levelname)-7s %(name)-45s %(message)s" -) -CFORMATTER_MPI = logging.Formatter( - # "%(app_name)s rank:%(rank)-2s %(levelname)-7s |-> %(name)-45s %(message)s" - "%(app_name)s rank:%(rank)-2s %(levelname)-7s %(message)s" -) - - -class _AppFilter(logging.Filter): - """Add field `app_name` to log messages.""" - - def filter(self, record): - record.app_name = "DEEPMD" - return True - - -class _MPIRankFilter(logging.Filter): - """Add MPI rank number to log messages, adds field `rank`.""" - - def __init__(self, rank: int) -> None: - super().__init__(name="MPI_rank_id") - self.mpi_rank = str(rank) - - def filter(self, record): - record.rank = self.mpi_rank - return True - - -class _MPIMasterFilter(logging.Filter): - """Filter that lets through only messages emited from rank==0.""" - - def __init__(self, rank: int) -> None: - super().__init__(name="MPI_master_log") - self.mpi_rank = rank - - def filter(self, record): - if self.mpi_rank == 0: - return True - else: - return False - - -class _MPIFileStream: - """Wrap MPI.File` so it has the same API as python file streams. - - Parameters - ---------- - filename : Path - disk location of the file stream - MPI : MPI - MPI communicator object - mode : str, optional - file write mode, by default _MPI_APPEND_MODE - """ - - def __init__( - self, filename: "Path", MPI: "MPI", mode: str = "_MPI_APPEND_MODE" - ) -> None: - self.stream = MPI.File.Open(MPI.COMM_WORLD, filename, mode) - self.stream.Set_atomicity(True) - self.name = "MPIfilestream" - - def write(self, msg: str): - """Write to MPI shared file stream. - - Parameters - ---------- - msg : str - message to write - """ - b = bytearray() - b.extend(map(ord, msg)) - self.stream.Write_shared(b) - - def close(self): - """Synchronize and close MPI file stream.""" - self.stream.Sync() - self.stream.Close() - - -class _MPIHandler(logging.FileHandler): - """Emulate `logging.FileHandler` with MPI shared File that all ranks can write to. - - Parameters - ---------- - filename : Path - file path - MPI : MPI - MPI communicator object - mode : str, optional - file access mode, by default "_MPI_APPEND_MODE" - """ - - def __init__( - self, - filename: "Path", - MPI: "MPI", - mode: str = "_MPI_APPEND_MODE", - ) -> None: - self.MPI = MPI - super().__init__(filename, mode=mode, encoding=None, delay=False) - - def _open(self): - return _MPIFileStream(self.baseFilename, self.MPI, self.mode) - - def setStream(self, stream): - """Stream canot be reasigned in MPI mode.""" - raise NotImplementedError("Unable to do for MPI file handler!") - - -def set_log_handles( - level: int, log_path: Optional["Path"] = None, mpi_log: Optional[str] = None -): - """Set desired level for package loggers and add file handlers. - - Parameters - ---------- - level : int - logging level - log_path : Optional[str] - path to log file, if None logs will be send only to console. If the parent - directory does not exist it will be automatically created, by default None - mpi_log : Optional[str], optional - mpi log type. Has three options. `master` will output logs to file and console - only from rank==0. `collect` will write messages from all ranks to one file - opened under rank==0 and to console. `workers` will open one log file for each - worker designated by its rank, console behaviour is the same as for `collect`. - If this argument is specified, package 'mpi4py' must be already installed. 
- by default None - - Raises - ------ - RuntimeError - If the argument `mpi_log` is specified, package `mpi4py` is not installed. - - References - ---------- - https://groups.google.com/g/mpi4py/c/SaNzc8bdj6U - https://stackoverflow.com/questions/35869137/avoid-tensorflow-print-on-standard-error - https://stackoverflow.com/questions/56085015/suppress-openmp-debug-messages-when-running-tensorflow-on-cpu - - Notes - ----- - Logging levels: - - +---------+--------------+----------------+----------------+----------------+ - | | our notation | python logging | tensorflow cpp | OpenMP | - +=========+==============+================+================+================+ - | debug | 10 | 10 | 0 | 1/on/true/yes | - +---------+--------------+----------------+----------------+----------------+ - | info | 20 | 20 | 1 | 0/off/false/no | - +---------+--------------+----------------+----------------+----------------+ - | warning | 30 | 30 | 2 | 0/off/false/no | - +---------+--------------+----------------+----------------+----------------+ - | error | 40 | 40 | 3 | 0/off/false/no | - +---------+--------------+----------------+----------------+----------------+ - - """ - # silence logging for OpenMP when running on CPU if level is any other than debug - if level <= 10: - os.environ["KMP_WARNINGS"] = "FALSE" - - # set TF cpp internal logging level - os.environ["TF_CPP_MIN_LOG_LEVEL"] = str(int((level / 10) - 1)) - - # get root logger - root_log = logging.getLogger("deepmd") - root_log.propagate = False - - root_log.setLevel(level) - - # check if arguments are present - MPI = None - if mpi_log: - try: - from mpi4py import ( - MPI, - ) - except ImportError as e: - raise RuntimeError( - "You cannot specify 'mpi_log' when mpi4py not installed" - ) from e - - # * add console handler ************************************************************ - ch = logging.StreamHandler() - if MPI: - rank = MPI.COMM_WORLD.Get_rank() - if mpi_log == "master": - ch.setFormatter(CFORMATTER) - ch.addFilter(_MPIMasterFilter(rank)) - else: - ch.setFormatter(CFORMATTER_MPI) - ch.addFilter(_MPIRankFilter(rank)) - else: - ch.setFormatter(CFORMATTER) - - ch.setLevel(level) - ch.addFilter(_AppFilter()) - # clean old handlers before adding new one - root_log.handlers.clear() - root_log.addHandler(ch) - - # * add file handler *************************************************************** - if log_path: - # create directory - log_path.parent.mkdir(exist_ok=True, parents=True) - - fh = None - - if mpi_log == "master": - rank = MPI.COMM_WORLD.Get_rank() - if rank == 0: - fh = logging.FileHandler(log_path, mode="w") - fh.addFilter(_MPIMasterFilter(rank)) - fh.setFormatter(FFORMATTER) - elif mpi_log == "collect": - rank = MPI.COMM_WORLD.Get_rank() - fh = _MPIHandler(log_path, MPI, mode=MPI.MODE_WRONLY | MPI.MODE_CREATE) - fh.addFilter(_MPIRankFilter(rank)) - fh.setFormatter(FFORMATTER_MPI) - elif mpi_log == "workers": - rank = MPI.COMM_WORLD.Get_rank() - # if file has suffix than inser rank number before suffix - # e.g deepmd.log -> deepmd_.log - # if no suffix is present, insert rank as suffix - # e.g. deepmdlog -> deepmdlog. 
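The set_log_handles implementation removed here now lives in deepmd_utils.loggers and is still re-exported from deepmd.loggers, so existing imports keep working. A minimal usage sketch (the log file name is a placeholder; the mpi_log modes additionally require mpi4py):

import logging
from pathlib import Path

from deepmd.loggers import set_log_handles

# console + file logging at DEBUG level; pass mpi_log="master"/"collect"/"workers"
# only when running under MPI with mpi4py installed
set_log_handles(logging.DEBUG, Path("train.log"), mpi_log=None)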
- if log_path.suffix: - worker_log = (log_path.parent / f"{log_path.stem}_{rank}").with_suffix( - log_path.suffix - ) - else: - worker_log = log_path.with_suffix(f".{rank}") - - fh = logging.FileHandler(worker_log, mode="w") - fh.setFormatter(FFORMATTER) - else: - fh = logging.FileHandler(log_path, mode="w") - fh.setFormatter(FFORMATTER) - - if fh: - fh.setLevel(level) - fh.addFilter(_AppFilter()) - root_log.addHandler(fh) diff --git a/deepmd/loss/dos.py b/deepmd/loss/dos.py index fa30552486..7d38f2b17a 100644 --- a/deepmd/loss/dos.py +++ b/deepmd/loss/dos.py @@ -143,16 +143,20 @@ def build(self, learning_rate, natoms, model_dict, label_dict, suffix): more_loss = {} if self.has_dos: l2_loss += atom_norm_ener * (pref_dos * l2_dos_loss) - more_loss["l2_dos_loss"] = l2_dos_loss + more_loss["l2_dos_loss"] = self.display_if_exist(l2_dos_loss, find_dos) if self.has_cdf: l2_loss += atom_norm_ener * (pref_cdf * l2_cdf_loss) - more_loss["l2_cdf_loss"] = l2_cdf_loss + more_loss["l2_cdf_loss"] = self.display_if_exist(l2_cdf_loss, find_dos) if self.has_ados: l2_loss += global_cvt_2_ener_float(pref_ados * l2_atom_dos_loss) - more_loss["l2_atom_dos_loss"] = l2_atom_dos_loss + more_loss["l2_atom_dos_loss"] = self.display_if_exist( + l2_atom_dos_loss, find_atom_dos + ) if self.has_acdf: l2_loss += global_cvt_2_ener_float(pref_acdf * l2_atom_cdf_loss) - more_loss["l2_atom_cdf_loss"] = l2_atom_cdf_loss + more_loss["l2_atom_cdf_loss"] = self.display_if_exist( + l2_atom_cdf_loss, find_atom_dos + ) # only used when tensorboard was set as true self.l2_loss_summary = tf.summary.scalar("l2_loss_" + suffix, tf.sqrt(l2_loss)) diff --git a/deepmd/loss/ener.py b/deepmd/loss/ener.py index 95997bad10..d7f83f09e5 100644 --- a/deepmd/loss/ener.py +++ b/deepmd/loss/ener.py @@ -291,22 +291,32 @@ def build(self, learning_rate, natoms, model_dict, label_dict, suffix): more_loss = {} if self.has_e: l2_loss += atom_norm_ener * (pref_e * l2_ener_loss) - more_loss["l2_ener_loss"] = l2_ener_loss + more_loss["l2_ener_loss"] = self.display_if_exist(l2_ener_loss, find_energy) if self.has_f: l2_loss += global_cvt_2_ener_float(pref_f * l2_force_loss) - more_loss["l2_force_loss"] = l2_force_loss + more_loss["l2_force_loss"] = self.display_if_exist( + l2_force_loss, find_force + ) if self.has_v: l2_loss += global_cvt_2_ener_float(atom_norm * (pref_v * l2_virial_loss)) - more_loss["l2_virial_loss"] = l2_virial_loss + more_loss["l2_virial_loss"] = self.display_if_exist( + l2_virial_loss, find_virial + ) if self.has_ae: l2_loss += global_cvt_2_ener_float(pref_ae * l2_atom_ener_loss) - more_loss["l2_atom_ener_loss"] = l2_atom_ener_loss + more_loss["l2_atom_ener_loss"] = self.display_if_exist( + l2_atom_ener_loss, find_atom_ener + ) if self.has_pf: l2_loss += global_cvt_2_ener_float(pref_pf * l2_pref_force_loss) - more_loss["l2_pref_force_loss"] = l2_pref_force_loss + more_loss["l2_pref_force_loss"] = self.display_if_exist( + l2_pref_force_loss, find_atom_pref + ) if self.has_gf: l2_loss += global_cvt_2_ener_float(pref_gf * l2_gen_force_loss) - more_loss["l2_gen_force_loss"] = l2_gen_force_loss + more_loss["l2_gen_force_loss"] = self.display_if_exist( + l2_gen_force_loss, find_drdq + ) # only used when tensorboard was set as true self.l2_loss_summary = tf.summary.scalar("l2_loss_" + suffix, tf.sqrt(l2_loss)) @@ -553,19 +563,25 @@ def build(self, learning_rate, natoms, model_dict, label_dict, suffix): more_loss = {} if self.has_e: l2_loss += atom_norm_ener * (pref_e * l2_ener_loss) - more_loss["l2_ener_loss"] = l2_ener_loss + 
more_loss["l2_ener_loss"] = self.display_if_exist(l2_ener_loss, find_energy) if self.has_fr: l2_loss += global_cvt_2_ener_float(pref_fr * l2_force_r_loss) - more_loss["l2_force_r_loss"] = l2_force_r_loss + more_loss["l2_force_r_loss"] = self.display_if_exist( + l2_force_r_loss, find_force + ) if self.has_fm: l2_loss += global_cvt_2_ener_float(pref_fm * l2_force_m_loss) - more_loss["l2_force_m_loss"] = l2_force_m_loss + more_loss["l2_force_m_loss"] = self.display_if_exist( + l2_force_m_loss, find_force + ) if self.has_v: l2_loss += global_cvt_2_ener_float(atom_norm * (pref_v * l2_virial_loss)) - more_loss["l2_virial_loss"] = l2_virial_loss + more_loss["l2_virial_loss"] = self.display_if_exist(l2_virial_loss, find_virial) if self.has_ae: l2_loss += global_cvt_2_ener_float(pref_ae * l2_atom_ener_loss) - more_loss["l2_atom_ener_loss"] = l2_atom_ener_loss + more_loss["l2_atom_ener_loss"] = self.display_if_exist( + l2_atom_ener_loss, find_atom_ener + ) # only used when tensorboard was set as true self.l2_loss_summary = tf.summary.scalar("l2_loss", tf.sqrt(l2_loss)) @@ -785,8 +801,10 @@ def build(self, learning_rate, natoms, model_dict, label_dict, suffix): more_loss = {} l2_loss += atom_norm_ener * (pref_e * l2_ener_loss) l2_loss += global_cvt_2_ener_float(pref_ed * l2_ener_dipole_loss) - more_loss["l2_ener_loss"] = l2_ener_loss - more_loss["l2_ener_dipole_loss"] = l2_ener_dipole_loss + more_loss["l2_ener_loss"] = self.display_if_exist(l2_ener_loss, find_energy) + more_loss["l2_ener_dipole_loss"] = self.display_if_exist( + l2_ener_dipole_loss, find_ener_dipole + ) self.l2_loss_summary = tf.summary.scalar("l2_loss_" + suffix, tf.sqrt(l2_loss)) self.l2_loss_ener_summary = tf.summary.scalar( diff --git a/deepmd/loss/loss.py b/deepmd/loss/loss.py index 9324077691..a719a08d81 100644 --- a/deepmd/loss/loss.py +++ b/deepmd/loss/loss.py @@ -8,6 +8,8 @@ Tuple, ) +import numpy as np + from deepmd.env import ( tf, ) @@ -72,3 +74,20 @@ def eval( A dictionary that maps keys to values. It should contain key `natoms` """ + + @staticmethod + def display_if_exist(loss: tf.Tensor, find_property: float) -> tf.Tensor: + """Display NaN if labeled property is not found. 
+ + Parameters + ---------- + loss : tf.Tensor + the loss tensor + find_property : float + whether the property is found + """ + return tf.cond( + tf.cast(find_property, tf.bool), + lambda: loss, + lambda: tf.cast(np.nan, dtype=loss.dtype), + ) diff --git a/deepmd/loss/tensor.py b/deepmd/loss/tensor.py index 74eb2b74dc..a40f95a18e 100644 --- a/deepmd/loss/tensor.py +++ b/deepmd/loss/tensor.py @@ -87,7 +87,7 @@ def build(self, learning_rate, natoms, model_dict, label_dict, suffix): local_loss = global_cvt_2_tf_float(find_atomic) * tf.reduce_mean( tf.square(self.scale * (polar - atomic_polar_hat)), name="l2_" + suffix ) - more_loss["local_loss"] = local_loss + more_loss["local_loss"] = self.display_if_exist(local_loss, find_atomic) l2_loss += self.local_weight * local_loss self.l2_loss_local_summary = tf.summary.scalar( "l2_local_loss_" + suffix, tf.sqrt(more_loss["local_loss"]) @@ -118,7 +118,7 @@ def build(self, learning_rate, natoms, model_dict, label_dict, suffix): tf.square(self.scale * (global_polar - polar_hat)), name="l2_" + suffix ) - more_loss["global_loss"] = global_loss + more_loss["global_loss"] = self.display_if_exist(global_loss, find_global) self.l2_loss_global_summary = tf.summary.scalar( "l2_global_loss_" + suffix, tf.sqrt(more_loss["global_loss"]) / global_cvt_2_tf_float(atoms), diff --git a/deepmd/model/dos.py b/deepmd/model/dos.py index 697fad9a9e..22e291a0f0 100644 --- a/deepmd/model/dos.py +++ b/deepmd/model/dos.py @@ -155,10 +155,12 @@ def build( # type embedding if any if self.typeebd is not None: - type_embedding = self.typeebd.build( + type_embedding = self.build_type_embedding( self.ntypes, reuse=reuse, suffix=suffix, + frz_model=frz_model, + ckpt_meta=ckpt_meta, ) input_dict["type_embedding"] = type_embedding input_dict["atype"] = atype_ diff --git a/deepmd/model/ener.py b/deepmd/model/ener.py index 1976c1ad51..0d8d66b305 100644 --- a/deepmd/model/ener.py +++ b/deepmd/model/ener.py @@ -203,10 +203,12 @@ def build( # type embedding if any if self.typeebd is not None and "type_embedding" not in input_dict: - type_embedding = self.typeebd.build( + type_embedding = self.build_type_embedding( self.ntypes, reuse=reuse, suffix=suffix, + ckpt_meta=ckpt_meta, + frz_model=frz_model, ) input_dict["type_embedding"] = type_embedding # spin if any diff --git a/deepmd/model/model.py b/deepmd/model/model.py index 3f24e42aec..6117b4942d 100644 --- a/deepmd/model/model.py +++ b/deepmd/model/model.py @@ -97,6 +97,9 @@ def get_class_by_input(cls, input: dict): from deepmd.model.multi import ( MultiModel, ) + from deepmd.model.pairtab import ( + PairTabModel, + ) from deepmd.model.pairwise_dprc import ( PairwiseDPRc, ) @@ -112,6 +115,8 @@ def get_class_by_input(cls, input: dict): return FrozenModel elif model_type == "linear_ener": return LinearEnergyModel + elif model_type == "pairtab": + return PairTabModel else: raise ValueError(f"unknown model type: {model_type}") @@ -331,6 +336,60 @@ def build_descrpt( self.descrpt.pass_tensors_from_frz_model(*imported_tensors[:-1]) return dout + def build_type_embedding( + self, + ntypes: int, + frz_model: Optional[str] = None, + ckpt_meta: Optional[str] = None, + suffix: str = "", + reuse: Optional[Union[bool, Enum]] = None, + ) -> tf.Tensor: + """Build the type embedding part of the model. 
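A self-contained sketch of the display_if_exist pattern introduced above: when the find_* flag taken from the data dict is zero (no reference data for that property), the displayed loss becomes NaN instead of a stale number. It uses tensorflow.compat.v1 directly, purely for illustration:

import numpy as np
import tensorflow.compat.v1 as tf

tf.disable_eager_execution()

def display_if_exist(loss, find_property):
    # same logic as Loss.display_if_exist above
    return tf.cond(
        tf.cast(find_property, tf.bool),
        lambda: loss,
        lambda: tf.cast(np.nan, dtype=loss.dtype),
    )

loss = tf.constant(0.25, dtype=tf.float64)
with tf.Session() as sess:
    print(sess.run([display_if_exist(loss, 1.0), display_if_exist(loss, 0.0)]))
    # -> [0.25, nan]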
+ + Parameters + ---------- + ntypes : int + The number of types + frz_model : str, optional + The path to the frozen model + ckpt_meta : str, optional + The path prefix of the checkpoint and meta files + suffix : str, optional + The suffix of the scope + reuse : bool or tf.AUTO_REUSE, optional + Whether to reuse the variables + + Returns + ------- + tf.Tensor + The type embedding tensor + """ + assert self.typeebd is not None + if frz_model is None and ckpt_meta is None: + dout = self.typeebd.build( + ntypes, + reuse=reuse, + suffix=suffix, + ) + else: + # nothing input + feed_dict = {} + return_elements = [ + f"t_typeebd{suffix}:0", + ] + if frz_model is not None: + imported_tensors = self._import_graph_def_from_frz_model( + frz_model, feed_dict, return_elements + ) + elif ckpt_meta is not None: + imported_tensors = self._import_graph_def_from_ckpt_meta( + ckpt_meta, feed_dict, return_elements + ) + else: + raise RuntimeError("should not reach here") # pragma: no cover + dout = imported_tensors[-1] + return dout + def _import_graph_def_from_frz_model( self, frz_model: str, feed_dict: dict, return_elements: List[str] ): diff --git a/deepmd/model/model_stat.py b/deepmd/model/model_stat.py index d2cc918b64..933a634ce8 100644 --- a/deepmd/model/model_stat.py +++ b/deepmd/model/model_stat.py @@ -1,68 +1,13 @@ # SPDX-License-Identifier: LGPL-3.0-or-later -from collections import ( - defaultdict, +"""Alias for backward compatibility.""" +from deepmd_utils.utils.model_stat import ( + _make_all_stat_ref, + make_stat_input, + merge_sys_stat, ) -import numpy as np - - -def _make_all_stat_ref(data, nbatches): - all_stat = defaultdict(list) - for ii in range(data.get_nsystems()): - for jj in range(nbatches): - stat_data = data.get_batch(sys_idx=ii) - for dd in stat_data: - if dd == "natoms_vec": - stat_data[dd] = stat_data[dd].astype(np.int32) - all_stat[dd].append(stat_data[dd]) - return all_stat - - -def make_stat_input(data, nbatches, merge_sys=True): - """Pack data for statistics. - - Parameters - ---------- - data - The data - nbatches : int - The number of batches - merge_sys : bool (True) - Merge system data - - Returns - ------- - all_stat: - A dictionary of list of list storing data for stat. 
- if merge_sys == False data can be accessed by - all_stat[key][sys_idx][batch_idx][frame_idx] - else merge_sys == True can be accessed by - all_stat[key][batch_idx][frame_idx] - """ - all_stat = defaultdict(list) - for ii in range(data.get_nsystems()): - sys_stat = defaultdict(list) - for jj in range(nbatches): - stat_data = data.get_batch(sys_idx=ii) - for dd in stat_data: - if dd == "natoms_vec": - stat_data[dd] = stat_data[dd].astype(np.int32) - sys_stat[dd].append(stat_data[dd]) - for dd in sys_stat: - if merge_sys: - for bb in sys_stat[dd]: - all_stat[dd].append(bb) - else: - all_stat[dd].append(sys_stat[dd]) - return all_stat - - -def merge_sys_stat(all_stat): - first_key = next(iter(all_stat.keys())) - nsys = len(all_stat[first_key]) - ret = defaultdict(list) - for ii in range(nsys): - for dd in all_stat: - for bb in all_stat[dd][ii]: - ret[dd].append(bb) - return ret +__all__ = [ + "make_stat_input", + "merge_sys_stat", + "_make_all_stat_ref", # used by tests +] diff --git a/deepmd/model/multi.py b/deepmd/model/multi.py index bfc67b9792..83b231c0e8 100644 --- a/deepmd/model/multi.py +++ b/deepmd/model/multi.py @@ -317,10 +317,12 @@ def build( # type embedding if any if self.typeebd is not None: - type_embedding = self.typeebd.build( + type_embedding = self.build_type_embedding( self.ntypes, reuse=reuse, suffix=suffix, + frz_model=frz_model, + ckpt_meta=ckpt_meta, ) input_dict["type_embedding"] = type_embedding input_dict["atype"] = atype_ diff --git a/deepmd/model/pairtab.py b/deepmd/model/pairtab.py new file mode 100644 index 0000000000..38934818e6 --- /dev/null +++ b/deepmd/model/pairtab.py @@ -0,0 +1,288 @@ +# SPDX-License-Identifier: LGPL-3.0-or-later +from enum import ( + Enum, +) +from typing import ( + List, + Optional, + Union, +) + +import numpy as np + +from deepmd.env import ( + GLOBAL_TF_FLOAT_PRECISION, + MODEL_VERSION, + global_cvt_2_ener_float, + op_module, + tf, +) +from deepmd.fit.fitting import ( + Fitting, +) +from deepmd.loss.loss import ( + Loss, +) +from deepmd.model.model import ( + Model, +) +from deepmd.utils.pair_tab import ( + PairTab, +) + + +class PairTabModel(Model): + """Pairwise tabulation energy model. + + This model can be used to tabulate the pairwise energy between atoms for either + short-range or long-range interactions, such as D3, LJ, ZBL, etc. It should not + be used alone, but rather as one submodel of a linear (sum) model, such as + DP+D3. + + Do not put the model on the first model of a linear model, since the linear + model fetches the type map from the first model. + + At this moment, the model does not smooth the energy at the cutoff radius, so + one needs to make sure the energy has been smoothed to zero. + + Parameters + ---------- + tab_file : str + The path to the tabulation file. 
+ rcut : float + The cutoff radius + sel : int or list[int] + The maxmum number of atoms in the cut-off radius + """ + + model_type = "ener" + + def __init__( + self, tab_file: str, rcut: float, sel: Union[int, List[int]], **kwargs + ): + super().__init__() + self.tab_file = tab_file + self.tab = PairTab(self.tab_file) + self.ntypes = self.tab.ntypes + self.rcut = rcut + if isinstance(sel, int): + self.sel = sel + elif isinstance(sel, list): + self.sel = sum(sel) + else: + raise TypeError("sel must be int or list[int]") + + def build( + self, + coord_: tf.Tensor, + atype_: tf.Tensor, + natoms: tf.Tensor, + box: tf.Tensor, + mesh: tf.Tensor, + input_dict: dict, + frz_model: Optional[str] = None, + ckpt_meta: Optional[str] = None, + suffix: str = "", + reuse: Optional[Union[bool, Enum]] = None, + ): + """Build the model. + + Parameters + ---------- + coord_ : tf.Tensor + The coordinates of atoms + atype_ : tf.Tensor + The atom types of atoms + natoms : tf.Tensor + The number of atoms + box : tf.Tensor + The box vectors + mesh : tf.Tensor + The mesh vectors + input_dict : dict + The input dict + frz_model : str, optional + The path to the frozen model + ckpt_meta : str, optional + The path prefix of the checkpoint and meta files + suffix : str, optional + The suffix of the scope + reuse : bool or tf.AUTO_REUSE, optional + Whether to reuse the variables + + Returns + ------- + dict + The output dict + """ + tab_info, tab_data = self.tab.get() + with tf.variable_scope("model_attr" + suffix, reuse=reuse): + self.tab_info = tf.get_variable( + "t_tab_info", + tab_info.shape, + dtype=tf.float64, + trainable=False, + initializer=tf.constant_initializer(tab_info, dtype=tf.float64), + ) + self.tab_data = tf.get_variable( + "t_tab_data", + tab_data.shape, + dtype=tf.float64, + trainable=False, + initializer=tf.constant_initializer(tab_data, dtype=tf.float64), + ) + t_tmap = tf.constant(" ".join(self.type_map), name="tmap", dtype=tf.string) + t_mt = tf.constant(self.model_type, name="model_type", dtype=tf.string) + t_ver = tf.constant(MODEL_VERSION, name="model_version", dtype=tf.string) + + with tf.variable_scope("fitting_attr" + suffix, reuse=reuse): + t_dfparam = tf.constant(0, name="dfparam", dtype=tf.int32) + t_daparam = tf.constant(0, name="daparam", dtype=tf.int32) + with tf.variable_scope("descrpt_attr" + suffix, reuse=reuse): + t_ntypes = tf.constant(self.ntypes, name="ntypes", dtype=tf.int32) + t_rcut = tf.constant( + self.rcut, name="rcut", dtype=GLOBAL_TF_FLOAT_PRECISION + ) + coord = tf.reshape(coord_, [-1, natoms[1] * 3]) + atype = tf.reshape(atype_, [-1, natoms[1]]) + box = tf.reshape(box, [-1, 9]) + # perhaps we need a OP that only outputs rij and nlist + ( + _, + _, + rij, + nlist, + _, + _, + ) = op_module.prod_env_mat_a_mix( + coord, + atype, + natoms, + box, + mesh, + np.zeros([self.ntypes, self.sel * 4]), + np.ones([self.ntypes, self.sel * 4]), + rcut_a=-1, + rcut_r=self.rcut, + rcut_r_smth=self.rcut, + sel_a=[self.sel], + sel_r=[0], + ) + scale = tf.ones([tf.shape(coord)[0], natoms[0]], dtype=tf.float64) + tab_atom_ener, tab_force, tab_atom_virial = op_module.pair_tab( + self.tab_info, + self.tab_data, + atype, + rij, + nlist, + natoms, + scale, + sel_a=[self.sel], + sel_r=[0], + ) + energy_raw = tf.reshape( + tab_atom_ener, [-1, natoms[0]], name="o_atom_energy" + suffix + ) + energy = tf.reduce_sum( + global_cvt_2_ener_float(energy_raw), axis=1, name="o_energy" + suffix + ) + force = tf.reshape(tab_force, [-1, 3 * natoms[1]], name="o_force" + suffix) + virial = tf.reshape( + 
tf.reduce_sum(tf.reshape(tab_atom_virial, [-1, natoms[1], 9]), axis=1), + [-1, 9], + name="o_virial" + suffix, + ) + atom_virial = tf.reshape( + tab_atom_virial, [-1, 9 * natoms[1]], name="o_atom_virial" + suffix + ) + model_dict = {} + model_dict["energy"] = energy + model_dict["force"] = force + model_dict["virial"] = virial + model_dict["atom_ener"] = energy_raw + model_dict["atom_virial"] = atom_virial + model_dict["coord"] = coord + model_dict["atype"] = atype + + return model_dict + + def init_variables( + self, + graph: tf.Graph, + graph_def: tf.GraphDef, + model_type: str = "original_model", + suffix: str = "", + ) -> None: + """Init the embedding net variables with the given frozen model. + + Parameters + ---------- + graph : tf.Graph + The input frozen model graph + graph_def : tf.GraphDef + The input frozen model graph_def + model_type : str + the type of the model + suffix : str + suffix to name scope + """ + # skip. table can be initialized from the file + + def get_fitting(self) -> Union[Fitting, dict]: + """Get the fitting(s).""" + # nothing needs to do + return {} + + def get_loss(self, loss: dict, lr) -> Optional[Union[Loss, dict]]: + """Get the loss function(s).""" + # nothing nees to do + return + + def get_rcut(self) -> float: + """Get cutoff radius of the model.""" + return self.rcut + + def get_ntypes(self) -> int: + """Get the number of types.""" + return self.ntypes + + def data_stat(self, data: dict): + """Data staticis.""" + # nothing needs to do + + def enable_compression(self, suffix: str = "") -> None: + """Enable compression. + + Parameters + ---------- + suffix : str + suffix to name scope + """ + # nothing needs to do + + @classmethod + def update_sel(cls, global_jdata: dict, local_jdata: dict) -> dict: + """Update the selection and perform neighbor statistics. + + Notes + ----- + Do not modify the input data without copying it. 
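A hedged construction sketch for the new PairTabModel; the table file name and sel values are placeholders, and the file must already exist in the format PairTab expects. As the class docstring notes, the model is intended as one sub-model of a linear (sum) model such as DP+D3, and should not be the first entry, since the linear model takes its type map from the first sub-model:

from deepmd.model.pairtab import PairTabModel

# "zbl_tab.txt" is a placeholder path to an existing pair-tabulation file
tab_model = PairTabModel(tab_file="zbl_tab.txt", rcut=6.0, sel=[60, 60])
print(tab_model.get_rcut())    # 6.0
print(tab_model.get_ntypes())  # number of types read from the table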
+ + Parameters + ---------- + global_jdata : dict + The global data, containing the training section + local_jdata : dict + The local data refer to the current class + + Returns + ------- + dict + The updated local data + """ + from deepmd.entrypoints.train import ( + update_one_sel, + ) + + local_jdata_cpy = local_jdata.copy() + return update_one_sel(global_jdata, local_jdata_cpy, True) diff --git a/deepmd/model/pairwise_dprc.py b/deepmd/model/pairwise_dprc.py index 6983a31cfd..f74571febb 100644 --- a/deepmd/model/pairwise_dprc.py +++ b/deepmd/model/pairwise_dprc.py @@ -173,10 +173,12 @@ def build( atype_qmmm = gather_placeholder(atype_qmmm, forward_qmmm_map, placeholder=-1) box_qm = box - type_embedding = self.typeebd.build( + type_embedding = self.build_type_embedding( self.ntypes, reuse=reuse, suffix=suffix, + frz_model=frz_model, + ckpt_meta=ckpt_meta, ) input_dict_qm["type_embedding"] = type_embedding input_dict_qmmm["type_embedding"] = type_embedding diff --git a/deepmd/model/tensor.py b/deepmd/model/tensor.py index 9099b753a4..6a21e085f3 100644 --- a/deepmd/model/tensor.py +++ b/deepmd/model/tensor.py @@ -135,10 +135,12 @@ def build( # type embedding if any if self.typeebd is not None: - type_embedding = self.typeebd.build( + type_embedding = self.build_type_embedding( self.ntypes, reuse=reuse, suffix=suffix, + ckpt_meta=ckpt_meta, + frz_model=frz_model, ) input_dict["type_embedding"] = type_embedding input_dict["atype"] = atype_ diff --git a/deepmd/nvnmd/data/data.py b/deepmd/nvnmd/data/data.py index 29c8b84a37..9e6dd4cc89 100644 --- a/deepmd/nvnmd/data/data.py +++ b/deepmd/nvnmd/data/data.py @@ -60,6 +60,7 @@ }, "ctrl": { # NSTDM + "MAX_NNEI": 128, "NSTDM": 64, "NSTDM_M1": 32, "NSTDM_M2": 2, @@ -67,6 +68,7 @@ "NSEL": "NSTDM*NTYPE_MAX", "NSADV": "NSTDM+1", "VERSION": 0, + "SUB_VERSION": 1, }, "nbit": { # general @@ -116,6 +118,22 @@ "end": "", } +# change the configuration accordng to the max_nnei +jdata_config_v0_ni128 = jdata_config_v0.copy() +jdata_config_v0_ni256 = jdata_config_v0.copy() +jdata_config_v0_ni256["ctrl"] = { + "MAX_NNEI": 256, + "NSTDM": 128, + "NSTDM_M1": 32, + "NSTDM_M2": 4, + "NSTDM_M1X": 8, + "NSEL": "NSTDM*NTYPE_MAX", + "NSADV": "NSTDM+1", + "VERSION": 0, + "SUB_VERSION": 1, +} +jdata_config_v0_ni256["nbit"]["NBIT_NEIB"] = 9 + jdata_config_v1 = { "dscp": { # basic config from deepmd model @@ -174,6 +192,7 @@ }, "ctrl": { # NSTDM + "MAX_NNEI": 128, "NSTDM": 64, "NSTDM_M1": 32, "NSTDM_M2": 2, @@ -181,6 +200,7 @@ "NSEL": "NSTDM", "NSADV": "NSTDM+1", "VERSION": 1, + "SUB_VERSION": 1, }, "nbit": { # general @@ -230,6 +250,22 @@ "end": "", } +# change the configuration accordng to the max_nnei +jdata_config_v1_ni128 = jdata_config_v1.copy() +jdata_config_v1_ni256 = jdata_config_v1.copy() +jdata_config_v1_ni256["ctrl"] = { + "MAX_NNEI": 256, + "NSTDM": 128, + "NSTDM_M1": 32, + "NSTDM_M2": 4, + "NSTDM_M1X": 8, + "NSEL": "NSTDM", + "NSADV": "NSTDM+1", + "VERSION": 1, + "SUB_VERSION": 1, +} +jdata_config_v1_ni256["nbit"]["NBIT_NEIB"] = 9 + jdata_deepmd_input_v0 = { "model": { "descriptor": { @@ -247,6 +283,7 @@ }, "nvnmd": { "version": 0, + "max_nnei": 128, # 128 or 256 "net_size": 128, "config_file": "none", "weight_file": "none", @@ -286,6 +323,10 @@ }, } +jdata_deepmd_input_v0_ni128 = jdata_deepmd_input_v0.copy() +jdata_deepmd_input_v0_ni256 = jdata_deepmd_input_v0.copy() +jdata_deepmd_input_v0_ni256["nvnmd"]["max_nnei"] = 256 + jdata_deepmd_input_v1 = { "model": { "descriptor": { @@ -308,6 +349,7 @@ }, "nvnmd": { "version": 1, + "max_nnei": 128, # 128 or 256 
"net_size": 128, "config_file": "none", "weight_file": "none", @@ -347,6 +389,10 @@ }, } +jdata_deepmd_input_v1_ni128 = jdata_deepmd_input_v1.copy() +jdata_deepmd_input_v1_ni256 = jdata_deepmd_input_v1.copy() +jdata_deepmd_input_v1_ni256["nvnmd"]["max_nnei"] = 256 + NVNMD_WELCOME = ( r" _ _ __ __ _ _ __ __ ____ ", r"| \ | | \ \ / / | \ | | | \/ | | _ \ ", diff --git a/deepmd/nvnmd/descriptor/se_a.py b/deepmd/nvnmd/descriptor/se_a.py index 67ea45924b..816f17cfa3 100644 --- a/deepmd/nvnmd/descriptor/se_a.py +++ b/deepmd/nvnmd/descriptor/se_a.py @@ -50,12 +50,17 @@ def check_switch_range(davg, dstd): else: min_dist = nvnmd_cfg.weight["train_attr.min_nbor_dist"] else: - min_dist = rmin + min_dist = None + + # fix the bug: if model initial mode is 'init_from_model', + # we need dmin to calculate smin and smax in mapt.py + if min_dist is not None: + nvnmd_cfg.dscp["dmin"] = min_dist + nvnmd_cfg.save() # if davg and dstd is None, the model initial mode is in # 'init_from_model', 'restart', 'init_from_frz_model', 'finetune' if (davg is not None) and (dstd is not None): - nvnmd_cfg.dscp["dmin"] = min_dist nvnmd_cfg.get_s_range(davg, dstd) diff --git a/deepmd/nvnmd/descriptor/se_atten.py b/deepmd/nvnmd/descriptor/se_atten.py index 727a93ca45..cfffb8a90b 100644 --- a/deepmd/nvnmd/descriptor/se_atten.py +++ b/deepmd/nvnmd/descriptor/se_atten.py @@ -49,7 +49,13 @@ def check_switch_range(davg, dstd): else: min_dist = nvnmd_cfg.weight["train_attr.min_nbor_dist"] else: - min_dist = rmin + min_dist = None + + # fix the bug: if model initial mode is 'init_from_model', + # we need dmin to calculate smin and smax in mapt.py + if min_dist is not None: + nvnmd_cfg.dscp["dmin"] = min_dist + nvnmd_cfg.save() # if davg and dstd is None, the model initial mode is in # 'init_from_model', 'restart', 'init_from_frz_model', 'finetune' @@ -58,7 +64,6 @@ def check_switch_range(davg, dstd): davg = np.zeros([ntype, ndescrpt]) if dstd is None: dstd = np.ones([ntype, ndescrpt]) - nvnmd_cfg.dscp["dmin"] = min_dist nvnmd_cfg.get_s_range(davg, dstd) diff --git a/deepmd/nvnmd/entrypoints/freeze.py b/deepmd/nvnmd/entrypoints/freeze.py index 6c356c6118..e56a0c2130 100644 --- a/deepmd/nvnmd/entrypoints/freeze.py +++ b/deepmd/nvnmd/entrypoints/freeze.py @@ -52,6 +52,7 @@ def filter_tensorVariableList(tensorVariableList) -> dict: p1 = p1 or name.startswith("filter_type_") p1 = p1 or name.startswith("layer_") p1 = p1 or name.startswith("final_layer") + p1 = p1 or name.endswith("t_bias_atom_e") p2 = "Adam" not in name p3 = "XXX" not in name if p1 and p2 and p3: @@ -75,4 +76,5 @@ def save_weight(sess, file_name: str = "nvnmd/weight.npy"): else: min_dist = 0.0 dic_key_value["train_attr.min_nbor_dist"] = min_dist + dic_key_value["t_bias_atom_e"] = dic_key_value["fitting_attr.t_bias_atom_e"] FioDic().save(file_name, dic_key_value) diff --git a/deepmd/nvnmd/entrypoints/mapt.py b/deepmd/nvnmd/entrypoints/mapt.py index eb77913983..1299d7a74e 100644 --- a/deepmd/nvnmd/entrypoints/mapt.py +++ b/deepmd/nvnmd/entrypoints/mapt.py @@ -87,9 +87,22 @@ def __init__(self, config_file: str, weight_file: str, map_file: str): jdata["weight_file"] = weight_file jdata["enable"] = True + # 0 : xyz_scatter = xyz_scatter * two_embd + xyz_scatter; + # Gs + 1, Gt + 0 + # 1 : xyz_scatter = xyz_scatter * two_embd + two_embd ; + # Gs + 0, Gt + 1 + self.Gs_Gt_mode = 1 + nvnmd_cfg.init_from_jdata(jdata) def build_map(self): + if self.Gs_Gt_mode == 0: + self.shift_Gs = 1 + self.shift_Gt = 0 + if self.Gs_Gt_mode == 1: + self.shift_Gs = 0 + self.shift_Gt = 1 + # M = 
nvnmd_cfg.dscp["M1"] if nvnmd_cfg.version == 0: ndim = nvnmd_cfg.dscp["ntype"] @@ -482,7 +495,7 @@ def build_s2g_grad(self): shift = 0 if nvnmd_cfg.version == 1: ndim = 1 - shift = 1 + shift = self.shift_Gs # dic_ph = {} dic_ph["s"] = tf.placeholder(tf.float64, [None, 1], "t_s") @@ -496,6 +509,13 @@ def run_s2g(self): r"""Build s-> graph and run it to get value of mapping table.""" smin = nvnmd_cfg.dscp["smin"] smax = nvnmd_cfg.dscp["smax"] + # fix the bug: if model initial mode is 'init_from_model', + # we need dmin to calculate smin and smax in mapt.py + if smin == -2: + davg, dstd = get_normalize(nvnmd_cfg.weight) + nvnmd_cfg.get_s_range(davg, dstd) + smin = nvnmd_cfg.dscp["smin"] + smax = nvnmd_cfg.dscp["smax"] tf.reset_default_graph() dic_ph = self.build_s2g_grad() @@ -567,9 +587,11 @@ def build_t2g(self): two_side_type_embedding, [-1, two_side_type_embedding.shape[-1]], ) - + # see se_atten.py in dp wbs = [get_filter_type_weight(nvnmd_cfg.weight, ll) for ll in range(1, 5)] - dic_ph["gt"] = self.build_embedding_net(two_side_type_embedding, wbs) + dic_ph["gt"] = ( + self.build_embedding_net(two_side_type_embedding, wbs) + self.shift_Gt + ) return dic_ph def run_t2g(self): diff --git a/deepmd/nvnmd/entrypoints/train.py b/deepmd/nvnmd/entrypoints/train.py index cb3dad0792..6e14b6f865 100644 --- a/deepmd/nvnmd/entrypoints/train.py +++ b/deepmd/nvnmd/entrypoints/train.py @@ -100,6 +100,7 @@ def normalized_input_qnn(jdata, PATH_QNN, CONFIG_CNN, WEIGHT_CNN, MAP_CNN): jdata_nvnmd = jdata_deepmd_input_v0["nvnmd"] jdata_nvnmd["enable"] = True jdata_nvnmd["version"] = nvnmd_cfg.version + jdata_nvnmd["max_nnei"] = nvnmd_cfg.max_nnei jdata_nvnmd["config_file"] = CONFIG_CNN jdata_nvnmd["weight_file"] = WEIGHT_CNN jdata_nvnmd["map_file"] = MAP_CNN @@ -117,6 +118,7 @@ def normalized_input_qnn(jdata, PATH_QNN, CONFIG_CNN, WEIGHT_CNN, MAP_CNN): def train_nvnmd( *, INPUT: str, + init_model: Optional[str], restart: Optional[str], step: str, skip_neighbor_stat: bool = False, @@ -142,6 +144,7 @@ def train_nvnmd( jdata = jdata_cmd_train.copy() jdata["INPUT"] = INPUT_CNN jdata["log_path"] = LOG_CNN + jdata["init_model"] = init_model jdata["restart"] = restart jdata["skip_neighbor_stat"] = skip_neighbor_stat train(**jdata) diff --git a/deepmd/nvnmd/entrypoints/wrap.py b/deepmd/nvnmd/entrypoints/wrap.py index 455dd999df..1ba2ed7384 100644 --- a/deepmd/nvnmd/entrypoints/wrap.py +++ b/deepmd/nvnmd/entrypoints/wrap.py @@ -156,33 +156,75 @@ def wrap_head(self, nhs, nws): r"""Wrap the head information. 
version + nhead nheight - nweight - rcut + nwidth + rcut cut-off radius + ntype number of atomic species + nnei number of neighbors + atom_ener atom bias energy """ nbit = nvnmd_cfg.nbit ctrl = nvnmd_cfg.ctrl + dscp = nvnmd_cfg.dscp + fitn = nvnmd_cfg.fitn + weight = nvnmd_cfg.weight VERSION = ctrl["VERSION"] + SUB_VERSION = ctrl["SUB_VERSION"] + MAX_NNEI = ctrl["MAX_NNEI"] + nhead = 128 NBIT_MODEL_HEAD = nbit["NBIT_MODEL_HEAD"] NBIT_FIXD_FL = nbit["NBIT_FIXD_FL"] - rcut = nvnmd_cfg.dscp["rcut"] + rcut = dscp["rcut"] + ntype = dscp["ntype"] + SEL = dscp["SEL"] bs = "" e = Encode() # version - bs = e.dec2bin(VERSION, NBIT_MODEL_HEAD)[0] + bs + vv = VERSION + 256 * SUB_VERSION + 256 * 256 * MAX_NNEI + bs = e.dec2bin(vv, NBIT_MODEL_HEAD)[0] + bs + # nhead + bs = e.dec2bin(nhead, NBIT_MODEL_HEAD)[0] + bs # height for n in nhs: bs = e.dec2bin(n, NBIT_MODEL_HEAD)[0] + bs - # weight + # width for n in nws: bs = e.dec2bin(n, NBIT_MODEL_HEAD)[0] + bs - # dscp + # rcut RCUT = e.qr(rcut, NBIT_FIXD_FL) bs = e.dec2bin(RCUT, NBIT_MODEL_HEAD)[0] + bs + # ntype + bs = e.dec2bin(ntype, NBIT_MODEL_HEAD)[0] + bs + # nnei + if VERSION == 0: + for tt in range(ntype): + bs = e.dec2bin(SEL[tt], NBIT_MODEL_HEAD)[0] + bs + if VERSION == 1: + bs = e.dec2bin(SEL, NBIT_MODEL_HEAD)[0] + bs + # atom_ener + # fix the bug: the different energy between qnn and lammps + if "t_bias_atom_e" in weight.keys(): + atom_ener = weight["t_bias_atom_e"] + else: + atom_ener = [0] * 32 + nlayer_fit = fitn["nlayer_fit"] + if VERSION == 0: + for tt in range(ntype): + w, b, _idt = get_fitnet_weight(weight, tt, nlayer_fit - 1, nlayer_fit) + shift = atom_ener[tt] + b[0] + SHIFT = e.qr(shift, NBIT_FIXD_FL) + bs = e.dec2bin(SHIFT, NBIT_MODEL_HEAD, signed=True)[0] + bs + if VERSION == 1: + for tt in range(ntype): + w, b, _idt = get_fitnet_weight(weight, 0, nlayer_fit - 1, nlayer_fit) + shift = atom_ener[tt] + b[0] + SHIFT = e.qr(shift, NBIT_FIXD_FL) + bs = e.dec2bin(SHIFT, NBIT_MODEL_HEAD, signed=True)[0] + bs # extend hs = e.bin2hex(bs) - hs = e.extend_hex(hs, NBIT_MODEL_HEAD * 32) + hs = e.extend_hex(hs, NBIT_MODEL_HEAD * nhead) return hs def wrap_dscp(self): diff --git a/deepmd/nvnmd/utils/argcheck.py b/deepmd/nvnmd/utils/argcheck.py index 2cbff3cbdc..2b9362efb0 100644 --- a/deepmd/nvnmd/utils/argcheck.py +++ b/deepmd/nvnmd/utils/argcheck.py @@ -1,68 +1,9 @@ # SPDX-License-Identifier: LGPL-3.0-or-later -from dargs import ( - Argument, +"""Alias for backward compatibility.""" +from deepmd_utils.utils.argcheck_nvnmd import ( + nvnmd_args, ) - -def nvnmd_args(): - doc_version = ( - "configuration the nvnmd version (0 | 1), 0 for 4 types, 1 for 32 types" - ) - doc_net_size_file = ( - "configuration the number of nodes of fitting_net, just can be set as 128" - ) - doc_map_file = "A file containing the mapping tables to replace the calculation of embedding nets" - doc_config_file = "A file containing the parameters about how to implement the model in certain hardware" - doc_weight_file = "a *.npy file containing the weights of the model" - doc_enable = "enable the nvnmd training" - doc_restore_descriptor = ( - "enable to restore the parameter of embedding_net from weight.npy" - ) - doc_restore_fitting_net = ( - "enable to restore the parameter of fitting_net from weight.npy" - ) - doc_quantize_descriptor = "enable the quantizatioin of descriptor" - doc_quantize_fitting_net = "enable the quantizatioin of fitting_net" - args = [ - Argument("version", int, optional=False, default=0, doc=doc_version), - Argument("net_size", int, optional=False, 
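The header word written above packs three fields into a single integer, vv = VERSION + 256 * SUB_VERSION + 256 * 256 * MAX_NNEI, so downstream tooling can recover them with modulo arithmetic. A quick check with example values:

VERSION, SUB_VERSION, MAX_NNEI = 1, 1, 256
vv = VERSION + 256 * SUB_VERSION + 256 * 256 * MAX_NNEI
assert vv % 256 == VERSION
assert (vv // 256) % 256 == SUB_VERSION
assert vv // (256 * 256) == MAX_NNEI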
default=128, doc=doc_net_size_file), - Argument("map_file", str, optional=False, default="none", doc=doc_map_file), - Argument( - "config_file", str, optional=False, default="none", doc=doc_config_file - ), - Argument( - "weight_file", str, optional=False, default="none", doc=doc_weight_file - ), - Argument("enable", bool, optional=False, default=False, doc=doc_enable), - Argument( - "restore_descriptor", - bool, - optional=False, - default=False, - doc=doc_restore_descriptor, - ), - Argument( - "restore_fitting_net", - bool, - optional=False, - default=False, - doc=doc_restore_fitting_net, - ), - Argument( - "quantize_descriptor", - bool, - optional=False, - default=False, - doc=doc_quantize_descriptor, - ), - Argument( - "quantize_fitting_net", - bool, - optional=False, - default=False, - doc=doc_quantize_fitting_net, - ), - ] - - doc_nvnmd = "The nvnmd options." - return Argument("nvnmd", dict, args, [], optional=True, doc=doc_nvnmd) +__all__ = [ + "nvnmd_args", +] diff --git a/deepmd/nvnmd/utils/config.py b/deepmd/nvnmd/utils/config.py index 96ca74c4c9..5bfd9ea54f 100644 --- a/deepmd/nvnmd/utils/config.py +++ b/deepmd/nvnmd/utils/config.py @@ -7,9 +7,15 @@ NVNMD_CITATION, NVNMD_WELCOME, jdata_config_v0, - jdata_config_v1, + jdata_config_v0_ni128, + jdata_config_v0_ni256, + jdata_config_v1_ni128, + jdata_config_v1_ni256, jdata_deepmd_input_v0, - jdata_deepmd_input_v1, + jdata_deepmd_input_v0_ni128, + jdata_deepmd_input_v0_ni256, + jdata_deepmd_input_v1_ni128, + jdata_deepmd_input_v1_ni256, ) from deepmd.nvnmd.utils.fio import ( FioDic, @@ -50,6 +56,7 @@ def init_from_jdata(self, jdata: dict = {}): return None self.version = jdata["version"] + self.max_nnei = jdata["max_nnei"] self.net_size = jdata["net_size"] self.map_file = jdata["map_file"] self.config_file = jdata["config_file"] @@ -65,7 +72,7 @@ def init_from_jdata(self, jdata: dict = {}): self.map = FioDic().load(self.map_file, {}) self.weight = FioDic().load(self.weight_file, {}) - self.init_config_by_version(self.version) + self.init_config_by_version(self.version, self.max_nnei) load_config = FioDic().load(self.config_file, self.config) self.init_from_config(load_config) # if load the file, set net_size @@ -106,7 +113,11 @@ def init_from_config(self, jdata): r"""Initialize member element one by one.""" if "ctrl" in jdata.keys(): if "VERSION" in jdata["ctrl"].keys(): - self.init_config_by_version(jdata["ctrl"]["VERSION"]) + if "MAX_NNEI" not in jdata["ctrl"].keys(): + jdata["ctrl"]["MAX_NNEI"] = 128 + self.init_config_by_version( + jdata["ctrl"]["VERSION"], jdata["ctrl"]["MAX_NNEI"] + ) # self.config = FioDic().update(jdata, self.config) self.config["dscp"] = self.init_dscp(self.config["dscp"], self.config) @@ -117,16 +128,29 @@ def init_from_config(self, jdata): self.config["nbit"] = self.init_nbit(self.config["nbit"], self.config) self.init_value() - def init_config_by_version(self, version): + def init_config_by_version(self, version, max_nnei): r"""Initialize version-dependent parameters.""" self.version = version + self.max_nnei = max_nnei log.debug("#Set nvnmd version as %d " % self.version) if self.version == 0: - self.jdata_deepmd_input = jdata_deepmd_input_v0.copy() - self.config = jdata_config_v0.copy() + if self.max_nnei == 128: + self.jdata_deepmd_input = jdata_deepmd_input_v0_ni128.copy() + self.config = jdata_config_v0_ni128.copy() + elif self.max_nnei == 256: + self.jdata_deepmd_input = jdata_deepmd_input_v0_ni256.copy() + self.config = jdata_config_v0_ni256.copy() + else: + log.error("The max_nnei only can be set 
as 128|256 for version 0") if self.version == 1: - self.jdata_deepmd_input = jdata_deepmd_input_v1.copy() - self.config = jdata_config_v1.copy() + if self.max_nnei == 128: + self.jdata_deepmd_input = jdata_deepmd_input_v1_ni128.copy() + self.config = jdata_config_v1_ni128.copy() + elif self.max_nnei == 256: + self.jdata_deepmd_input = jdata_deepmd_input_v1_ni256.copy() + self.config = jdata_config_v1_ni256.copy() + else: + log.error("The max_nnei only can be set as 128|256 for version 1") def init_net_size(self): r"""Initialize net_size.""" @@ -154,10 +178,15 @@ def init_dscp(self, jdata: dict, jdata_parent: dict = {}) -> dict: jdata["M1"] = jdata["neuron"][-1] jdata["M2"] = jdata["axis_neuron"] jdata["SEL"] = (jdata["sel"] + [0, 0, 0, 0])[0:4] + for s in jdata["sel"]: + if s > self.max_nnei: + log.error("The sel cannot be greater than the max_nnei") + exit(1) jdata["NNODE_FEAS"] = [1] + jdata["neuron"] jdata["nlayer_fea"] = len(jdata["neuron"]) jdata["same_net"] = 1 if jdata["type_one_side"] else 0 # neighbor + jdata["NI"] = self.max_nnei jdata["NIDP"] = int(np.sum(jdata["sel"])) jdata["NIX"] = 2 ** int(np.ceil(np.log2(jdata["NIDP"] / 1.5))) # type @@ -168,10 +197,14 @@ def init_dscp(self, jdata: dict, jdata_parent: dict = {}) -> dict: jdata["M1"] = jdata["neuron"][-1] jdata["M2"] = jdata["axis_neuron"] jdata["SEL"] = jdata["sel"] + if jdata["sel"] > self.max_nnei: + log.error("The sel cannot be greater than the max_nnei") + exit(1) jdata["NNODE_FEAS"] = [1] + jdata["neuron"] jdata["nlayer_fea"] = len(jdata["neuron"]) jdata["same_net"] = 1 if jdata["type_one_side"] else 0 # neighbor + jdata["NI"] = self.max_nnei jdata["NIDP"] = int(jdata["sel"]) jdata["NIX"] = 2 ** int(np.ceil(np.log2(jdata["NIDP"] / 1.5))) # type @@ -306,6 +339,7 @@ def get_nvnmd_jdata(self): r"""Generate `nvnmd` in input script.""" jdata = self.jdata_deepmd_input["nvnmd"] jdata["net_size"] = self.net_size + jdata["max_nnei"] = self.max_nnei jdata["config_file"] = self.config_file jdata["weight_file"] = self.weight_file jdata["map_file"] = self.map_file diff --git a/deepmd/train/trainer.py b/deepmd/train/trainer.py index bbcb305404..3b81740a93 100644 --- a/deepmd/train/trainer.py +++ b/deepmd/train/trainer.py @@ -943,6 +943,7 @@ def print_header(fp, train_results, valid_results, multi_task_mode=False): for k in train_results[fitting_key].keys(): print_str += prop_fmt % (k + "_trn") print_str += " %8s\n" % (fitting_key + "_lr") + print_str += "# If there is no available reference data, rmse_*_{val,trn} will print nan\n" fp.write(print_str) fp.flush() diff --git a/deepmd/utils/argcheck.py b/deepmd/utils/argcheck.py index 7104eb1de4..05e7c767b8 100644 --- a/deepmd/utils/argcheck.py +++ b/deepmd/utils/argcheck.py @@ -1,2015 +1,19 @@ # SPDX-License-Identifier: LGPL-3.0-or-later -import json -import logging -from typing import ( - Callable, - List, - Optional, +"""Alias for backward compatibility.""" +from deepmd_utils.utils.argcheck import ( + gen_args, + gen_doc, + gen_json, + list_to_doc, + normalize, + type_embedding_args, ) -from dargs import ( - Argument, - ArgumentEncoder, - Variant, - dargs, -) - -from deepmd.common import ( - ACTIVATION_FN_DICT, - PRECISION_DICT, -) -from deepmd.nvnmd.utils.argcheck import ( - nvnmd_args, -) -from deepmd.utils.plugin import ( - Plugin, -) - -log = logging.getLogger(__name__) - - -def list_to_doc(xx): - items = [] - for ii in xx: - if len(items) == 0: - items.append(f'"{ii}"') - else: - items.append(f', "{ii}"') - items.append(".") - return "".join(items) - - -def make_link(content, 
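Because deepmd.utils.argcheck is now a thin alias, the old and new import paths resolve to the very same objects; a minimal check, assuming a normal install in which deepmd and deepmd_utils ship together:

from deepmd.utils.argcheck import normalize
from deepmd_utils.utils.argcheck import normalize as normalize_new

assert normalize is normalize_new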
ref_key): - return ( - f"`{content} <{ref_key}_>`_" - if not dargs.RAW_ANCHOR - else f"`{content} <#{ref_key}>`_" - ) - - -def type_embedding_args(): - doc_neuron = "Number of neurons in each hidden layers of the embedding net. When two layers are of the same size or one layer is twice as large as the previous layer, a skip connection is built." - doc_resnet_dt = 'Whether to use a "Timestep" in the skip connection' - doc_seed = "Random seed for parameter initialization" - doc_activation_function = f'The activation function in the embedding net. Supported activation functions are {list_to_doc(ACTIVATION_FN_DICT.keys())} Note that "gelu" denotes the custom operator version, and "gelu_tf" denotes the TF standard version. If you set "None" or "none" here, no activation function will be used.' - doc_precision = f"The precision of the embedding net parameters, supported options are {list_to_doc(PRECISION_DICT.keys())} Default follows the interface precision." - doc_trainable = "If the parameters in the embedding net are trainable" - - return [ - Argument("neuron", List[int], optional=True, default=[8], doc=doc_neuron), - Argument( - "activation_function", - str, - optional=True, - default="tanh", - doc=doc_activation_function, - ), - Argument("resnet_dt", bool, optional=True, default=False, doc=doc_resnet_dt), - Argument("precision", str, optional=True, default="default", doc=doc_precision), - Argument("trainable", bool, optional=True, default=True, doc=doc_trainable), - Argument("seed", [int, None], optional=True, default=None, doc=doc_seed), - ] - - -def spin_args(): - doc_use_spin = "Whether to use atomic spin model for each atom type" - doc_spin_norm = "The magnitude of atomic spin for each atom type with spin" - doc_virtual_len = "The distance between virtual atom representing spin and its corresponding real atom for each atom type with spin" - - return [ - Argument("use_spin", List[bool], doc=doc_use_spin), - Argument("spin_norm", List[float], doc=doc_spin_norm), - Argument("virtual_len", List[float], doc=doc_virtual_len), - ] - - -# --- Descriptor configurations: --- # - - -class ArgsPlugin: - def __init__(self) -> None: - self.__plugin = Plugin() - - def register( - self, name: str, alias: Optional[List[str]] = None - ) -> Callable[[], List[Argument]]: - """Register a descriptor argument plugin. - - Parameters - ---------- - name : str - the name of a descriptor - alias : List[str], optional - the list of aliases of this descriptor - - Returns - ------- - Callable[[], List[Argument]] - the registered descriptor argument method - - Examples - -------- - >>> some_plugin = ArgsPlugin() - >>> @some_plugin.register("some_descrpt") - def descrpt_some_descrpt_args(): - return [] - """ - # convert alias to hashed item - if isinstance(alias, list): - alias = tuple(alias) - return self.__plugin.register((name, alias)) - - def get_all_argument(self, exclude_hybrid: bool = False) -> List[Argument]: - """Get all arguments. - - Parameters - ---------- - exclude_hybrid : bool - exclude hybrid descriptor to prevent circular calls - - Returns - ------- - List[Argument] - all arguments - """ - arguments = [] - for (name, alias), metd in self.__plugin.plugins.items(): - if exclude_hybrid and name == "hybrid": - continue - arguments.append( - Argument(name=name, dtype=dict, sub_fields=metd(), alias=alias) - ) - return arguments - - -descrpt_args_plugin = ArgsPlugin() - - -@descrpt_args_plugin.register("loc_frame") -def descrpt_local_frame_args(): - doc_sel_a = "A list of integers. 
The length of the list should be the same as the number of atom types in the system. `sel_a[i]` gives the selected number of type-i neighbors. The full relative coordinates of the neighbors are used by the descriptor." - doc_sel_r = "A list of integers. The length of the list should be the same as the number of atom types in the system. `sel_r[i]` gives the selected number of type-i neighbors. Only relative distance of the neighbors are used by the descriptor. sel_a[i] + sel_r[i] is recommended to be larger than the maximally possible number of type-i neighbors in the cut-off radius." - doc_rcut = "The cut-off radius. The default value is 6.0" - doc_axis_rule = "A list of integers. The length should be 6 times of the number of types. \n\n\ -- axis_rule[i*6+0]: class of the atom defining the first axis of type-i atom. 0 for neighbors with full coordinates and 1 for neighbors only with relative distance.\n\n\ -- axis_rule[i*6+1]: type of the atom defining the first axis of type-i atom.\n\n\ -- axis_rule[i*6+2]: index of the axis atom defining the first axis. Note that the neighbors with the same class and type are sorted according to their relative distance.\n\n\ -- axis_rule[i*6+3]: class of the atom defining the second axis of type-i atom. 0 for neighbors with full coordinates and 1 for neighbors only with relative distance.\n\n\ -- axis_rule[i*6+4]: type of the atom defining the second axis of type-i atom.\n\n\ -- axis_rule[i*6+5]: index of the axis atom defining the second axis. Note that the neighbors with the same class and type are sorted according to their relative distance." - - return [ - Argument("sel_a", List[int], optional=False, doc=doc_sel_a), - Argument("sel_r", List[int], optional=False, doc=doc_sel_r), - Argument("rcut", float, optional=True, default=6.0, doc=doc_rcut), - Argument("axis_rule", List[int], optional=False, doc=doc_axis_rule), - ] - - -@descrpt_args_plugin.register("se_e2_a", alias=["se_a"]) -def descrpt_se_a_args(): - doc_sel = 'This parameter set the number of selected neighbors for each type of atom. It can be:\n\n\ - - `List[int]`. The length of the list should be the same as the number of atom types in the system. `sel[i]` gives the selected number of type-i neighbors. `sel[i]` is recommended to be larger than the maximally possible number of type-i neighbors in the cut-off radius. It is noted that the total sel value must be less than 4096 in a GPU environment.\n\n\ - - `str`. Can be "auto:factor" or "auto". "factor" is a float number larger than 1. This option will automatically determine the `sel`. In detail it counts the maximal number of neighbors with in the cutoff radius for each type of neighbor, then multiply the maximum by the "factor". Finally the number is wraped up to 4 divisible. The option "auto" is equivalent to "auto:1.1".' - doc_rcut = "The cut-off radius." - doc_rcut_smth = "Where to start smoothing. For example the 1/r term is smoothed from `rcut` to `rcut_smth`" - doc_neuron = "Number of neurons in each hidden layers of the embedding net. When two layers are of the same size or one layer is twice as large as the previous layer, a skip connection is built." - doc_axis_neuron = "Size of the submatrix of G (embedding matrix)." - doc_activation_function = f'The activation function in the embedding net. Supported activation functions are {list_to_doc(ACTIVATION_FN_DICT.keys())} Note that "gelu" denotes the custom operator version, and "gelu_tf" denotes the TF standard version. 
If you set "None" or "none" here, no activation function will be used.' - doc_resnet_dt = 'Whether to use a "Timestep" in the skip connection' - doc_type_one_side = r"If true, the embedding network parameters vary by types of neighbor atoms only, so there will be $N_\text{types}$ sets of embedding network parameters. Otherwise, the embedding network parameters vary by types of centric atoms and types of neighbor atoms, so there will be $N_\text{types}^2$ sets of embedding network parameters." - doc_precision = f"The precision of the embedding net parameters, supported options are {list_to_doc(PRECISION_DICT.keys())} Default follows the interface precision." - doc_trainable = "If the parameters in the embedding net is trainable" - doc_seed = "Random seed for parameter initialization" - doc_exclude_types = "The excluded pairs of types which have no interaction with each other. For example, `[[0, 1]]` means no interaction between type 0 and type 1." - doc_set_davg_zero = "Set the normalization average to zero. This option should be set when `atom_ener` in the energy fitting is used" - - return [ - Argument("sel", [List[int], str], optional=True, default="auto", doc=doc_sel), - Argument("rcut", float, optional=True, default=6.0, doc=doc_rcut), - Argument("rcut_smth", float, optional=True, default=0.5, doc=doc_rcut_smth), - Argument( - "neuron", List[int], optional=True, default=[10, 20, 40], doc=doc_neuron - ), - Argument( - "axis_neuron", - int, - optional=True, - default=4, - alias=["n_axis_neuron"], - doc=doc_axis_neuron, - ), - Argument( - "activation_function", - str, - optional=True, - default="tanh", - doc=doc_activation_function, - ), - Argument("resnet_dt", bool, optional=True, default=False, doc=doc_resnet_dt), - Argument( - "type_one_side", bool, optional=True, default=False, doc=doc_type_one_side - ), - Argument("precision", str, optional=True, default="default", doc=doc_precision), - Argument("trainable", bool, optional=True, default=True, doc=doc_trainable), - Argument("seed", [int, None], optional=True, doc=doc_seed), - Argument( - "exclude_types", - List[List[int]], - optional=True, - default=[], - doc=doc_exclude_types, - ), - Argument( - "set_davg_zero", bool, optional=True, default=False, doc=doc_set_davg_zero - ), - ] - - -@descrpt_args_plugin.register("se_e3", alias=["se_at", "se_a_3be", "se_t"]) -def descrpt_se_t_args(): - doc_sel = 'This parameter set the number of selected neighbors for each type of atom. It can be:\n\n\ - - `List[int]`. The length of the list should be the same as the number of atom types in the system. `sel[i]` gives the selected number of type-i neighbors. `sel[i]` is recommended to be larger than the maximally possible number of type-i neighbors in the cut-off radius. It is noted that the total sel value must be less than 4096 in a GPU environment.\n\n\ - - `str`. Can be "auto:factor" or "auto". "factor" is a float number larger than 1. This option will automatically determine the `sel`. In detail it counts the maximal number of neighbors with in the cutoff radius for each type of neighbor, then multiply the maximum by the "factor". Finally the number is wraped up to 4 divisible. The option "auto" is equivalent to "auto:1.1".' - doc_rcut = "The cut-off radius." - doc_rcut_smth = "Where to start smoothing. For example the 1/r term is smoothed from `rcut` to `rcut_smth`" - doc_neuron = "Number of neurons in each hidden layers of the embedding net. 
When two layers are of the same size or one layer is twice as large as the previous layer, a skip connection is built." - doc_activation_function = f'The activation function in the embedding net. Supported activation functions are {list_to_doc(ACTIVATION_FN_DICT.keys())} Note that "gelu" denotes the custom operator version, and "gelu_tf" denotes the TF standard version. If you set "None" or "none" here, no activation function will be used.' - doc_resnet_dt = 'Whether to use a "Timestep" in the skip connection' - doc_precision = f"The precision of the embedding net parameters, supported options are {list_to_doc(PRECISION_DICT.keys())} Default follows the interface precision." - doc_trainable = "If the parameters in the embedding net are trainable" - doc_seed = "Random seed for parameter initialization" - doc_set_davg_zero = "Set the normalization average to zero. This option should be set when `atom_ener` in the energy fitting is used" - - return [ - Argument("sel", [List[int], str], optional=True, default="auto", doc=doc_sel), - Argument("rcut", float, optional=True, default=6.0, doc=doc_rcut), - Argument("rcut_smth", float, optional=True, default=0.5, doc=doc_rcut_smth), - Argument( - "neuron", List[int], optional=True, default=[10, 20, 40], doc=doc_neuron - ), - Argument( - "activation_function", - str, - optional=True, - default="tanh", - doc=doc_activation_function, - ), - Argument("resnet_dt", bool, optional=True, default=False, doc=doc_resnet_dt), - Argument("precision", str, optional=True, default="default", doc=doc_precision), - Argument("trainable", bool, optional=True, default=True, doc=doc_trainable), - Argument("seed", [int, None], optional=True, doc=doc_seed), - Argument( - "set_davg_zero", bool, optional=True, default=False, doc=doc_set_davg_zero - ), - ] - - -@descrpt_args_plugin.register("se_a_tpe", alias=["se_a_ebd"]) -def descrpt_se_a_tpe_args(): - doc_type_nchanl = "number of channels for type embedding" - doc_type_nlayer = "number of hidden layers of type embedding net" - doc_numb_aparam = "dimension of atomic parameter. if set to a value > 0, the atomic parameters are embedded." - - return [ - *descrpt_se_a_args(), - Argument("type_nchanl", int, optional=True, default=4, doc=doc_type_nchanl), - Argument("type_nlayer", int, optional=True, default=2, doc=doc_type_nlayer), - Argument("numb_aparam", int, optional=True, default=0, doc=doc_numb_aparam), - ] - - -@descrpt_args_plugin.register("se_e2_r", alias=["se_r"]) -def descrpt_se_r_args(): - doc_sel = 'This parameter set the number of selected neighbors for each type of atom. It can be:\n\n\ - - `List[int]`. The length of the list should be the same as the number of atom types in the system. `sel[i]` gives the selected number of type-i neighbors. `sel[i]` is recommended to be larger than the maximally possible number of type-i neighbors in the cut-off radius. It is noted that the total sel value must be less than 4096 in a GPU environment.\n\n\ - - `str`. Can be "auto:factor" or "auto". "factor" is a float number larger than 1. This option will automatically determine the `sel`. In detail it counts the maximal number of neighbors with in the cutoff radius for each type of neighbor, then multiply the maximum by the "factor". Finally the number is wraped up to 4 divisible. The option "auto" is equivalent to "auto:1.1".' - doc_rcut = "The cut-off radius." - doc_rcut_smth = "Where to start smoothing. 
For example the 1/r term is smoothed from `rcut` to `rcut_smth`" - doc_neuron = "Number of neurons in each hidden layers of the embedding net. When two layers are of the same size or one layer is twice as large as the previous layer, a skip connection is built." - doc_activation_function = f'The activation function in the embedding net. Supported activation functions are {list_to_doc(ACTIVATION_FN_DICT.keys())} Note that "gelu" denotes the custom operator version, and "gelu_tf" denotes the TF standard version. If you set "None" or "none" here, no activation function will be used.' - doc_resnet_dt = 'Whether to use a "Timestep" in the skip connection' - doc_type_one_side = r"If true, the embedding network parameters vary by types of neighbor atoms only, so there will be $N_\text{types}$ sets of embedding network parameters. Otherwise, the embedding network parameters vary by types of centric atoms and types of neighbor atoms, so there will be $N_\text{types}^2$ sets of embedding network parameters." - doc_precision = f"The precision of the embedding net parameters, supported options are {list_to_doc(PRECISION_DICT.keys())} Default follows the interface precision." - doc_trainable = "If the parameters in the embedding net are trainable" - doc_seed = "Random seed for parameter initialization" - doc_exclude_types = "The excluded pairs of types which have no interaction with each other. For example, `[[0, 1]]` means no interaction between type 0 and type 1." - doc_set_davg_zero = "Set the normalization average to zero. This option should be set when `atom_ener` in the energy fitting is used" - - return [ - Argument("sel", [List[int], str], optional=True, default="auto", doc=doc_sel), - Argument("rcut", float, optional=True, default=6.0, doc=doc_rcut), - Argument("rcut_smth", float, optional=True, default=0.5, doc=doc_rcut_smth), - Argument( - "neuron", List[int], optional=True, default=[10, 20, 40], doc=doc_neuron - ), - Argument( - "activation_function", - str, - optional=True, - default="tanh", - doc=doc_activation_function, - ), - Argument("resnet_dt", bool, optional=True, default=False, doc=doc_resnet_dt), - Argument( - "type_one_side", bool, optional=True, default=False, doc=doc_type_one_side - ), - Argument("precision", str, optional=True, default="default", doc=doc_precision), - Argument("trainable", bool, optional=True, default=True, doc=doc_trainable), - Argument("seed", [int, None], optional=True, doc=doc_seed), - Argument( - "exclude_types", - List[List[int]], - optional=True, - default=[], - doc=doc_exclude_types, - ), - Argument( - "set_davg_zero", bool, optional=True, default=False, doc=doc_set_davg_zero - ), - ] - - -@descrpt_args_plugin.register("hybrid") -def descrpt_hybrid_args(): - doc_list = "A list of descriptor definitions" - - return [ - Argument( - "list", - list, - optional=False, - doc=doc_list, - repeat=True, - sub_fields=[], - sub_variants=[descrpt_variant_type_args(exclude_hybrid=True)], - fold_subdoc=True, - ) - ] - - -def descrpt_se_atten_common_args(): - doc_sel = 'This parameter set the number of selected neighbors. Note that this parameter is a little different from that in other descriptors. Instead of separating each type of atoms, only the summation matters. And this number is highly related with the efficiency, thus one should not make it too large. Usually 200 or less is enough, far away from the GPU limitation 4096. It can be:\n\n\ - - `int`. The maximum number of neighbor atoms to be considered. We recommend it to be less than 200. \n\n\ - - `List[int]`. 
The length of the list should be the same as the number of atom types in the system. `sel[i]` gives the selected number of type-i neighbors. Only the summation of `sel[i]` matters, and it is recommended to be less than 200.\ - - `str`. Can be "auto:factor" or "auto". "factor" is a float number larger than 1. This option will automatically determine the `sel`. In detail it counts the maximal number of neighbors with in the cutoff radius for each type of neighbor, then multiply the maximum by the "factor". Finally the number is wraped up to 4 divisible. The option "auto" is equivalent to "auto:1.1".' - doc_rcut = "The cut-off radius." - doc_rcut_smth = "Where to start smoothing. For example the 1/r term is smoothed from `rcut` to `rcut_smth`" - doc_neuron = "Number of neurons in each hidden layers of the embedding net. When two layers are of the same size or one layer is twice as large as the previous layer, a skip connection is built." - doc_axis_neuron = "Size of the submatrix of G (embedding matrix)." - doc_activation_function = f'The activation function in the embedding net. Supported activation functions are {list_to_doc(ACTIVATION_FN_DICT.keys())} Note that "gelu" denotes the custom operator version, and "gelu_tf" denotes the TF standard version. If you set "None" or "none" here, no activation function will be used.' - doc_resnet_dt = 'Whether to use a "Timestep" in the skip connection' - doc_type_one_side = r"If true, the embedding network parameters vary by types of neighbor atoms only, so there will be $N_\text{types}$ sets of embedding network parameters. Otherwise, the embedding network parameters vary by types of centric atoms and types of neighbor atoms, so there will be $N_\text{types}^2$ sets of embedding network parameters." - doc_precision = f"The precision of the embedding net parameters, supported options are {list_to_doc(PRECISION_DICT.keys())} Default follows the interface precision." - doc_trainable = "If the parameters in the embedding net is trainable" - doc_seed = "Random seed for parameter initialization" - doc_exclude_types = "The excluded pairs of types which have no interaction with each other. For example, `[[0, 1]]` means no interaction between type 0 and type 1." - doc_attn = "The length of hidden vectors in attention layers" - doc_attn_layer = "The number of attention layers. 
Note that model compression of `se_atten` is only enabled when attn_layer==0 and stripped_type_embedding is True" - doc_attn_dotr = "Whether to do dot product with the normalized relative coordinates" - doc_attn_mask = "Whether to do mask on the diagonal in the attention matrix" - - return [ - Argument( - "sel", [int, List[int], str], optional=True, default="auto", doc=doc_sel - ), - Argument("rcut", float, optional=True, default=6.0, doc=doc_rcut), - Argument("rcut_smth", float, optional=True, default=0.5, doc=doc_rcut_smth), - Argument( - "neuron", List[int], optional=True, default=[10, 20, 40], doc=doc_neuron - ), - Argument( - "axis_neuron", - int, - optional=True, - default=4, - alias=["n_axis_neuron"], - doc=doc_axis_neuron, - ), - Argument( - "activation_function", - str, - optional=True, - default="tanh", - doc=doc_activation_function, - ), - Argument("resnet_dt", bool, optional=True, default=False, doc=doc_resnet_dt), - Argument( - "type_one_side", bool, optional=True, default=False, doc=doc_type_one_side - ), - Argument("precision", str, optional=True, default="default", doc=doc_precision), - Argument("trainable", bool, optional=True, default=True, doc=doc_trainable), - Argument("seed", [int, None], optional=True, doc=doc_seed), - Argument( - "exclude_types", - List[List[int]], - optional=True, - default=[], - doc=doc_exclude_types, - ), - Argument("attn", int, optional=True, default=128, doc=doc_attn), - Argument("attn_layer", int, optional=True, default=2, doc=doc_attn_layer), - Argument("attn_dotr", bool, optional=True, default=True, doc=doc_attn_dotr), - Argument("attn_mask", bool, optional=True, default=False, doc=doc_attn_mask), - ] - - -@descrpt_args_plugin.register("se_atten") -def descrpt_se_atten_args(): - doc_stripped_type_embedding = "Whether to strip the type embedding into a separated embedding network. Setting it to `False` will fall back to the previous version of `se_atten` which is non-compressible." - doc_smooth_type_embdding = "When using stripped type embedding, whether to dot smooth factor on the network output of type embedding to keep the network smooth, instead of setting `set_davg_zero` to be True." - doc_set_davg_zero = "Set the normalization average to zero. This option should be set when `se_atten` descriptor or `atom_ener` in the energy fitting is used" - - return [ - *descrpt_se_atten_common_args(), - Argument( - "stripped_type_embedding", - bool, - optional=True, - default=False, - doc=doc_stripped_type_embedding, - ), - Argument( - "smooth_type_embdding", - bool, - optional=True, - default=False, - doc=doc_smooth_type_embdding, - ), - Argument( - "set_davg_zero", bool, optional=True, default=True, doc=doc_set_davg_zero - ), - ] - - -@descrpt_args_plugin.register("se_atten_v2") -def descrpt_se_atten_v2_args(): - doc_set_davg_zero = "Set the normalization average to zero. This option should be set when `se_atten` descriptor or `atom_ener` in the energy fitting is used" - - return [ - *descrpt_se_atten_common_args(), - Argument( - "set_davg_zero", bool, optional=True, default=False, doc=doc_set_davg_zero - ), - ] - - -@descrpt_args_plugin.register("se_a_ebd_v2", alias=["se_a_tpe_v2"]) -def descrpt_se_a_ebd_v2_args(): - return descrpt_se_a_args() - - -@descrpt_args_plugin.register("se_a_mask") -def descrpt_se_a_mask_args(): - doc_sel = 'This parameter sets the number of selected neighbors for each type of atom. It can be:\n\n\ - - `List[int]`. The length of the list should be the same as the number of atom types in the system. 
`sel[i]` gives the selected number of type-i neighbors. `sel[i]` is recommended to be larger than the maximally possible number of type-i neighbors in the cut-off radius. It is noted that the total sel value must be less than 4096 in a GPU environment.\n\n\ - - `str`. Can be "auto:factor" or "auto". "factor" is a float number larger than 1. This option will automatically determine the `sel`. In detail it counts the maximal number of neighbors with in the cutoff radius for each type of neighbor, then multiply the maximum by the "factor". Finally the number is wraped up to 4 divisible. The option "auto" is equivalent to "auto:1.1".' - - doc_neuron = "Number of neurons in each hidden layers of the embedding net. When two layers are of the same size or one layer is twice as large as the previous layer, a skip connection is built." - doc_axis_neuron = "Size of the submatrix of G (embedding matrix)." - doc_activation_function = f'The activation function in the embedding net. Supported activation functions are {list_to_doc(ACTIVATION_FN_DICT.keys())} Note that "gelu" denotes the custom operator version, and "gelu_tf" denotes the TF standard version. If you set "None" or "none" here, no activation function will be used.' - doc_resnet_dt = 'Whether to use a "Timestep" in the skip connection' - doc_type_one_side = r"If true, the embedding network parameters vary by types of neighbor atoms only, so there will be $N_\text{types}$ sets of embedding network parameters. Otherwise, the embedding network parameters vary by types of centric atoms and types of neighbor atoms, so there will be $N_\text{types}^2$ sets of embedding network parameters." - doc_exclude_types = "The excluded pairs of types which have no interaction with each other. For example, `[[0, 1]]` means no interaction between type 0 and type 1." - doc_precision = f"The precision of the embedding net parameters, supported options are {list_to_doc(PRECISION_DICT.keys())} Default follows the interface precision." 
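For reference, the descriptor defaults documented above assemble into a small `descriptor` block of the training input. The snippet below is only an illustrative sketch that restates the documented `se_e2_a` defaults; the example `sel` list is an assumption, not taken from the source.

```python
# Minimal sketch of a "descriptor" block built from the se_e2_a defaults
# documented above; values are illustrative, not prescriptive.
descriptor = {
    "type": "se_e2_a",
    "sel": "auto",            # or a per-type list, e.g. [46, 92] (assumed example)
    "rcut": 6.0,
    "rcut_smth": 0.5,
    "neuron": [10, 20, 40],
    "axis_neuron": 4,
    "activation_function": "tanh",
    "resnet_dt": False,
    "type_one_side": False,
    "precision": "default",
}
```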
- doc_trainable = "If the parameters in the embedding net is trainable" - doc_seed = "Random seed for parameter initialization" - - return [ - Argument("sel", [List[int], str], optional=True, default="auto", doc=doc_sel), - Argument( - "neuron", List[int], optional=True, default=[10, 20, 40], doc=doc_neuron - ), - Argument( - "axis_neuron", - int, - optional=True, - default=4, - alias=["n_axis_neuron"], - doc=doc_axis_neuron, - ), - Argument( - "activation_function", - str, - optional=True, - default="tanh", - doc=doc_activation_function, - ), - Argument("resnet_dt", bool, optional=True, default=False, doc=doc_resnet_dt), - Argument( - "type_one_side", bool, optional=True, default=False, doc=doc_type_one_side - ), - Argument( - "exclude_types", - List[List[int]], - optional=True, - default=[], - doc=doc_exclude_types, - ), - Argument("precision", str, optional=True, default="default", doc=doc_precision), - Argument("trainable", bool, optional=True, default=True, doc=doc_trainable), - Argument("seed", [int, None], optional=True, doc=doc_seed), - ] - - -def descrpt_variant_type_args(exclude_hybrid: bool = False) -> Variant: - link_lf = make_link("loc_frame", "model/descriptor[loc_frame]") - link_se_e2_a = make_link("se_e2_a", "model/descriptor[se_e2_a]") - link_se_e2_r = make_link("se_e2_r", "model/descriptor[se_e2_r]") - link_se_e3 = make_link("se_e3", "model/descriptor[se_e3]") - link_se_a_tpe = make_link("se_a_tpe", "model/descriptor[se_a_tpe]") - link_hybrid = make_link("hybrid", "model/descriptor[hybrid]") - link_se_atten = make_link("se_atten", "model/descriptor[se_atten]") - link_se_atten_v2 = make_link("se_atten_v2", "model/descriptor[se_atten_v2]") - doc_descrpt_type = "The type of the descritpor. See explanation below. \n\n\ -- `loc_frame`: Defines a local frame at each atom, and the compute the descriptor as local coordinates under this frame.\n\n\ -- `se_e2_a`: Used by the smooth edition of Deep Potential. The full relative coordinates are used to construct the descriptor.\n\n\ -- `se_e2_r`: Used by the smooth edition of Deep Potential. Only the distance between atoms is used to construct the descriptor.\n\n\ -- `se_e3`: Used by the smooth edition of Deep Potential. The full relative coordinates are used to construct the descriptor. Three-body embedding will be used by this descriptor.\n\n\ -- `se_a_tpe`: Used by the smooth edition of Deep Potential. The full relative coordinates are used to construct the descriptor. Type embedding will be used by this descriptor.\n\n\ -- `se_atten`: Used by the smooth edition of Deep Potential. The full relative coordinates are used to construct the descriptor. Attention mechanism will be used by this descriptor.\n\n\ -- `se_atten_v2`: Used by the smooth edition of Deep Potential. The full relative coordinates are used to construct the descriptor. Attention mechanism with new modifications will be used by this descriptor.\n\n\ -- `se_a_mask`: Used by the smooth edition of Deep Potential. It can accept a variable number of atoms in a frame (Non-PBC system). *aparam* are required as an indicator matrix for the real/virtual sign of input atoms. \n\n\ -- `hybrid`: Concatenate of a list of descriptors as a new descriptor." - - return Variant( - "type", - descrpt_args_plugin.get_all_argument(exclude_hybrid=exclude_hybrid), - doc=doc_descrpt_type, - ) - - -# --- Fitting net configurations: --- # -fitting_args_plugin = ArgsPlugin() - - -@fitting_args_plugin.register("ener") -def fitting_ener(): - doc_numb_fparam = "The dimension of the frame parameter. 
If set to >0, file `fparam.npy` should be included to provided the input fparams." - doc_numb_aparam = "The dimension of the atomic parameter. If set to >0, file `aparam.npy` should be included to provided the input aparams." - doc_neuron = "The number of neurons in each hidden layers of the fitting net. When two hidden layers are of the same size, a skip connection is built." - doc_activation_function = f'The activation function in the fitting net. Supported activation functions are {list_to_doc(ACTIVATION_FN_DICT.keys())} Note that "gelu" denotes the custom operator version, and "gelu_tf" denotes the TF standard version. If you set "None" or "none" here, no activation function will be used.' - doc_precision = f"The precision of the fitting net parameters, supported options are {list_to_doc(PRECISION_DICT.keys())} Default follows the interface precision." - doc_resnet_dt = 'Whether to use a "Timestep" in the skip connection' - doc_trainable = "Whether the parameters in the fitting net are trainable. This option can be\n\n\ -- bool: True if all parameters of the fitting net are trainable, False otherwise.\n\n\ -- list of bool: Specifies if each layer is trainable. Since the fitting net is composed by hidden layers followed by a output layer, the length of this list should be equal to len(`neuron`)+1." - doc_rcond = "The condition number used to determine the inital energy shift for each type of atoms. See `rcond` in :py:meth:`numpy.linalg.lstsq` for more details." - doc_seed = "Random seed for parameter initialization of the fitting net" - doc_atom_ener = "Specify the atomic energy in vacuum for each type" - doc_layer_name = ( - "The name of the each layer. The length of this list should be equal to n_neuron + 1. " - "If two layers, either in the same fitting or different fittings, " - "have the same name, they will share the same neural network parameters. " - "The shape of these layers should be the same. " - "If null is given for a layer, parameters will not be shared." - ) - doc_use_aparam_as_mask = ( - "Whether to use the aparam as a mask in input." - "If True, the aparam will not be used in fitting net for embedding." - "When descrpt is se_a_mask, the aparam will be used as a mask to indicate the input atom is real/virtual. And use_aparam_as_mask should be set to True." 
- ) - - return [ - Argument("numb_fparam", int, optional=True, default=0, doc=doc_numb_fparam), - Argument("numb_aparam", int, optional=True, default=0, doc=doc_numb_aparam), - Argument( - "neuron", - List[int], - optional=True, - default=[120, 120, 120], - alias=["n_neuron"], - doc=doc_neuron, - ), - Argument( - "activation_function", - str, - optional=True, - default="tanh", - doc=doc_activation_function, - ), - Argument("precision", str, optional=True, default="default", doc=doc_precision), - Argument("resnet_dt", bool, optional=True, default=True, doc=doc_resnet_dt), - Argument( - "trainable", - [List[bool], bool], - optional=True, - default=True, - doc=doc_trainable, - ), - Argument( - "rcond", [float, type(None)], optional=True, default=None, doc=doc_rcond - ), - Argument("seed", [int, None], optional=True, doc=doc_seed), - Argument( - "atom_ener", - List[Optional[float]], - optional=True, - default=[], - doc=doc_atom_ener, - ), - Argument("layer_name", List[str], optional=True, doc=doc_layer_name), - Argument( - "use_aparam_as_mask", - bool, - optional=True, - default=False, - doc=doc_use_aparam_as_mask, - ), - ] - - -@fitting_args_plugin.register("dos") -def fitting_dos(): - doc_numb_fparam = "The dimension of the frame parameter. If set to >0, file `fparam.npy` should be included to provided the input fparams." - doc_numb_aparam = "The dimension of the atomic parameter. If set to >0, file `aparam.npy` should be included to provided the input aparams." - doc_neuron = "The number of neurons in each hidden layers of the fitting net. When two hidden layers are of the same size, a skip connection is built." - doc_activation_function = f'The activation function in the fitting net. Supported activation functions are {list_to_doc(ACTIVATION_FN_DICT.keys())} Note that "gelu" denotes the custom operator version, and "gelu_tf" denotes the TF standard version. If you set "None" or "none" here, no activation function will be used.' - doc_precision = f"The precision of the fitting net parameters, supported options are {list_to_doc(PRECISION_DICT.keys())} Default follows the interface precision." - doc_resnet_dt = 'Whether to use a "Timestep" in the skip connection' - doc_trainable = "Whether the parameters in the fitting net are trainable. This option can be\n\n\ -- bool: True if all parameters of the fitting net are trainable, False otherwise.\n\n\ -- list of bool: Specifies if each layer is trainable. Since the fitting net is composed by hidden layers followed by a output layer, the length of tihs list should be equal to len(`neuron`)+1." - doc_rcond = "The condition number used to determine the inital energy shift for each type of atoms. See `rcond` in :py:meth:`numpy.linalg.lstsq` for more details." 
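The `ener` fitting defaults documented above map onto a `fitting_net` block in the same way. A minimal sketch, using only the documented defaults:

```python
# Sketch of an energy fitting block using the defaults documented for "ener".
fitting_net = {
    "type": "ener",
    "numb_fparam": 0,
    "numb_aparam": 0,
    "neuron": [120, 120, 120],
    "activation_function": "tanh",
    "precision": "default",
    "resnet_dt": True,
    "trainable": True,
}
```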
- doc_seed = "Random seed for parameter initialization of the fitting net" - doc_numb_dos = ( - "The number of gridpoints on which the DOS is evaluated (NEDOS in VASP)" - ) - - return [ - Argument("numb_fparam", int, optional=True, default=0, doc=doc_numb_fparam), - Argument("numb_aparam", int, optional=True, default=0, doc=doc_numb_aparam), - Argument( - "neuron", List[int], optional=True, default=[120, 120, 120], doc=doc_neuron - ), - Argument( - "activation_function", - str, - optional=True, - default="tanh", - doc=doc_activation_function, - ), - Argument("precision", str, optional=True, default="float64", doc=doc_precision), - Argument("resnet_dt", bool, optional=True, default=True, doc=doc_resnet_dt), - Argument( - "trainable", - [List[bool], bool], - optional=True, - default=True, - doc=doc_trainable, - ), - Argument( - "rcond", [float, type(None)], optional=True, default=None, doc=doc_rcond - ), - Argument("seed", [int, None], optional=True, doc=doc_seed), - Argument("numb_dos", int, optional=True, default=300, doc=doc_numb_dos), - ] - - -@fitting_args_plugin.register("polar") -def fitting_polar(): - doc_neuron = "The number of neurons in each hidden layers of the fitting net. When two hidden layers are of the same size, a skip connection is built." - doc_activation_function = f'The activation function in the fitting net. Supported activation functions are {list_to_doc(ACTIVATION_FN_DICT.keys())} Note that "gelu" denotes the custom operator version, and "gelu_tf" denotes the TF standard version. If you set "None" or "none" here, no activation function will be used.' - doc_resnet_dt = 'Whether to use a "Timestep" in the skip connection' - doc_precision = f"The precision of the fitting net parameters, supported options are {list_to_doc(PRECISION_DICT.keys())} Default follows the interface precision." - doc_scale = "The output of the fitting net (polarizability matrix) will be scaled by ``scale``" - # doc_diag_shift = 'The diagonal part of the polarizability matrix will be shifted by ``diag_shift``. The shift operation is carried out after ``scale``.' - doc_fit_diag = "Fit the diagonal part of the rotational invariant polarizability matrix, which will be converted to normal polarizability matrix by contracting with the rotation matrix." - doc_sel_type = "The atom types for which the atomic polarizability will be provided. If not set, all types will be selected." - doc_seed = "Random seed for parameter initialization of the fitting net" - - # YWolfeee: user can decide whether to use shift diag - doc_shift_diag = "Whether to shift the diagonal of polar, which is beneficial to training. Default is true." 
- - return [ - Argument( - "neuron", - List[int], - optional=True, - default=[120, 120, 120], - alias=["n_neuron"], - doc=doc_neuron, - ), - Argument( - "activation_function", - str, - optional=True, - default="tanh", - doc=doc_activation_function, - ), - Argument("resnet_dt", bool, optional=True, default=True, doc=doc_resnet_dt), - Argument("precision", str, optional=True, default="default", doc=doc_precision), - Argument("fit_diag", bool, optional=True, default=True, doc=doc_fit_diag), - Argument( - "scale", [List[float], float], optional=True, default=1.0, doc=doc_scale - ), - # Argument("diag_shift", [list,float], optional = True, default = 0.0, doc = doc_diag_shift), - Argument("shift_diag", bool, optional=True, default=True, doc=doc_shift_diag), - Argument( - "sel_type", - [List[int], int, None], - optional=True, - alias=["pol_type"], - doc=doc_sel_type, - ), - Argument("seed", [int, None], optional=True, doc=doc_seed), - ] - - -# def fitting_global_polar(): -# return fitting_polar() - - -@fitting_args_plugin.register("dipole") -def fitting_dipole(): - doc_neuron = "The number of neurons in each hidden layers of the fitting net. When two hidden layers are of the same size, a skip connection is built." - doc_activation_function = f'The activation function in the fitting net. Supported activation functions are {list_to_doc(ACTIVATION_FN_DICT.keys())} Note that "gelu" denotes the custom operator version, and "gelu_tf" denotes the TF standard version. If you set "None" or "none" here, no activation function will be used.' - doc_resnet_dt = 'Whether to use a "Timestep" in the skip connection' - doc_precision = f"The precision of the fitting net parameters, supported options are {list_to_doc(PRECISION_DICT.keys())} Default follows the interface precision." - doc_sel_type = "The atom types for which the atomic dipole will be provided. If not set, all types will be selected." - doc_seed = "Random seed for parameter initialization of the fitting net" - return [ - Argument( - "neuron", - List[int], - optional=True, - default=[120, 120, 120], - alias=["n_neuron"], - doc=doc_neuron, - ), - Argument( - "activation_function", - str, - optional=True, - default="tanh", - doc=doc_activation_function, - ), - Argument("resnet_dt", bool, optional=True, default=True, doc=doc_resnet_dt), - Argument("precision", str, optional=True, default="default", doc=doc_precision), - Argument( - "sel_type", - [List[int], int, None], - optional=True, - alias=["dipole_type"], - doc=doc_sel_type, - ), - Argument("seed", [int, None], optional=True, doc=doc_seed), - ] - - -# YWolfeee: Delete global polar mode, merge it into polar mode and use loss setting to support. -def fitting_variant_type_args(): - doc_descrpt_type = "The type of the fitting. See explanation below. \n\n\ -- `ener`: Fit an energy model (potential energy surface).\n\n\ -- `dos` : Fit a density of states model. The total density of states / site-projected density of states labels should be provided by `dos.npy` or `atom_dos.npy` in each data system. The file has number of frames lines and number of energy grid columns (times number of atoms in `atom_dos.npy`). See `loss` parameter. \n\n\ -- `dipole`: Fit an atomic dipole model. Global dipole labels or atomic dipole labels for all the selected atoms (see `sel_type`) should be provided by `dipole.npy` in each data system. The file either has number of frames lines and 3 times of number of selected atoms columns, or has number of frames lines and 3 columns. 
See `loss` parameter.\n\n\ -- `polar`: Fit an atomic polarizability model. Global polarizazbility labels or atomic polarizability labels for all the selected atoms (see `sel_type`) should be provided by `polarizability.npy` in each data system. The file eith has number of frames lines and 9 times of number of selected atoms columns, or has number of frames lines and 9 columns. See `loss` parameter.\n\n" - - return Variant( - "type", - fitting_args_plugin.get_all_argument(), - optional=True, - default_tag="ener", - doc=doc_descrpt_type, - ) - - -# --- Modifier configurations: --- # -def modifier_dipole_charge(): - doc_model_name = "The name of the frozen dipole model file." - doc_model_charge_map = f"The charge of the WFCC. The list length should be the same as the {make_link('sel_type', 'model/fitting_net[dipole]/sel_type')}. " - doc_sys_charge_map = f"The charge of real atoms. The list length should be the same as the {make_link('type_map', 'model/type_map')}" - doc_ewald_h = "The grid spacing of the FFT grid. Unit is A" - doc_ewald_beta = f"The splitting parameter of Ewald sum. Unit is A^{-1}" - - return [ - Argument("model_name", str, optional=False, doc=doc_model_name), - Argument( - "model_charge_map", List[float], optional=False, doc=doc_model_charge_map - ), - Argument("sys_charge_map", List[float], optional=False, doc=doc_sys_charge_map), - Argument("ewald_beta", float, optional=True, default=0.4, doc=doc_ewald_beta), - Argument("ewald_h", float, optional=True, default=1.0, doc=doc_ewald_h), - ] - - -def modifier_variant_type_args(): - doc_modifier_type = "The type of modifier. See explanation below.\n\n\ --`dipole_charge`: Use WFCC to model the electronic structure of the system. Correct the long-range interaction" - return Variant( - "type", - [ - Argument("dipole_charge", dict, modifier_dipole_charge()), - ], - optional=False, - doc=doc_modifier_type, - ) - - -# --- model compression configurations: --- # -def model_compression(): - doc_model_file = "The input model file, which will be compressed by the DeePMD-kit." - doc_table_config = "The arguments of model compression, including extrapolate(scale of model extrapolation), stride(uniform stride of tabulation's first and second table), and frequency(frequency of tabulation overflow check)." - doc_min_nbor_dist = ( - "The nearest distance between neighbor atoms saved in the frozen model." - ) - - return [ - Argument("model_file", str, optional=False, doc=doc_model_file), - Argument("table_config", List[float], optional=False, doc=doc_table_config), - Argument("min_nbor_dist", float, optional=False, doc=doc_min_nbor_dist), - ] - - -# --- model compression configurations: --- # -def model_compression_type_args(): - doc_compress_type = "The type of model compression, which should be consistent with the descriptor type." - - return Variant( - "type", - [Argument("se_e2_a", dict, model_compression(), alias=["se_a"])], - optional=True, - default_tag="se_e2_a", - doc=doc_compress_type, - ) - - -def model_args(exclude_hybrid=False): - doc_type_map = "A list of strings. Give the name to each type of atoms. It is noted that the number of atom type of training system must be less than 128 in a GPU environment. If not given, type.raw in each system should use the same type indexes, and type_map.raw will take no effect." - doc_data_stat_nbatch = "The model determines the normalization from the statistics of the data. This key specifies the number of `frames` in each `system` used for statistics." 
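As a concrete illustration of the `dipole_charge` modifier arguments above, a hedged sketch follows; the model file name and the charge values are hypothetical placeholders, only `ewald_beta` and `ewald_h` restate the documented defaults.

```python
# Illustrative "modifier" block for the dipole_charge scheme; the file name
# and charges below are hypothetical placeholders.
modifier = {
    "type": "dipole_charge",
    "model_name": "frozen_dipole_model.pb",  # hypothetical path
    "model_charge_map": [-8.0],              # charge of the WFCC (placeholder)
    "sys_charge_map": [6.0, 1.0],            # charge of real atom types (placeholder)
    "ewald_beta": 0.4,
    "ewald_h": 1.0,
}
```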
- doc_data_stat_protect = "Protect parameter for atomic energy regression." - doc_data_bias_nsample = "The number of training samples in a system to compute and change the energy bias." - doc_type_embedding = "The type embedding." - doc_modifier = "The modifier of model output." - doc_use_srtab = "The table for the short-range pairwise interaction added on top of DP. The table is a text data file with (N_t + 1) * N_t / 2 + 1 columes. The first colume is the distance between atoms. The second to the last columes are energies for pairs of certain types. For example we have two atom types, 0 and 1. The columes from 2nd to 4th are for 0-0, 0-1 and 1-1 correspondingly." - doc_smin_alpha = "The short-range tabulated interaction will be swithed according to the distance of the nearest neighbor. This distance is calculated by softmin. This parameter is the decaying parameter in the softmin. It is only required when `use_srtab` is provided." - doc_sw_rmin = "The lower boundary of the interpolation between short-range tabulated interaction and DP. It is only required when `use_srtab` is provided." - doc_sw_rmax = "The upper boundary of the interpolation between short-range tabulated interaction and DP. It is only required when `use_srtab` is provided." - doc_srtab_add_bias = "Whether add energy bias from the statistics of the data to short-range tabulated atomic energy. It only takes effect when `use_srtab` is provided." - doc_compress_config = "Model compression configurations" - doc_spin = "The settings for systems with spin." - hybrid_models = [] - if not exclude_hybrid: - hybrid_models.extend( - [ - pairwise_dprc(), - linear_ener_model_args(), - ] - ) - return Argument( - "model", - dict, - [ - Argument("type_map", List[str], optional=True, doc=doc_type_map), - Argument( - "data_stat_nbatch", - int, - optional=True, - default=10, - doc=doc_data_stat_nbatch, - ), - Argument( - "data_stat_protect", - float, - optional=True, - default=1e-2, - doc=doc_data_stat_protect, - ), - Argument( - "data_bias_nsample", - int, - optional=True, - default=10, - doc=doc_data_bias_nsample, - ), - Argument("use_srtab", str, optional=True, doc=doc_use_srtab), - Argument("smin_alpha", float, optional=True, doc=doc_smin_alpha), - Argument("sw_rmin", float, optional=True, doc=doc_sw_rmin), - Argument("sw_rmax", float, optional=True, doc=doc_sw_rmax), - Argument( - "srtab_add_bias", - bool, - optional=True, - default=True, - doc=doc_srtab_add_bias, - ), - Argument( - "type_embedding", - dict, - type_embedding_args(), - [], - optional=True, - doc=doc_type_embedding, - ), - Argument( - "modifier", - dict, - [], - [modifier_variant_type_args()], - optional=True, - doc=doc_modifier, - ), - Argument( - "compress", - dict, - [], - [model_compression_type_args()], - optional=True, - doc=doc_compress_config, - fold_subdoc=True, - ), - Argument("spin", dict, spin_args(), [], optional=True, doc=doc_spin), - ], - [ - Variant( - "type", - [ - standard_model_args(), - multi_model_args(), - frozen_model_args(), - *hybrid_models, - ], - optional=True, - default_tag="standard", - ), - ], - ) - - -def standard_model_args() -> Argument: - doc_descrpt = "The descriptor of atomic environment." - doc_fitting = "The fitting of physical properties." 
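Putting these pieces together, a standard model (a descriptor plus a fitting net) occupies the `model` section of the input. A minimal sketch, assuming a two-element type map as an example:

```python
# Minimal sketch of a standard "model" section; the type_map entries are an
# assumed example, the nested blocks reuse defaults documented above.
model = {
    "type_map": ["O", "H"],  # assumed example types
    "descriptor": {"type": "se_e2_a", "rcut": 6.0, "sel": "auto"},
    "fitting_net": {"type": "ener", "neuron": [120, 120, 120]},
}
```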
- - ca = Argument( - "standard", - dict, - [ - Argument( - "descriptor", dict, [], [descrpt_variant_type_args()], doc=doc_descrpt - ), - Argument( - "fitting_net", - dict, - [], - [fitting_variant_type_args()], - doc=doc_fitting, - ), - ], - doc="Stardard model, which contains a descriptor and a fitting.", - ) - return ca - - -def multi_model_args() -> Argument: - doc_descrpt = "The descriptor of atomic environment. See model[standard]/descriptor for details." - doc_fitting_net_dict = "The dictionary of multiple fitting nets in multi-task mode. Each fitting_net_dict[fitting_key] is the single definition of fitting of physical properties with user-defined name `fitting_key`." - - ca = Argument( - "multi", - dict, - [ - Argument( - "descriptor", - dict, - [], - [descrpt_variant_type_args()], - doc=doc_descrpt, - fold_subdoc=True, - ), - Argument("fitting_net_dict", dict, doc=doc_fitting_net_dict), - ], - doc="Multiple-task model.", - ) - return ca - - -def pairwise_dprc() -> Argument: - qm_model_args = model_args(exclude_hybrid=True) - qm_model_args.name = "qm_model" - qm_model_args.fold_subdoc = True - qmmm_model_args = model_args(exclude_hybrid=True) - qmmm_model_args.name = "qmmm_model" - qmmm_model_args.fold_subdoc = True - ca = Argument( - "pairwise_dprc", - dict, - [ - qm_model_args, - qmmm_model_args, - ], - ) - return ca - - -def frozen_model_args() -> Argument: - doc_model_file = "Path to the frozen model file." - ca = Argument( - "frozen", - dict, - [ - Argument("model_file", str, optional=False, doc=doc_model_file), - ], - ) - return ca - - -def linear_ener_model_args() -> Argument: - doc_weights = ( - "If the type is list of float, a list of weights for each model. " - 'If "mean", the weights are set to be 1 / len(models). ' - 'If "sum", the weights are set to be 1.' - ) - models_args = model_args(exclude_hybrid=True) - models_args.name = "models" - models_args.fold_subdoc = True - models_args.set_dtype(list) - models_args.set_repeat(True) - models_args.doc = "The sub-models." - ca = Argument( - "linear_ener", - dict, - [ - models_args, - Argument( - "weights", - [list, str], - optional=False, - doc=doc_weights, - ), - ], - ) - return ca - - -# --- Learning rate configurations: --- # -def learning_rate_exp(): - doc_start_lr = "The learning rate at the start of the training." - doc_stop_lr = "The desired learning rate at the end of the training." - doc_decay_steps = ( - "The learning rate is decaying every this number of training steps." - ) - - args = [ - Argument("start_lr", float, optional=True, default=1e-3, doc=doc_start_lr), - Argument("stop_lr", float, optional=True, default=1e-8, doc=doc_stop_lr), - Argument("decay_steps", int, optional=True, default=5000, doc=doc_decay_steps), - ] - return args - - -def learning_rate_variant_type_args(): - doc_lr = "The type of the learning rate." - - return Variant( - "type", - [Argument("exp", dict, learning_rate_exp())], - optional=True, - default_tag="exp", - doc=doc_lr, - ) - - -def learning_rate_args(): - doc_scale_by_worker = "When parallel training or batch size scaled, how to alter learning rate. Valid values are `linear`(default), `sqrt` or `none`." 
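The exponential learning-rate defaults above translate directly into a `learning_rate` block. A sketch restating the documented defaults:

```python
# Sketch of the "learning_rate" section using the documented defaults.
learning_rate = {
    "type": "exp",
    "start_lr": 1.0e-3,
    "stop_lr": 1.0e-8,
    "decay_steps": 5000,
    "scale_by_worker": "linear",
}
```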
- doc_lr = "The definitio of learning rate" - return Argument( - "learning_rate", - dict, - [ - Argument( - "scale_by_worker", - str, - optional=True, - default="linear", - doc=doc_scale_by_worker, - ) - ], - [learning_rate_variant_type_args()], - optional=True, - doc=doc_lr, - ) - - -def learning_rate_dict_args(): - doc_learning_rate_dict = ( - "The dictionary of definitions of learning rates in multi-task mode. " - "Each learning_rate_dict[fitting_key], with user-defined name `fitting_key` in `model/fitting_net_dict`, is the single definition of learning rate.\n" - ) - ca = Argument( - "learning_rate_dict", dict, [], [], optional=True, doc=doc_learning_rate_dict - ) - return ca - - -# --- Loss configurations: --- # -def start_pref(item, label=None, abbr=None): - if label is None: - label = item - if abbr is None: - abbr = item - return f"The prefactor of {item} loss at the start of the training. Should be larger than or equal to 0. If set to none-zero value, the {label} label should be provided by file {label}.npy in each data system. If both start_pref_{abbr} and limit_pref_{abbr} are set to 0, then the {item} will be ignored." - - -def limit_pref(item): - return f"The prefactor of {item} loss at the limit of the training, Should be larger than or equal to 0. i.e. the training step goes to infinity." - - -loss_args_plugin = ArgsPlugin() - - -@loss_args_plugin.register("ener") -def loss_ener(): - doc_start_pref_e = start_pref("energy", abbr="e") - doc_limit_pref_e = limit_pref("energy") - doc_start_pref_f = start_pref("force", abbr="f") - doc_limit_pref_f = limit_pref("force") - doc_start_pref_v = start_pref("virial", abbr="v") - doc_limit_pref_v = limit_pref("virial") - doc_start_pref_ae = start_pref("atomic energy", label="atom_ener", abbr="ae") - doc_limit_pref_ae = limit_pref("atomic energy") - doc_start_pref_pf = start_pref( - "atomic prefactor force", label="atom_pref", abbr="pf" - ) - doc_limit_pref_pf = limit_pref("atomic prefactor force") - doc_start_pref_gf = start_pref("generalized force", label="drdq", abbr="gf") - doc_limit_pref_gf = limit_pref("generalized force") - doc_numb_generalized_coord = "The dimension of generalized coordinates. Required when generalized force loss is used." - doc_relative_f = "If provided, relative force error will be used in the loss. The difference of force will be normalized by the magnitude of the force in the label with a shift given by `relative_f`, i.e. DF_i / ( || F || + relative_f ) with DF denoting the difference between prediction and label and || F || denoting the L2 norm of the label." - doc_enable_atom_ener_coeff = "If true, the energy will be computed as \\sum_i c_i E_i. c_i should be provided by file atom_ener_coeff.npy in each data system, otherwise it's 1." 
- return [ - Argument( - "start_pref_e", - [float, int], - optional=True, - default=0.02, - doc=doc_start_pref_e, - ), - Argument( - "limit_pref_e", - [float, int], - optional=True, - default=1.00, - doc=doc_limit_pref_e, - ), - Argument( - "start_pref_f", - [float, int], - optional=True, - default=1000, - doc=doc_start_pref_f, - ), - Argument( - "limit_pref_f", - [float, int], - optional=True, - default=1.00, - doc=doc_limit_pref_f, - ), - Argument( - "start_pref_v", - [float, int], - optional=True, - default=0.00, - doc=doc_start_pref_v, - ), - Argument( - "limit_pref_v", - [float, int], - optional=True, - default=0.00, - doc=doc_limit_pref_v, - ), - Argument( - "start_pref_ae", - [float, int], - optional=True, - default=0.00, - doc=doc_start_pref_ae, - ), - Argument( - "limit_pref_ae", - [float, int], - optional=True, - default=0.00, - doc=doc_limit_pref_ae, - ), - Argument( - "start_pref_pf", - [float, int], - optional=True, - default=0.00, - doc=doc_start_pref_pf, - ), - Argument( - "limit_pref_pf", - [float, int], - optional=True, - default=0.00, - doc=doc_limit_pref_pf, - ), - Argument("relative_f", [float, None], optional=True, doc=doc_relative_f), - Argument( - "enable_atom_ener_coeff", - [bool], - optional=True, - default=False, - doc=doc_enable_atom_ener_coeff, - ), - Argument( - "start_pref_gf", - float, - optional=True, - default=0.0, - doc=doc_start_pref_gf, - ), - Argument( - "limit_pref_gf", - float, - optional=True, - default=0.0, - doc=doc_limit_pref_gf, - ), - Argument( - "numb_generalized_coord", - int, - optional=True, - default=0, - doc=doc_numb_generalized_coord, - ), - ] - - -@loss_args_plugin.register("ener_spin") -def loss_ener_spin(): - doc_start_pref_e = start_pref("energy") - doc_limit_pref_e = limit_pref("energy") - doc_start_pref_fr = start_pref("force_real_atom") - doc_limit_pref_fr = limit_pref("force_real_atom") - doc_start_pref_fm = start_pref("force_magnetic") - doc_limit_pref_fm = limit_pref("force_magnetic") - doc_start_pref_v = start_pref("virial") - doc_limit_pref_v = limit_pref("virial") - doc_start_pref_ae = start_pref("atom_ener") - doc_limit_pref_ae = limit_pref("atom_ener") - doc_start_pref_pf = start_pref("atom_pref") - doc_limit_pref_pf = limit_pref("atom_pref") - doc_relative_f = "If provided, relative force error will be used in the loss. The difference of force will be normalized by the magnitude of the force in the label with a shift given by `relative_f`, i.e. DF_i / ( || F || + relative_f ) with DF denoting the difference between prediction and label and || F || denoting the L2 norm of the label." - doc_enable_atom_ener_coeff = r"If true, the energy will be computed as \sum_i c_i E_i. c_i should be provided by file atom_ener_coeff.npy in each data system, otherwise it's 1." 
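For the `ener` loss, each prefactor moves from its `start_pref_*` value toward its `limit_pref_*` value as training proceeds (in DeePMD-kit this interpolation typically follows the learning-rate decay). A sketch restating the documented default prefactors:

```python
# Sketch of an "ener" loss section using the documented default prefactors.
loss = {
    "type": "ener",
    "start_pref_e": 0.02, "limit_pref_e": 1.0,
    "start_pref_f": 1000, "limit_pref_f": 1.0,
    "start_pref_v": 0.0,  "limit_pref_v": 0.0,
}
```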
- return [ - Argument( - "start_pref_e", - [float, int], - optional=True, - default=0.02, - doc=doc_start_pref_e, - ), - Argument( - "limit_pref_e", - [float, int], - optional=True, - default=1.00, - doc=doc_limit_pref_e, - ), - Argument( - "start_pref_fr", - [float, int], - optional=True, - default=1000, - doc=doc_start_pref_fr, - ), - Argument( - "limit_pref_fr", - [float, int], - optional=True, - default=1.00, - doc=doc_limit_pref_fr, - ), - Argument( - "start_pref_fm", - [float, int], - optional=True, - default=10000, - doc=doc_start_pref_fm, - ), - Argument( - "limit_pref_fm", - [float, int], - optional=True, - default=10.0, - doc=doc_limit_pref_fm, - ), - Argument( - "start_pref_v", - [float, int], - optional=True, - default=0.00, - doc=doc_start_pref_v, - ), - Argument( - "limit_pref_v", - [float, int], - optional=True, - default=0.00, - doc=doc_limit_pref_v, - ), - Argument( - "start_pref_ae", - [float, int], - optional=True, - default=0.00, - doc=doc_start_pref_ae, - ), - Argument( - "limit_pref_ae", - [float, int], - optional=True, - default=0.00, - doc=doc_limit_pref_ae, - ), - Argument( - "start_pref_pf", - [float, int], - optional=True, - default=0.00, - doc=doc_start_pref_pf, - ), - Argument( - "limit_pref_pf", - [float, int], - optional=True, - default=0.00, - doc=doc_limit_pref_pf, - ), - Argument("relative_f", [float, None], optional=True, doc=doc_relative_f), - Argument( - "enable_atom_ener_coeff", - [bool], - optional=True, - default=False, - doc=doc_enable_atom_ener_coeff, - ), - ] - - -@loss_args_plugin.register("dos") -def loss_dos(): - doc_start_pref_dos = start_pref("Density of State (DOS)") - doc_limit_pref_dos = limit_pref("Density of State (DOS)") - doc_start_pref_cdf = start_pref( - "Cumulative Distribution Function (cumulative intergral of DOS)" - ) - doc_limit_pref_cdf = limit_pref( - "Cumulative Distribution Function (cumulative intergral of DOS)" - ) - doc_start_pref_ados = start_pref("atomic DOS (site-projected DOS)") - doc_limit_pref_ados = limit_pref("atomic DOS (site-projected DOS)") - doc_start_pref_acdf = start_pref("Cumulative integral of atomic DOS") - doc_limit_pref_acdf = limit_pref("Cumulative integral of atomic DOS") - return [ - Argument( - "start_pref_dos", - [float, int], - optional=True, - default=0.00, - doc=doc_start_pref_dos, - ), - Argument( - "limit_pref_dos", - [float, int], - optional=True, - default=0.00, - doc=doc_limit_pref_dos, - ), - Argument( - "start_pref_cdf", - [float, int], - optional=True, - default=0.00, - doc=doc_start_pref_cdf, - ), - Argument( - "limit_pref_cdf", - [float, int], - optional=True, - default=0.00, - doc=doc_limit_pref_cdf, - ), - Argument( - "start_pref_ados", - [float, int], - optional=True, - default=1.00, - doc=doc_start_pref_ados, - ), - Argument( - "limit_pref_ados", - [float, int], - optional=True, - default=1.00, - doc=doc_limit_pref_ados, - ), - Argument( - "start_pref_acdf", - [float, int], - optional=True, - default=0.00, - doc=doc_start_pref_acdf, - ), - Argument( - "limit_pref_acdf", - [float, int], - optional=True, - default=0.00, - doc=doc_limit_pref_acdf, - ), - ] - - -# YWolfeee: Modified to support tensor type of loss args. -@loss_args_plugin.register("tensor") -def loss_tensor(): - # doc_global_weight = "The prefactor of the weight of global loss. It should be larger than or equal to 0. If only `pref` is provided or both are not provided, training will be global mode, i.e. the shape of 'polarizability.npy` or `dipole.npy` should be #frams x [9 or 3]." 
- # doc_local_weight = "The prefactor of the weight of atomic loss. It should be larger than or equal to 0. If only `pref_atomic` is provided, training will be atomic mode, i.e. the shape of `polarizability.npy` or `dipole.npy` should be #frames x ([9 or 3] x #selected atoms). If both `pref` and `pref_atomic` are provided, training will be combined mode, and atomic label should be provided as well." - doc_global_weight = "The prefactor of the weight of global loss. It should be larger than or equal to 0. If controls the weight of loss corresponding to global label, i.e. 'polarizability.npy` or `dipole.npy`, whose shape should be #frames x [9 or 3]. If it's larger than 0.0, this npy should be included." - doc_local_weight = "The prefactor of the weight of atomic loss. It should be larger than or equal to 0. If controls the weight of loss corresponding to atomic label, i.e. `atomic_polarizability.npy` or `atomic_dipole.npy`, whose shape should be #frames x ([9 or 3] x #selected atoms). If it's larger than 0.0, this npy should be included. Both `pref` and `pref_atomic` should be provided, and either can be set to 0.0." - return [ - Argument( - "pref", [float, int], optional=False, default=None, doc=doc_global_weight - ), - Argument( - "pref_atomic", - [float, int], - optional=False, - default=None, - doc=doc_local_weight, - ), - ] - - -def loss_variant_type_args(): - doc_loss = "The type of the loss. When the fitting type is `ener`, the loss type should be set to `ener` or left unset. When the fitting type is `dipole` or `polar`, the loss type should be set to `tensor`." - - return Variant( - "type", - loss_args_plugin.get_all_argument(), - optional=True, - default_tag="ener", - doc=doc_loss, - ) - - -def loss_args(): - doc_loss = "The definition of loss function. The loss type should be set to `tensor`, `ener` or left unset." - ca = Argument( - "loss", dict, [], [loss_variant_type_args()], optional=True, doc=doc_loss - ) - return ca - - -def loss_dict_args(): - doc_loss_dict = ( - "The dictionary of definitions of multiple loss functions in multi-task mode. " - "Each loss_dict[fitting_key], with user-defined name `fitting_key` in `model/fitting_net_dict`, is the single definition of loss function, whose type should be set to `tensor`, `ener` or left unset.\n" - ) - ca = Argument("loss_dict", dict, [], [], optional=True, doc=doc_loss_dict) - return ca - - -# --- Training configurations: --- # -def training_data_args(): # ! added by Ziyao: new specification style for data systems. - link_sys = make_link("systems", "training/training_data/systems") - doc_systems = ( - "The data systems for training. " - "This key can be provided with a list that specifies the systems, or be provided with a string " - "by which the prefix of all systems are given and the list of the systems is automatically generated." - ) - doc_set_prefix = f"The prefix of the sets in the {link_sys}." - doc_batch_size = f'This key can be \n\n\ -- list: the length of which is the same as the {link_sys}. 
The batch size of each system is given by the elements of the list.\n\n\ -- int: all {link_sys} use the same batch size.\n\n\ -- string "auto": automatically determines the batch size so that the batch_size times the number of atoms in the system is no less than 32.\n\n\ -- string "auto:N": automatically determines the batch size so that the batch_size times the number of atoms in the system is no less than N.\n\n\ -- string "mixed:N": the batch data will be sampled from all systems and merged into a mixed system with the batch size N. Only support the se_atten descriptor.\n\n\ -If MPI is used, the value should be considered as the batch size per task.' - doc_auto_prob_style = 'Determine the probability of systems automatically. The method is assigned by this key and can be\n\n\ -- "prob_uniform" : the probability all the systems are equal, namely 1.0/self.get_nsystems()\n\n\ -- "prob_sys_size" : the probability of a system is proportional to the number of batches in the system\n\n\ -- "prob_sys_size;stt_idx:end_idx:weight;stt_idx:end_idx:weight;..." : the list of systems is devided into blocks. A block is specified by `stt_idx:end_idx:weight`, where `stt_idx` is the starting index of the system, `end_idx` is then ending (not including) index of the system, the probabilities of the systems in this block sums up to `weight`, and the relatively probabilities within this block is proportional to the number of batches in the system.' - doc_sys_probs = ( - "A list of float if specified. " - "Should be of the same length as `systems`, " - "specifying the probability of each system." - ) - - args = [ - Argument( - "systems", [List[str], str], optional=False, default=".", doc=doc_systems - ), - Argument("set_prefix", str, optional=True, default="set", doc=doc_set_prefix), - Argument( - "batch_size", - [List[int], int, str], - optional=True, - default="auto", - doc=doc_batch_size, - ), - Argument( - "auto_prob", - str, - optional=True, - default="prob_sys_size", - doc=doc_auto_prob_style, - alias=[ - "auto_prob_style", - ], - ), - Argument( - "sys_probs", - List[float], - optional=True, - default=None, - doc=doc_sys_probs, - alias=["sys_weights"], - ), - ] - - doc_training_data = "Configurations of training data." - return Argument( - "training_data", - dict, - optional=True, - sub_fields=args, - sub_variants=[], - doc=doc_training_data, - ) - - -def validation_data_args(): # ! added by Ziyao: new specification style for data systems. - link_sys = make_link("systems", "training/validation_data/systems") - doc_systems = ( - "The data systems for validation. " - "This key can be provided with a list that specifies the systems, or be provided with a string " - "by which the prefix of all systems are given and the list of the systems is automatically generated." - ) - doc_set_prefix = f"The prefix of the sets in the {link_sys}." - doc_batch_size = f'This key can be \n\n\ -- list: the length of which is the same as the {link_sys}. The batch size of each system is given by the elements of the list.\n\n\ -- int: all {link_sys} use the same batch size.\n\n\ -- string "auto": automatically determines the batch size so that the batch_size times the number of atoms in the system is no less than 32.\n\n\ -- string "auto:N": automatically determines the batch size so that the batch_size times the number of atoms in the system is no less than N.' - doc_auto_prob_style = 'Determine the probability of systems automatically. 
The method is assigned by this key and can be\n\n\ -- "prob_uniform" : the probability all the systems are equal, namely 1.0/self.get_nsystems()\n\n\ -- "prob_sys_size" : the probability of a system is proportional to the number of batches in the system\n\n\ -- "prob_sys_size;stt_idx:end_idx:weight;stt_idx:end_idx:weight;..." : the list of systems is devided into blocks. A block is specified by `stt_idx:end_idx:weight`, where `stt_idx` is the starting index of the system, `end_idx` is then ending (not including) index of the system, the probabilities of the systems in this block sums up to `weight`, and the relatively probabilities within this block is proportional to the number of batches in the system.' - doc_sys_probs = ( - "A list of float if specified. " - "Should be of the same length as `systems`, " - "specifying the probability of each system." - ) - doc_numb_btch = "An integer that specifies the number of batches to be sampled for each validation period." - - args = [ - Argument( - "systems", [List[str], str], optional=False, default=".", doc=doc_systems - ), - Argument("set_prefix", str, optional=True, default="set", doc=doc_set_prefix), - Argument( - "batch_size", - [List[int], int, str], - optional=True, - default="auto", - doc=doc_batch_size, - ), - Argument( - "auto_prob", - str, - optional=True, - default="prob_sys_size", - doc=doc_auto_prob_style, - alias=[ - "auto_prob_style", - ], - ), - Argument( - "sys_probs", - List[float], - optional=True, - default=None, - doc=doc_sys_probs, - alias=["sys_weights"], - ), - Argument( - "numb_btch", - int, - optional=True, - default=1, - doc=doc_numb_btch, - alias=[ - "numb_batch", - ], - ), - ] - - doc_validation_data = ( - "Configurations of validation data. Similar to that of training data, " - "except that a `numb_btch` argument may be configured" - ) - return Argument( - "validation_data", - dict, - optional=True, - default=None, - sub_fields=args, - sub_variants=[], - doc=doc_validation_data, - ) - - -def mixed_precision_args(): # ! added by Denghui. - doc_output_prec = 'The precision for mixed precision params. " \ - "The trainable variables precision during the mixed precision training process, " \ - "supported options are float32 only currently.' - doc_compute_prec = 'The precision for mixed precision compute. " \ - "The compute precision during the mixed precision training process, "" \ - "supported options are float16 and bfloat16 currently.' - - args = [ - Argument( - "output_prec", str, optional=True, default="float32", doc=doc_output_prec - ), - Argument( - "compute_prec", str, optional=False, default="float16", doc=doc_compute_prec - ), - ] - - doc_mixed_precision = "Configurations of mixed precision." - return Argument( - "mixed_precision", - dict, - optional=True, - sub_fields=args, - sub_variants=[], - doc=doc_mixed_precision, - ) - - -def training_args(): # ! modified by Ziyao: data configuration isolated. - doc_numb_steps = "Number of training batch. Each training uses one batch of data." - doc_seed = "The random seed for getting frames from the training data set." - doc_disp_file = "The file for printing learning curve." - doc_disp_freq = "The frequency of printing learning curve." - doc_save_freq = "The frequency of saving check point." - doc_save_ckpt = "The path prefix of saving check point files." - doc_disp_training = "Displaying verbose information during training." - doc_time_training = "Timing durining training." - doc_profiling = "Profiling during training." 
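For reference, the `loss` (`pref`/`pref_atomic`), `training_data`, `validation_data`, and `mixed_precision` arguments documented above may be easier to follow next to a concrete input fragment. The sketch below is illustrative only: the system paths, step counts, and prefactors are invented, and it simply follows the JSON layout these argument definitions describe.

# Illustrative input fragment (values invented); loss type "tensor" uses the
# pref / pref_atomic prefactors documented above, and batch_size / auto_prob
# follow the string forms accepted by training_data_args().
input_fragment = {
    "loss": {"type": "tensor", "pref": 1.0, "pref_atomic": 1.0},
    "training": {
        "training_data": {
            "systems": ["./data/sys0", "./data/sys1"],  # or a single prefix string
            "batch_size": "auto:64",  # batch_size * natoms >= 64 per system
            "auto_prob": "prob_sys_size;0:1:0.7;1:2:0.3",
        },
        "validation_data": {
            "systems": ["./data/val0"],
            "batch_size": "auto",  # batch_size * natoms >= 32
            "numb_btch": 5,
        },
        "mixed_precision": {"output_prec": "float32", "compute_prec": "float16"},
        "numb_steps": 100000,
    },
}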
- doc_profiling_file = "Output file for profiling." - doc_enable_profiler = "Enable TensorFlow Profiler (available in TensorFlow 2.3) to analyze performance. The log will be saved to `tensorboard_log_dir`." - doc_tensorboard = "Enable tensorboard" - doc_tensorboard_log_dir = "The log directory of tensorboard outputs" - doc_tensorboard_freq = "The frequency of writing tensorboard events." - doc_data_dict = ( - "The dictionary of multi DataSystems in multi-task mode. " - "Each data_dict[fitting_key], with user-defined name `fitting_key` in `model/fitting_net_dict`, " - "contains training data and optional validation data definitions." - ) - doc_fitting_weight = ( - "Each fitting_weight[fitting_key], with user-defined name `fitting_key` in `model/fitting_net_dict`, " - "is the training weight of fitting net `fitting_key`. " - "Fitting nets with higher weights will be selected with higher probabilities to be trained in one step. " - "Weights will be normalized and minus ones will be ignored. " - "If not set, each fitting net will be equally selected when training." - ) - - arg_training_data = training_data_args() - arg_validation_data = validation_data_args() - mixed_precision_data = mixed_precision_args() - - args = [ - arg_training_data, - arg_validation_data, - mixed_precision_data, - Argument( - "numb_steps", int, optional=False, doc=doc_numb_steps, alias=["stop_batch"] - ), - Argument("seed", [int, None], optional=True, doc=doc_seed), - Argument( - "disp_file", str, optional=True, default="lcurve.out", doc=doc_disp_file - ), - Argument("disp_freq", int, optional=True, default=1000, doc=doc_disp_freq), - Argument("save_freq", int, optional=True, default=1000, doc=doc_save_freq), - Argument( - "save_ckpt", str, optional=True, default="model.ckpt", doc=doc_save_ckpt - ), - Argument( - "disp_training", bool, optional=True, default=True, doc=doc_disp_training - ), - Argument( - "time_training", bool, optional=True, default=True, doc=doc_time_training - ), - Argument("profiling", bool, optional=True, default=False, doc=doc_profiling), - Argument( - "profiling_file", - str, - optional=True, - default="timeline.json", - doc=doc_profiling_file, - ), - Argument( - "enable_profiler", - bool, - optional=True, - default=False, - doc=doc_enable_profiler, - ), - Argument( - "tensorboard", bool, optional=True, default=False, doc=doc_tensorboard - ), - Argument( - "tensorboard_log_dir", - str, - optional=True, - default="log", - doc=doc_tensorboard_log_dir, - ), - Argument( - "tensorboard_freq", int, optional=True, default=1, doc=doc_tensorboard_freq - ), - Argument("data_dict", dict, optional=True, doc=doc_data_dict), - Argument("fitting_weight", dict, optional=True, doc=doc_fitting_weight), - ] - - doc_training = "The training options." 
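In multi-task mode, the `data_dict` and `fitting_weight` keys described above replace the single data section. A hypothetical fragment is sketched below; the fitting keys "water" and "ice" are invented and would have to match entries of `model/fitting_net_dict`.

# Hypothetical multi-task fragment; weights are relative and are normalized
# later (non-positive entries are dropped, see normalize_fitting_weight below).
multi_task_training = {
    "data_dict": {
        "water": {
            "training_data": {"systems": ["./water/train"], "batch_size": "auto"},
            "validation_data": {"systems": ["./water/val"], "batch_size": 1},
        },
        "ice": {
            "training_data": {"systems": ["./ice/train"], "batch_size": "auto"},
        },
    },
    "fitting_weight": {"water": 2.0, "ice": 1.0},
    "numb_steps": 200000,
}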
- return Argument("training", dict, args, [], doc=doc_training) - - -def make_index(keys): - ret = [] - for ii in keys: - ret.append(make_link(ii, ii)) - return ", ".join(ret) - - -def gen_doc(*, make_anchor=True, make_link=True, **kwargs): - if make_link: - make_anchor = True - ptr = [] - for ii in gen_args(): - ptr.append(ii.gen_doc(make_anchor=make_anchor, make_link=make_link, **kwargs)) - - key_words = [] - for ii in "\n\n".join(ptr).split("\n"): - if "argument path" in ii: - key_words.append(ii.split(":")[1].replace("`", "").strip()) - # ptr.insert(0, make_index(key_words)) - - return "\n\n".join(ptr) - - -def gen_json(**kwargs): - return json.dumps( - tuple(gen_args()), - cls=ArgumentEncoder, - ) - - -def gen_args(**kwargs) -> List[Argument]: - return [ - model_args(), - learning_rate_args(), - learning_rate_dict_args(), - loss_args(), - loss_dict_args(), - training_args(), - nvnmd_args(), - ] - - -def normalize_multi_task(data): - # single-task or multi-task mode - if data["model"].get("type", "standard") not in ("standard", "multi"): - return data - single_fitting_net = "fitting_net" in data["model"].keys() - single_training_data = "training_data" in data["training"].keys() - single_valid_data = "validation_data" in data["training"].keys() - single_loss = "loss" in data.keys() - single_learning_rate = "learning_rate" in data.keys() - multi_fitting_net = "fitting_net_dict" in data["model"].keys() - multi_training_data = "data_dict" in data["training"].keys() - multi_loss = "loss_dict" in data.keys() - multi_fitting_weight = "fitting_weight" in data["training"].keys() - multi_learning_rate = "learning_rate_dict" in data.keys() - assert (single_fitting_net == single_training_data) and ( - multi_fitting_net == multi_training_data - ), ( - "In single-task mode, 'model/fitting_net' and 'training/training_data' must be defined at the same time! " - "While in multi-task mode, 'model/fitting_net_dict', 'training/data_dict' " - "must be defined at the same time! Please check your input script. " - ) - assert not (single_fitting_net and multi_fitting_net), ( - "Single-task mode and multi-task mode can not be performed together. " - "Please check your input script and choose just one format! " - ) - assert ( - single_fitting_net or multi_fitting_net - ), "Please define your fitting net and training data! " - if multi_fitting_net: - assert not single_valid_data, ( - "In multi-task mode, 'training/validation_data' should not appear " - "outside 'training/data_dict'! Please check your input script." - ) - assert ( - not single_loss - ), "In multi-task mode, please use 'model/loss_dict' in stead of 'model/loss'! " - assert ( - "type_map" in data["model"] - ), "In multi-task mode, 'model/type_map' must be defined! 
" - data["model"]["type"] = "multi" - data["model"]["fitting_net_dict"] = normalize_fitting_net_dict( - data["model"]["fitting_net_dict"] - ) - data["training"]["data_dict"] = normalize_data_dict( - data["training"]["data_dict"] - ) - data["loss_dict"] = ( - normalize_loss_dict( - data["model"]["fitting_net_dict"].keys(), data["loss_dict"] - ) - if multi_loss - else {} - ) - if multi_learning_rate: - data["learning_rate_dict"] = normalize_learning_rate_dict( - data["model"]["fitting_net_dict"].keys(), data["learning_rate_dict"] - ) - elif single_learning_rate: - data[ - "learning_rate_dict" - ] = normalize_learning_rate_dict_with_single_learning_rate( - data["model"]["fitting_net_dict"].keys(), data["learning_rate"] - ) - fitting_weight = ( - data["training"]["fitting_weight"] if multi_fitting_weight else None - ) - data["training"]["fitting_weight"] = normalize_fitting_weight( - data["model"]["fitting_net_dict"].keys(), - data["training"]["data_dict"].keys(), - fitting_weight=fitting_weight, - ) - else: - assert ( - not multi_loss - ), "In single-task mode, please use 'model/loss' in stead of 'model/loss_dict'! " - assert ( - not multi_learning_rate - ), "In single-task mode, please use 'model/learning_rate' in stead of 'model/learning_rate_dict'! " - return data - - -def normalize_fitting_net_dict(fitting_net_dict): - new_dict = {} - base = Argument("base", dict, [], [fitting_variant_type_args()], doc="") - for fitting_key_item in fitting_net_dict: - data = base.normalize_value( - fitting_net_dict[fitting_key_item], trim_pattern="_*" - ) - base.check_value(data, strict=True) - new_dict[fitting_key_item] = data - return new_dict - - -def normalize_data_dict(data_dict): - new_dict = {} - base = Argument( - "base", dict, [training_data_args(), validation_data_args()], [], doc="" - ) - for data_system_key_item in data_dict: - data = base.normalize_value(data_dict[data_system_key_item], trim_pattern="_*") - base.check_value(data, strict=True) - new_dict[data_system_key_item] = data - return new_dict - - -def normalize_loss_dict(fitting_keys, loss_dict): - # check the loss dict - failed_loss_keys = [item for item in loss_dict if item not in fitting_keys] - assert ( - not failed_loss_keys - ), "Loss dict key(s) {} not have corresponding fitting keys in {}! ".format( - str(failed_loss_keys), str(list(fitting_keys)) - ) - new_dict = {} - base = Argument("base", dict, [], [loss_variant_type_args()], doc="") - for item in loss_dict: - data = base.normalize_value(loss_dict[item], trim_pattern="_*") - base.check_value(data, strict=True) - new_dict[item] = data - return new_dict - - -def normalize_learning_rate_dict(fitting_keys, learning_rate_dict): - # check the learning_rate dict - failed_learning_rate_keys = [ - item for item in learning_rate_dict if item not in fitting_keys - ] - assert ( - not failed_learning_rate_keys - ), "Learning rate dict key(s) {} not have corresponding fitting keys in {}! 
".format( - str(failed_learning_rate_keys), str(list(fitting_keys)) - ) - new_dict = {} - base = Argument("base", dict, [], [learning_rate_variant_type_args()], doc="") - for item in learning_rate_dict: - data = base.normalize_value(learning_rate_dict[item], trim_pattern="_*") - base.check_value(data, strict=True) - new_dict[item] = data - return new_dict - - -def normalize_learning_rate_dict_with_single_learning_rate(fitting_keys, learning_rate): - new_dict = {} - base = Argument("base", dict, [], [learning_rate_variant_type_args()], doc="") - data = base.normalize_value(learning_rate, trim_pattern="_*") - base.check_value(data, strict=True) - for fitting_key in fitting_keys: - new_dict[fitting_key] = data - return new_dict - - -def normalize_fitting_weight(fitting_keys, data_keys, fitting_weight=None): - # check the mapping - failed_data_keys = [item for item in data_keys if item not in fitting_keys] - assert ( - not failed_data_keys - ), "Data dict key(s) {} not have corresponding fitting keys in {}! ".format( - str(failed_data_keys), str(list(fitting_keys)) - ) - empty_fitting_keys = [] - valid_fitting_keys = [] - for item in fitting_keys: - if item not in data_keys: - empty_fitting_keys.append(item) - else: - valid_fitting_keys.append(item) - if empty_fitting_keys: - log.warning( - "Fitting net(s) {} have no data and will not be used in training.".format( - str(empty_fitting_keys) - ) - ) - num_pair = len(valid_fitting_keys) - assert num_pair > 0, "No valid training data systems for fitting nets!" - - # check and normalize the fitting weight - new_weight = {} - if fitting_weight is None: - equal_weight = 1.0 / num_pair - for item in fitting_keys: - new_weight[item] = equal_weight if item in valid_fitting_keys else 0.0 - else: - failed_weight_keys = [ - item for item in fitting_weight if item not in fitting_keys - ] - assert ( - not failed_weight_keys - ), "Fitting weight key(s) {} not have corresponding fitting keys in {}! ".format( - str(failed_weight_keys), str(list(fitting_keys)) - ) - sum_prob = 0.0 - for item in fitting_keys: - if item in valid_fitting_keys: - if ( - item in fitting_weight - and isinstance(fitting_weight[item], (int, float)) - and fitting_weight[item] > 0.0 - ): - sum_prob += fitting_weight[item] - new_weight[item] = fitting_weight[item] - else: - valid_fitting_keys.remove(item) - log.warning( - f"Fitting net '{item}' has zero or invalid weight " - "and will not be used in training." - ) - new_weight[item] = 0.0 - else: - new_weight[item] = 0.0 - assert sum_prob > 0.0, "No valid training weight for fitting nets!" 
- # normalize - for item in new_weight: - new_weight[item] /= sum_prob - return new_weight - - -def normalize(data): - data = normalize_multi_task(data) - - base = Argument("base", dict, gen_args()) - data = base.normalize_value(data, trim_pattern="_*") - base.check_value(data, strict=True) - - return data - - -if __name__ == "__main__": - gen_doc() +__all__ = [ + "list_to_doc", + "normalize", + "gen_doc", + "gen_json", + "gen_args", + "type_embedding_args", +] diff --git a/deepmd/utils/batch_size.py b/deepmd/utils/batch_size.py index 2b3117d849..863520b3f4 100644 --- a/deepmd/utils/batch_size.py +++ b/deepmd/utils/batch_size.py @@ -1,207 +1,40 @@ # SPDX-License-Identifier: LGPL-3.0-or-later -import logging -import os -from typing import ( - Callable, - Tuple, +from packaging.version import ( + Version, ) -import numpy as np - from deepmd.env import ( + TF_VERSION, tf, ) from deepmd.utils.errors import ( OutOfMemoryError, ) +from deepmd_utils.utils.batch_size import AutoBatchSize as AutoBatchSizeBase -log = logging.getLogger(__name__) - - -class AutoBatchSize: - """This class allows DeePMD-kit to automatically decide the maximum - batch size that will not cause an OOM error. - - Notes - ----- - In some CPU environments, the program may be directly killed when OOM. In - this case, by default the batch size will not be increased for CPUs. The - environment variable `DP_INFER_BATCH_SIZE` can be set as the batch size. - - In other cases, we assume all OOM error will raise :class:`OutOfMemoryError`. - - Parameters - ---------- - initial_batch_size : int, default: 1024 - initial batch size (number of total atoms) when DP_INFER_BATCH_SIZE - is not set - factor : float, default: 2. - increased factor - - Attributes - ---------- - current_batch_size : int - current batch size (number of total atoms) - maximum_working_batch_size : int - maximum working batch size - minimal_not_working_batch_size : int - minimal not working batch size - """ - - def __init__(self, initial_batch_size: int = 1024, factor: float = 2.0) -> None: - # See also PyTorchLightning/pytorch-lightning#1638 - # TODO: discuss a proper initial batch size - self.current_batch_size = initial_batch_size - DP_INFER_BATCH_SIZE = int(os.environ.get("DP_INFER_BATCH_SIZE", 0)) - if DP_INFER_BATCH_SIZE > 0: - self.current_batch_size = DP_INFER_BATCH_SIZE - self.maximum_working_batch_size = DP_INFER_BATCH_SIZE - self.minimal_not_working_batch_size = self.maximum_working_batch_size + 1 - else: - self.maximum_working_batch_size = initial_batch_size - if tf.test.is_gpu_available(): - self.minimal_not_working_batch_size = 2**31 - else: - self.minimal_not_working_batch_size = ( - self.maximum_working_batch_size + 1 - ) - log.warning( - "You can use the environment variable DP_INFER_BATCH_SIZE to" - "control the inference batch size (nframes * natoms). " - "The default value is %d." % initial_batch_size - ) - - self.factor = factor - def execute( - self, callable: Callable, start_index: int, natoms: int - ) -> Tuple[int, tuple]: - """Excuate a method with given batch size. - - Parameters - ---------- - callable : Callable - The method should accept the batch size and start_index as parameters, - and returns executed batch size and data. - start_index : int - start index - natoms : int - natoms +class AutoBatchSize(AutoBatchSizeBase): + def is_gpu_available(self) -> bool: + """Check if GPU is available. 
Returns ------- - int - executed batch size * number of atoms - tuple - result from callable, None if failing to execute - - Raises - ------ - OutOfMemoryError - OOM when batch size is 1 + bool + True if GPU is available """ - if natoms > 0: - batch_nframes = self.current_batch_size // natoms - else: - batch_nframes = self.current_batch_size - try: - n_batch, result = callable(max(batch_nframes, 1), start_index) - except OutOfMemoryError as e: - # TODO: it's very slow to catch OOM error; I don't know what TF is doing here - # but luckily we only need to catch once - self.minimal_not_working_batch_size = min( - self.minimal_not_working_batch_size, self.current_batch_size - ) - if self.maximum_working_batch_size >= self.minimal_not_working_batch_size: - self.maximum_working_batch_size = int( - self.minimal_not_working_batch_size / self.factor - ) - if self.minimal_not_working_batch_size <= natoms: - raise OutOfMemoryError( - "The callable still throws an out-of-memory (OOM) error even when batch size is 1!" - ) from e - # adjust the next batch size - self._adjust_batch_size(1.0 / self.factor) - return 0, None - else: - n_tot = n_batch * natoms - self.maximum_working_batch_size = max( - self.maximum_working_batch_size, n_tot - ) - # adjust the next batch size - if ( - n_tot + natoms > self.current_batch_size - and self.current_batch_size * self.factor - < self.minimal_not_working_batch_size - ): - self._adjust_batch_size(self.factor) - return n_batch, result + return ( + Version(TF_VERSION) >= Version("1.14") + and tf.config.experimental.get_visible_devices("GPU") + ) or tf.test.is_gpu_available() - def _adjust_batch_size(self, factor: float): - old_batch_size = self.current_batch_size - self.current_batch_size = int(self.current_batch_size * factor) - log.info( - "Adjust batch size from %d to %d" - % (old_batch_size, self.current_batch_size) - ) - - def execute_all( - self, callable: Callable, total_size: int, natoms: int, *args, **kwargs - ) -> Tuple[np.ndarray]: - """Excuate a method with all given data. + def is_oom_error(self, e: Exception) -> bool: + """Check if the exception is an OOM error. Parameters ---------- - callable : Callable - The method should accept *args and **kwargs as input and return the similiar array. - total_size : int - Total size - natoms : int - The number of atoms - *args - Variable length argument list. - **kwargs - If 2D np.ndarray, assume the first axis is batch; otherwise do nothing. 
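A usage sketch of the automatic batch-size logic documented above, assuming the refactored base class in deepmd_utils keeps the same execute_all(callable, total_size, natoms, *args, **kwargs) interface and that deepmd-kit is installed; the model function and array shapes are invented.

# Hedged usage sketch; "fake_model" stands in for a real model evaluation.
import numpy as np

from deepmd.utils.batch_size import AutoBatchSize

nframes, natoms = 1000, 192
coords = np.random.rand(nframes, natoms * 3)

def fake_model(coord_batch):
    # pretend per-frame energies; the first axis is the frame axis
    return coord_batch.sum(axis=1, keepdims=True)

auto_batch = AutoBatchSize(initial_batch_size=1024)
# slices `coords` along the first axis, growing or shrinking the batch as needed
energies = auto_batch.execute_all(fake_model, nframes, natoms, coords)
assert energies.shape[0] == nframes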
+ e : Exception + Exception """ - - def execute_with_batch_size( - batch_size: int, start_index: int - ) -> Tuple[int, Tuple[np.ndarray]]: - end_index = start_index + batch_size - end_index = min(end_index, total_size) - return (end_index - start_index), callable( - *[ - ( - vv[start_index:end_index] - if isinstance(vv, np.ndarray) and vv.ndim > 1 - else vv - ) - for vv in args - ], - **{ - kk: ( - vv[start_index:end_index] - if isinstance(vv, np.ndarray) and vv.ndim > 1 - else vv - ) - for kk, vv in kwargs.items() - }, - ) - - index = 0 - results = [] - while index < total_size: - n_batch, result = self.execute(execute_with_batch_size, index, natoms) - if not isinstance(result, tuple): - result = (result,) - index += n_batch - if n_batch: - for rr in result: - rr.reshape((n_batch, -1)) - results.append(result) - - r = tuple([np.concatenate(r, axis=0) for r in zip(*results)]) - if len(r) == 1: - # avoid returning tuple if callable doesn't return tuple - r = r[0] - return r + # TODO: it's very slow to catch OOM error; I don't know what TF is doing here + # but luckily we only need to catch once + return isinstance(e, (tf.errors.ResourceExhaustedError, OutOfMemoryError)) diff --git a/deepmd/utils/compat.py b/deepmd/utils/compat.py index 5f9c14e6d8..91bf4021ee 100644 --- a/deepmd/utils/compat.py +++ b/deepmd/utils/compat.py @@ -1,392 +1,15 @@ # SPDX-License-Identifier: LGPL-3.0-or-later -"""Module providing compatibility between `0.x.x` and `1.x.x` input versions.""" - -import json -import warnings -from pathlib import ( - Path, -) -from typing import ( - Any, - Dict, - Optional, - Sequence, - Union, -) - -import numpy as np - -from deepmd.common import ( - j_must_have, +"""Alias for backward compatibility.""" +from deepmd_utils.utils.compat import ( + convert_input_v0_v1, + convert_input_v1_v2, + deprecate_numb_test, + update_deepmd_input, ) - -def convert_input_v0_v1( - jdata: Dict[str, Any], warning: bool = True, dump: Optional[Union[str, Path]] = None -) -> Dict[str, Any]: - """Convert input from v0 format to v1. - - Parameters - ---------- - jdata : Dict[str, Any] - loaded json/yaml file - warning : bool, optional - whether to show deprecation warning, by default True - dump : Optional[Union[str, Path]], optional - whether to dump converted file, by default None - - Returns - ------- - Dict[str, Any] - converted output - """ - output = {} - output["model"] = _model(jdata, jdata["use_smooth"]) - output["learning_rate"] = _learning_rate(jdata) - output["loss"] = _loss(jdata) - output["training"] = _training(jdata) - if warning: - _warning_input_v0_v1(dump) - if dump is not None: - with open(dump, "w") as fp: - json.dump(output, fp, indent=4) - return output - - -def _warning_input_v0_v1(fname: Optional[Union[str, Path]]): - msg = ( - "It seems that you are using a deepmd-kit input of version 0.x.x, " - "which is deprecated. we have converted the input to >2.0.0 compatible" - ) - if fname is not None: - msg += f", and output it to file {fname}" - warnings.warn(msg) - - -def _model(jdata: Dict[str, Any], smooth: bool) -> Dict[str, Dict[str, Any]]: - """Convert data to v1 input for non-smooth model. 
- - Parameters - ---------- - jdata : Dict[str, Any] - parsed input json/yaml data - smooth : bool - whether to use smooth or non-smooth descriptor version - - Returns - ------- - Dict[str, Dict[str, Any]] - dictionary with model input parameters and sub-dictionaries for descriptor and - fitting net - """ - model = {} - model["descriptor"] = ( - _smth_descriptor(jdata) if smooth else _nonsmth_descriptor(jdata) - ) - model["fitting_net"] = _fitting_net(jdata) - return model - - -def _nonsmth_descriptor(jdata: Dict[str, Any]) -> Dict[str, Any]: - """Convert data to v1 input for non-smooth descriptor. - - Parameters - ---------- - jdata : Dict[str, Any] - parsed input json/yaml data - - Returns - ------- - Dict[str, Any] - dict with descriptor parameters - """ - descriptor = {} - descriptor["type"] = "loc_frame" - _jcopy(jdata, descriptor, ("sel_a", "sel_r", "rcut", "axis_rule")) - return descriptor - - -def _smth_descriptor(jdata: Dict[str, Any]) -> Dict[str, Any]: - """Convert data to v1 input for smooth descriptor. - - Parameters - ---------- - jdata : Dict[str, Any] - parsed input json/yaml data - - Returns - ------- - Dict[str, Any] - dict with descriptor parameters - """ - descriptor = {} - seed = jdata.get("seed", None) - if seed is not None: - descriptor["seed"] = seed - descriptor["type"] = "se_a" - descriptor["sel"] = jdata["sel_a"] - _jcopy(jdata, descriptor, ("rcut",)) - descriptor["rcut_smth"] = jdata.get("rcut_smth", descriptor["rcut"]) - descriptor["neuron"] = j_must_have(jdata, "filter_neuron") - descriptor["axis_neuron"] = j_must_have(jdata, "axis_neuron", ["n_axis_neuron"]) - descriptor["resnet_dt"] = False - if "resnet_dt" in jdata: - descriptor["resnet_dt"] = jdata["filter_resnet_dt"] - - return descriptor - - -def _fitting_net(jdata: Dict[str, Any]) -> Dict[str, Any]: - """Convert data to v1 input for fitting net. - - Parameters - ---------- - jdata : Dict[str, Any] - parsed input json/yaml data - - Returns - ------- - Dict[str, Any] - dict with fitting net parameters - """ - fitting_net = {} - - seed = jdata.get("seed", None) - if seed is not None: - fitting_net["seed"] = seed - fitting_net["neuron"] = j_must_have(jdata, "fitting_neuron", ["n_neuron"]) - fitting_net["resnet_dt"] = True - if "resnet_dt" in jdata: - fitting_net["resnet_dt"] = jdata["resnet_dt"] - if "fitting_resnet_dt" in jdata: - fitting_net["resnet_dt"] = jdata["fitting_resnet_dt"] - return fitting_net - - -def _learning_rate(jdata: Dict[str, Any]) -> Dict[str, Any]: - """Convert data to v1 input for learning rate section. - - Parameters - ---------- - jdata : Dict[str, Any] - parsed input json/yaml data - - Returns - ------- - Dict[str, Any] - dict with learning rate parameters - """ - learning_rate = {} - learning_rate["type"] = "exp" - _jcopy(jdata, learning_rate, ("decay_steps", "decay_rate", "start_lr")) - return learning_rate - - -def _loss(jdata: Dict[str, Any]) -> Dict[str, Any]: - """Convert data to v1 input for loss function. 
- - Parameters - ---------- - jdata : Dict[str, Any] - parsed input json/yaml data - - Returns - ------- - Dict[str, Any] - dict with loss function parameters - """ - loss: Dict[str, Any] = {} - _jcopy( - jdata, - loss, - ( - "start_pref_e", - "limit_pref_e", - "start_pref_f", - "limit_pref_f", - "start_pref_v", - "limit_pref_v", - ), - ) - if "start_pref_ae" in jdata: - loss["start_pref_ae"] = jdata["start_pref_ae"] - if "limit_pref_ae" in jdata: - loss["limit_pref_ae"] = jdata["limit_pref_ae"] - return loss - - -def _training(jdata: Dict[str, Any]) -> Dict[str, Any]: - """Convert data to v1 input for training. - - Parameters - ---------- - jdata : Dict[str, Any] - parsed input json/yaml data - - Returns - ------- - Dict[str, Any] - dict with training parameters - """ - training = {} - seed = jdata.get("seed", None) - if seed is not None: - training["seed"] = seed - - _jcopy(jdata, training, ("systems", "set_prefix", "stop_batch", "batch_size")) - training["disp_file"] = "lcurve.out" - if "disp_file" in jdata: - training["disp_file"] = jdata["disp_file"] - training["disp_freq"] = j_must_have(jdata, "disp_freq") - training["numb_test"] = j_must_have(jdata, "numb_test") - training["save_freq"] = j_must_have(jdata, "save_freq") - training["save_ckpt"] = j_must_have(jdata, "save_ckpt") - training["disp_training"] = j_must_have(jdata, "disp_training") - training["time_training"] = j_must_have(jdata, "time_training") - if "profiling" in jdata: - training["profiling"] = jdata["profiling"] - if training["profiling"]: - training["profiling_file"] = j_must_have(jdata, "profiling_file") - return training - - -def _jcopy(src: Dict[str, Any], dst: Dict[str, Any], keys: Sequence[str]): - """Copy specified keys from one dict to another. - - Parameters - ---------- - src : Dict[str, Any] - source dictionary - dst : Dict[str, Any] - destination dictionary, will be modified in place - keys : Sequence[str] - list of keys to copy - """ - for k in keys: - dst[k] = src[k] - - -def remove_decay_rate(jdata: Dict[str, Any]): - """Convert decay_rate to stop_lr. - - Parameters - ---------- - jdata : Dict[str, Any] - input data - """ - lr = jdata["learning_rate"] - if "decay_rate" in lr: - decay_rate = lr["decay_rate"] - start_lr = lr["start_lr"] - stop_step = jdata["training"]["stop_batch"] - decay_steps = lr["decay_steps"] - stop_lr = np.exp(np.log(decay_rate) * (stop_step / decay_steps)) * start_lr - lr["stop_lr"] = stop_lr - lr.pop("decay_rate") - - -def convert_input_v1_v2( - jdata: Dict[str, Any], warning: bool = True, dump: Optional[Union[str, Path]] = None -) -> Dict[str, Any]: - tr_cfg = jdata["training"] - tr_data_keys = { - "systems", - "set_prefix", - "batch_size", - "sys_prob", - "auto_prob", - # alias included - "sys_weights", - "auto_prob_style", - } - - tr_data_cfg = {k: v for k, v in tr_cfg.items() if k in tr_data_keys} - new_tr_cfg = {k: v for k, v in tr_cfg.items() if k not in tr_data_keys} - new_tr_cfg["training_data"] = tr_data_cfg - if "training_data" in tr_cfg: - raise RuntimeError( - "Both v1 (training/systems) and v2 (training/training_data) parameters are given." - ) - - jdata["training"] = new_tr_cfg - - # remove deprecated arguments - remove_decay_rate(jdata) - - if warning: - _warning_input_v1_v2(dump) - if dump is not None: - with open(dump, "w") as fp: - json.dump(jdata, fp, indent=4) - - return jdata - - -def _warning_input_v1_v2(fname: Optional[Union[str, Path]]): - msg = ( - "It seems that you are using a deepmd-kit input of version 1.x.x, " - "which is deprecated. 
we have converted the input to >2.0.0 compatible" - ) - if fname is not None: - msg += f", and output it to file {fname}" - warnings.warn(msg) - - -def deprecate_numb_test( - jdata: Dict[str, Any], warning: bool = True, dump: Optional[Union[str, Path]] = None -) -> Dict[str, Any]: - """Deprecate `numb_test` since v2.1. It has taken no effect since v2.0. - - See `#1243 `_. - - Parameters - ---------- - jdata : Dict[str, Any] - loaded json/yaml file - warning : bool, optional - whether to show deprecation warning, by default True - dump : Optional[Union[str, Path]], optional - whether to dump converted file, by default None - - Returns - ------- - Dict[str, Any] - converted output - """ - try: - jdata.get("training", {}).pop("numb_test") - except KeyError: - pass - else: - if warning: - warnings.warn( - "The argument training->numb_test has been deprecated since v2.0.0. " - "Use training->validation_data->batch_size instead." - ) - - if dump is not None: - with open(dump, "w") as fp: - json.dump(jdata, fp, indent=4) - return jdata - - -def update_deepmd_input( - jdata: Dict[str, Any], warning: bool = True, dump: Optional[Union[str, Path]] = None -) -> Dict[str, Any]: - def is_deepmd_v0_input(jdata): - return "model" not in jdata.keys() - - def is_deepmd_v1_input(jdata): - return "systems" in j_must_have(jdata, "training").keys() - - if is_deepmd_v0_input(jdata): - jdata = convert_input_v0_v1(jdata, warning, None) - jdata = convert_input_v1_v2(jdata, False, None) - jdata = deprecate_numb_test(jdata, False, dump) - elif is_deepmd_v1_input(jdata): - jdata = convert_input_v1_v2(jdata, warning, None) - jdata = deprecate_numb_test(jdata, False, dump) - else: - jdata = deprecate_numb_test(jdata, warning, dump) - - return jdata +__all__ = [ + "convert_input_v0_v1", + "convert_input_v1_v2", + "deprecate_numb_test", + "update_deepmd_input", +] diff --git a/deepmd/utils/compress.py b/deepmd/utils/compress.py index c6e68dfe19..7a79dec520 100644 --- a/deepmd/utils/compress.py +++ b/deepmd/utils/compress.py @@ -43,15 +43,15 @@ def get_two_side_type_embedding(self, graph): def get_extra_side_embedding_net_variable( - self, graph_def, type_side, varialbe_name, suffix + self, graph_def, type_side_suffix, varialbe_name, suffix ): ret = {} for i in range(1, self.layer_size + 1): target = get_pattern_nodes_from_graph_def( graph_def, - f"filter_type_all{suffix}/{varialbe_name}_{i}_{type_side}_ebd", + f"filter_type_all{suffix}/{varialbe_name}_{i}{type_side_suffix}", ) - node = target[f"filter_type_all{suffix}/{varialbe_name}_{i}_{type_side}_ebd"] + node = target[f"filter_type_all{suffix}/{varialbe_name}_{i}{type_side_suffix}"] ret["layer_" + str(i)] = node return ret diff --git a/deepmd/utils/data.py b/deepmd/utils/data.py index 423745cddf..a6f888beac 100644 --- a/deepmd/utils/data.py +++ b/deepmd/utils/data.py @@ -1,614 +1,9 @@ -#!/usr/bin/env python3 - # SPDX-License-Identifier: LGPL-3.0-or-later -import logging -from typing import ( - List, - Optional, -) - -import numpy as np - -from deepmd.env import ( - GLOBAL_ENER_FLOAT_PRECISION, - GLOBAL_NP_FLOAT_PRECISION, +"""Alias for backward compatibility.""" +from deepmd_utils.utils.data import ( + DeepmdData, ) -from deepmd.utils import random as dp_random -from deepmd.utils.path import ( - DPPath, -) - -log = logging.getLogger(__name__) - - -class DeepmdData: - """Class for a data system. 
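A minimal usage sketch of the re-exported compatibility helpers, assuming deepmd-kit is installed. The v1-style input below is invented and kept just large enough to exercise the converters shown above; "model" is left empty because version detection only checks for its presence.

from deepmd.utils.compat import update_deepmd_input

v1_style = {
    "model": {},
    "learning_rate": {"start_lr": 1e-3, "decay_steps": 5000, "decay_rate": 0.95},
    "training": {
        "systems": ["./data"],
        "set_prefix": "set",
        "batch_size": "auto",
        "stop_batch": 100000,
        "numb_test": 10,
    },
}
updated = update_deepmd_input(v1_style, warning=True)
assert "training_data" in updated["training"]   # systems et al. moved here
assert "stop_lr" in updated["learning_rate"]    # decay_rate converted away
assert "numb_test" not in updated["training"]   # deprecated key dropped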
- - It loads data from hard disk, and mantains the data as a `data_dict` - - Parameters - ---------- - sys_path - Path to the data system - set_prefix - Prefix for the directories of different sets - shuffle_test - If the test data are shuffled - type_map - Gives the name of different atom types - optional_type_map - If the type_map.raw in each system is optional - modifier - Data modifier that has the method `modify_data` - trn_all_set - Use all sets as training dataset. Otherwise, if the number of sets is more than 1, the last set is left for test. - sort_atoms : bool - Sort atoms by atom types. Required to enable when the data is directly feeded to - descriptors except mixed types. - """ - - def __init__( - self, - sys_path: str, - set_prefix: str = "set", - shuffle_test: bool = True, - type_map: Optional[List[str]] = None, - optional_type_map: bool = True, - modifier=None, - trn_all_set: bool = False, - sort_atoms: bool = True, - ): - """Constructor.""" - root = DPPath(sys_path) - self.dirs = root.glob(set_prefix + ".*") - if not len(self.dirs): - raise FileNotFoundError(f"No {set_prefix}.* is found in {sys_path}") - self.dirs.sort() - # check mix_type format - error_format_msg = ( - "if one of the set is of mixed_type format, " - "then all of the sets in this system should be of mixed_type format!" - ) - self.mixed_type = self._check_mode(self.dirs[0]) - for set_item in self.dirs[1:]: - assert self._check_mode(set_item) == self.mixed_type, error_format_msg - # load atom type - self.atom_type = self._load_type(root) - self.natoms = len(self.atom_type) - # load atom type map - self.type_map = self._load_type_map(root) - assert ( - optional_type_map or self.type_map is not None - ), f"System {sys_path} must have type_map.raw in this mode! " - if self.type_map is not None: - assert len(self.type_map) >= max(self.atom_type) + 1 - # check pbc - self.pbc = self._check_pbc(root) - # enforce type_map if necessary - self.enforce_type_map = False - if type_map is not None and self.type_map is not None and len(type_map): - if not self.mixed_type: - atom_type_ = [ - type_map.index(self.type_map[ii]) for ii in self.atom_type - ] - self.atom_type = np.array(atom_type_, dtype=np.int32) - else: - self.enforce_type_map = True - sorter = np.argsort(type_map) - self.type_idx_map = np.array( - sorter[np.searchsorted(type_map, self.type_map, sorter=sorter)] - ) - # padding for virtual atom - self.type_idx_map = np.append( - self.type_idx_map, np.array([-1], dtype=np.int32) - ) - self.type_map = type_map - if type_map is None and self.type_map is None and self.mixed_type: - raise RuntimeError("mixed_type format must have type_map!") - # make idx map - self.sort_atoms = sort_atoms - self.idx_map = self._make_idx_map(self.atom_type) - # train dirs - self.test_dir = self.dirs[-1] - if trn_all_set: - self.train_dirs = self.dirs - else: - if len(self.dirs) == 1: - self.train_dirs = self.dirs - else: - self.train_dirs = self.dirs[:-1] - self.data_dict = {} - # add box and coord - self.add("box", 9, must=self.pbc) - self.add("coord", 3, atomic=True, must=True) - # the training times of each frame - self.add("numb_copy", 1, must=False, default=1, dtype=int) - # set counters - self.set_count = 0 - self.iterator = 0 - self.shuffle_test = shuffle_test - # set modifier - self.modifier = modifier - - def add( - self, - key: str, - ndof: int, - atomic: bool = False, - must: bool = False, - high_prec: bool = False, - type_sel: Optional[List[int]] = None, - repeat: int = 1, - default: float = 0.0, - dtype: 
Optional[np.dtype] = None, - ): - """Add a data item that to be loaded. - - Parameters - ---------- - key - The key of the item. The corresponding data is stored in `sys_path/set.*/key.npy` - ndof - The number of dof - atomic - The item is an atomic property. - If False, the size of the data should be nframes x ndof - If True, the size of data should be nframes x natoms x ndof - must - The data file `sys_path/set.*/key.npy` must exist. - If must is False and the data file does not exist, the `data_dict[find_key]` is set to 0.0 - high_prec - Load the data and store in float64, otherwise in float32 - type_sel - Select certain type of atoms - repeat - The data will be repeated `repeat` times. - default : float, default=0. - default value of data - dtype : np.dtype, optional - the dtype of data, overwrites `high_prec` if provided - """ - self.data_dict[key] = { - "ndof": ndof, - "atomic": atomic, - "must": must, - "high_prec": high_prec, - "type_sel": type_sel, - "repeat": repeat, - "reduce": None, - "default": default, - "dtype": dtype, - } - return self - - def reduce(self, key_out: str, key_in: str): - """Generate a new item from the reduction of another atom. - - Parameters - ---------- - key_out - The name of the reduced item - key_in - The name of the data item to be reduced - """ - assert key_in in self.data_dict, "cannot find input key" - assert self.data_dict[key_in]["atomic"], "reduced property should be atomic" - assert key_out not in self.data_dict, "output key should not have been added" - assert ( - self.data_dict[key_in]["repeat"] == 1 - ), "reduced proerties should not have been repeated" - - self.data_dict[key_out] = { - "ndof": self.data_dict[key_in]["ndof"], - "atomic": False, - "must": True, - "high_prec": True, - "type_sel": None, - "repeat": 1, - "reduce": key_in, - } - return self - - def get_data_dict(self) -> dict: - """Get the `data_dict`.""" - return self.data_dict - - def check_batch_size(self, batch_size): - """Check if the system can get a batch of data with `batch_size` frames.""" - for ii in self.train_dirs: - if self.data_dict["coord"]["high_prec"]: - tmpe = ( - (ii / "coord.npy").load_numpy().astype(GLOBAL_ENER_FLOAT_PRECISION) - ) - else: - tmpe = (ii / "coord.npy").load_numpy().astype(GLOBAL_NP_FLOAT_PRECISION) - if tmpe.ndim == 1: - tmpe = tmpe.reshape([1, -1]) - if tmpe.shape[0] < batch_size: - return ii, tmpe.shape[0] - return None - - def check_test_size(self, test_size): - """Check if the system can get a test dataset with `test_size` frames.""" - if self.data_dict["coord"]["high_prec"]: - tmpe = ( - (self.test_dir / "coord.npy") - .load_numpy() - .astype(GLOBAL_ENER_FLOAT_PRECISION) - ) - else: - tmpe = ( - (self.test_dir / "coord.npy") - .load_numpy() - .astype(GLOBAL_NP_FLOAT_PRECISION) - ) - if tmpe.ndim == 1: - tmpe = tmpe.reshape([1, -1]) - if tmpe.shape[0] < test_size: - return self.test_dir, tmpe.shape[0] - else: - return None - - def get_batch(self, batch_size: int) -> dict: - """Get a batch of data with `batch_size` frames. The frames are randomly picked from the data system. 
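The add / reduce / get_batch interface documented above can be used roughly as follows. This is a hedged sketch: the path must point at an existing deepmd data system (type.raw plus set.* directories), and the added keys are arbitrary examples rather than a fixed recipe.

from deepmd.utils.data import DeepmdData

data = DeepmdData("path/to/system", set_prefix="set", shuffle_test=True)
# frame-wise energy, stored in float64
data.add("energy", 1, atomic=False, must=False, high_prec=True)
# per-atom label with one dof per atom, optional on disk
data.add("charge", 1, atomic=True, must=False, default=0.0)
# derive a frame-wise total from the per-atom label
data.reduce("total_charge", "charge")
batch = data.get_batch(batch_size=2)  # dict with "coord", "type", "energy", ...
test = data.get_test(ntests=4)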
- - Parameters - ---------- - batch_size - size of the batch - """ - if hasattr(self, "batch_set"): - set_size = self.batch_set["coord"].shape[0] - else: - set_size = 0 - if self.iterator + batch_size > set_size: - self._load_batch_set(self.train_dirs[self.set_count % self.get_numb_set()]) - self.set_count += 1 - set_size = self.batch_set["coord"].shape[0] - iterator_1 = self.iterator + batch_size - if iterator_1 >= set_size: - iterator_1 = set_size - idx = np.arange(self.iterator, iterator_1) - self.iterator += batch_size - ret = self._get_subdata(self.batch_set, idx) - return ret - - def get_test(self, ntests: int = -1) -> dict: - """Get the test data with `ntests` frames. - - Parameters - ---------- - ntests - Size of the test data set. If `ntests` is -1, all test data will be get. - """ - if not hasattr(self, "test_set"): - self._load_test_set(self.test_dir, self.shuffle_test) - if ntests == -1: - idx = None - else: - ntests_ = ( - ntests - if ntests < self.test_set["type"].shape[0] - else self.test_set["type"].shape[0] - ) - # print('ntest', self.test_set['type'].shape[0], ntests, ntests_) - idx = np.arange(ntests_) - ret = self._get_subdata(self.test_set, idx=idx) - if self.modifier is not None: - self.modifier.modify_data(ret, self) - return ret - - def get_ntypes(self) -> int: - """Number of atom types in the system.""" - if self.type_map is not None: - return len(self.type_map) - else: - return max(self.get_atom_type()) + 1 - - def get_type_map(self) -> List[str]: - """Get the type map.""" - return self.type_map - - def get_atom_type(self) -> List[int]: - """Get atom types.""" - return self.atom_type - - def get_numb_set(self) -> int: - """Get number of training sets.""" - return len(self.train_dirs) - - def get_numb_batch(self, batch_size: int, set_idx: int) -> int: - """Get the number of batches in a set.""" - data = self._load_set(self.train_dirs[set_idx]) - ret = data["coord"].shape[0] // batch_size - if ret == 0: - ret = 1 - return ret - - def get_sys_numb_batch(self, batch_size: int) -> int: - """Get the number of batches in the data system.""" - ret = 0 - for ii in range(len(self.train_dirs)): - ret += self.get_numb_batch(batch_size, ii) - return ret - - def get_natoms(self): - """Get number of atoms.""" - return len(self.atom_type) - - def get_natoms_vec(self, ntypes: int): - """Get number of atoms and number of atoms in different types. - - Parameters - ---------- - ntypes - Number of types (may be larger than the actual number of types in the system). 
- - Returns - ------- - natoms - natoms[0]: number of local atoms - natoms[1]: total number of atoms held by this processor - natoms[i]: 2 <= i < Ntypes+2, number of type i atoms - """ - natoms, natoms_vec = self._get_natoms_2(ntypes) - tmp = [natoms, natoms] - tmp = np.append(tmp, natoms_vec) - return tmp.astype(np.int32) - - def avg(self, key): - """Return the average value of an item.""" - if key not in self.data_dict.keys(): - raise RuntimeError("key %s has not been added" % key) - info = self.data_dict[key] - ndof = info["ndof"] - eners = [] - for ii in self.train_dirs: - data = self._load_set(ii) - ei = data[key].reshape([-1, ndof]) - eners.append(ei) - eners = np.concatenate(eners, axis=0) - if eners.size == 0: - return 0 - else: - return np.average(eners, axis=0) - - def _idx_map_sel(self, atom_type, type_sel): - new_types = [] - for ii in atom_type: - if ii in type_sel: - new_types.append(ii) - new_types = np.array(new_types, dtype=int) - natoms = new_types.shape[0] - idx = np.arange(natoms) - idx_map = np.lexsort((idx, new_types)) - return idx_map - - def _get_natoms_2(self, ntypes): - sample_type = self.atom_type - natoms = len(sample_type) - natoms_vec = np.zeros(ntypes).astype(int) - for ii in range(ntypes): - natoms_vec[ii] = np.count_nonzero(sample_type == ii) - return natoms, natoms_vec - - def _get_subdata(self, data, idx=None): - new_data = {} - for ii in data: - dd = data[ii] - if "find_" in ii: - new_data[ii] = dd - else: - if idx is not None: - new_data[ii] = dd[idx] - else: - new_data[ii] = dd - return new_data - - def _load_batch_set(self, set_name: DPPath): - if not hasattr(self, "batch_set") or self.get_numb_set() > 1: - self.batch_set = self._load_set(set_name) - if self.modifier is not None: - self.modifier.modify_data(self.batch_set, self) - self.batch_set, _ = self._shuffle_data(self.batch_set) - self.reset_get_batch() - - def reset_get_batch(self): - self.iterator = 0 - - def _load_test_set(self, set_name: DPPath, shuffle_test): - self.test_set = self._load_set(set_name) - if shuffle_test: - self.test_set, _ = self._shuffle_data(self.test_set) - - def _shuffle_data(self, data): - ret = {} - nframes = data["coord"].shape[0] - idx = np.arange(nframes) - # the training times of each frame - idx = np.repeat(idx, np.reshape(data["numb_copy"], (nframes,))) - dp_random.shuffle(idx) - for kk in data: - if ( - type(data[kk]) == np.ndarray - and len(data[kk].shape) == 2 - and data[kk].shape[0] == nframes - and "find_" not in kk - ): - ret[kk] = data[kk][idx] - else: - ret[kk] = data[kk] - return ret, idx - - def _load_set(self, set_name: DPPath): - # get nframes - if not isinstance(set_name, DPPath): - set_name = DPPath(set_name) - path = set_name / "coord.npy" - if self.data_dict["coord"]["high_prec"]: - coord = path.load_numpy().astype(GLOBAL_ENER_FLOAT_PRECISION) - else: - coord = path.load_numpy().astype(GLOBAL_NP_FLOAT_PRECISION) - if coord.ndim == 1: - coord = coord.reshape([1, -1]) - nframes = coord.shape[0] - assert coord.shape[1] == self.data_dict["coord"]["ndof"] * self.natoms - # load keys - data = {} - for kk in self.data_dict.keys(): - if self.data_dict[kk]["reduce"] is None: - data["find_" + kk], data[kk] = self._load_data( - set_name, - kk, - nframes, - self.data_dict[kk]["ndof"], - atomic=self.data_dict[kk]["atomic"], - high_prec=self.data_dict[kk]["high_prec"], - must=self.data_dict[kk]["must"], - type_sel=self.data_dict[kk]["type_sel"], - repeat=self.data_dict[kk]["repeat"], - default=self.data_dict[kk]["default"], - dtype=self.data_dict[kk]["dtype"], 
- ) - for kk in self.data_dict.keys(): - if self.data_dict[kk]["reduce"] is not None: - k_in = self.data_dict[kk]["reduce"] - ndof = self.data_dict[kk]["ndof"] - data["find_" + kk] = data["find_" + k_in] - tmp_in = data[k_in].astype(GLOBAL_ENER_FLOAT_PRECISION) - data[kk] = np.sum( - np.reshape(tmp_in, [nframes, self.natoms, ndof]), axis=1 - ) - - if self.mixed_type: - # nframes x natoms - atom_type_mix = self._load_type_mix(set_name) - if self.enforce_type_map: - try: - atom_type_mix_ = self.type_idx_map[atom_type_mix].astype(np.int32) - except IndexError as e: - raise IndexError( - "some types in 'real_atom_types.npy' of set {} are not contained in {} types!".format( - set_name, self.get_ntypes() - ) - ) from e - atom_type_mix = atom_type_mix_ - real_type = atom_type_mix.reshape([nframes, self.natoms]) - data["type"] = real_type - natoms = data["type"].shape[1] - # nframes x ntypes - atom_type_nums = np.array( - [(real_type == i).sum(axis=-1) for i in range(self.get_ntypes())], - dtype=np.int32, - ).T - ghost_nums = np.array( - [(real_type == -1).sum(axis=-1)], - dtype=np.int32, - ).T - assert ( - atom_type_nums.sum(axis=-1) + ghost_nums.sum(axis=-1) == natoms - ).all(), "some types in 'real_atom_types.npy' of set {} are not contained in {} types!".format( - set_name, self.get_ntypes() - ) - data["real_natoms_vec"] = np.concatenate( - ( - np.tile(np.array([natoms, natoms], dtype=np.int32), (nframes, 1)), - atom_type_nums, - ), - axis=-1, - ) - else: - data["type"] = np.tile(self.atom_type[self.idx_map], (nframes, 1)) - - return data - - def _load_data( - self, - set_name, - key, - nframes, - ndof_, - atomic=False, - must=True, - repeat=1, - high_prec=False, - type_sel=None, - default: float = 0.0, - dtype: Optional[np.dtype] = None, - ): - if atomic: - natoms = self.natoms - idx_map = self.idx_map - # if type_sel, then revise natoms and idx_map - if type_sel is not None: - natoms = 0 - for jj in type_sel: - natoms += np.sum(self.atom_type == jj) - idx_map = self._idx_map_sel(self.atom_type, type_sel) - ndof = ndof_ * natoms - else: - ndof = ndof_ - if dtype is not None: - pass - elif high_prec: - dtype = GLOBAL_ENER_FLOAT_PRECISION - else: - dtype = GLOBAL_NP_FLOAT_PRECISION - path = set_name / (key + ".npy") - if path.is_file(): - data = path.load_numpy().astype(dtype) - try: # YWolfeee: deal with data shape error - if atomic: - data = data.reshape([nframes, natoms, -1]) - data = data[:, idx_map, :] - data = data.reshape([nframes, -1]) - data = np.reshape(data, [nframes, ndof]) - except ValueError as err_message: - explanation = "This error may occur when your label mismatch it's name, i.e. you might store global tensor in `atomic_tensor.npy` or atomic tensor in `tensor.npy`." - log.error(str(err_message)) - log.error(explanation) - raise ValueError(str(err_message) + ". " + explanation) - if repeat != 1: - data = np.repeat(data, repeat).reshape([nframes, -1]) - return np.float32(1.0), data - elif must: - raise RuntimeError("%s not found!" 
% path) - else: - data = np.full([nframes, ndof], default, dtype=dtype) - if repeat != 1: - data = np.repeat(data, repeat).reshape([nframes, -1]) - return np.float32(0.0), data - - def _load_type(self, sys_path: DPPath): - atom_type = (sys_path / "type.raw").load_txt(ndmin=1).astype(np.int32) - return atom_type - - def _load_type_mix(self, set_name: DPPath): - type_path = set_name / "real_atom_types.npy" - real_type = type_path.load_numpy().astype(np.int32).reshape([-1, self.natoms]) - return real_type - - def _make_idx_map(self, atom_type): - natoms = atom_type.shape[0] - idx = np.arange(natoms) - if self.sort_atoms: - idx_map = np.lexsort((idx, atom_type)) - else: - idx_map = idx - return idx_map - - def _load_type_map(self, sys_path: DPPath): - fname = sys_path / "type_map.raw" - if fname.is_file(): - return fname.load_txt(dtype=str, ndmin=1).tolist() - else: - return None - - def _check_pbc(self, sys_path: DPPath): - pbc = True - if (sys_path / "nopbc").is_file(): - pbc = False - return pbc - def _check_mode(self, set_path: DPPath): - return (set_path / "real_atom_types.npy").is_file() +__all__ = [ + "DeepmdData", +] diff --git a/deepmd/utils/data_system.py b/deepmd/utils/data_system.py index 69a6cbe112..65e87d8ebc 100644 --- a/deepmd/utils/data_system.py +++ b/deepmd/utils/data_system.py @@ -1,653 +1,13 @@ # SPDX-License-Identifier: LGPL-3.0-or-later -import collections -import logging -import warnings -from functools import ( - lru_cache, -) -from typing import ( - List, - Optional, -) - -import numpy as np - -from deepmd.common import ( - make_default_mesh, -) -from deepmd.env import ( - GLOBAL_NP_FLOAT_PRECISION, -) -from deepmd.utils import random as dp_random -from deepmd.utils.data import ( - DeepmdData, -) - -log = logging.getLogger(__name__) - - -class DeepmdDataSystem: - """Class for manipulating many data systems. - - It is implemented with the help of DeepmdData - """ - - def __init__( - self, - systems: List[str], - batch_size: int, - test_size: int, - rcut: float, - set_prefix: str = "set", - shuffle_test: bool = True, - type_map: Optional[List[str]] = None, - optional_type_map: bool = True, - modifier=None, - trn_all_set=False, - sys_probs=None, - auto_prob_style="prob_sys_size", - sort_atoms: bool = True, - ): - """Constructor. - - Parameters - ---------- - systems - Specifying the paths to systems - batch_size - The batch size - test_size - The size of test data - rcut - The cut-off radius - set_prefix - Prefix for the directories of different sets - shuffle_test - If the test data are shuffled - type_map - Gives the name of different atom types - optional_type_map - If the type_map.raw in each system is optional - modifier - Data modifier that has the method `modify_data` - trn_all_set - Use all sets as training dataset. Otherwise, if the number of sets is more than 1, the last set is left for test. - sys_probs : list of float - The probabilitis of systems to get the batch. - Summation of positive elements of this list should be no greater than 1. - Element of this list can be negative, the probability of the corresponding system is determined - automatically by the number of batches in the system. - auto_prob_style : str - Determine the probability of systems automatically. 
The method is assigned by this key and can be - - "prob_uniform" : the probability all the systems are equal, namely 1.0/self.get_nsystems() - - "prob_sys_size" : the probability of a system is proportional to the number of batches in the system - - "prob_sys_size;stt_idx:end_idx:weight;stt_idx:end_idx:weight;..." : - the list of systems is devided into blocks. A block is specified by `stt_idx:end_idx:weight`, - where `stt_idx` is the starting index of the system, `end_idx` is then ending (not including) index of the system, - the probabilities of the systems in this block sums up to `weight`, and the relatively probabilities within this block is proportional - to the number of batches in the system. - sort_atoms : bool - Sort atoms by atom types. Required to enable when the data is directly feeded to - descriptors except mixed types. - """ - # init data - self.rcut = rcut - self.system_dirs = systems - self.nsystems = len(self.system_dirs) - self.data_systems = [] - for ii in self.system_dirs: - self.data_systems.append( - DeepmdData( - ii, - set_prefix=set_prefix, - shuffle_test=shuffle_test, - type_map=type_map, - optional_type_map=optional_type_map, - modifier=modifier, - trn_all_set=trn_all_set, - sort_atoms=sort_atoms, - ) - ) - # check mix_type format - error_format_msg = ( - "if one of the system is of mixed_type format, " - "then all of the systems should be of mixed_type format!" - ) - if self.data_systems[0].mixed_type: - for data_sys in self.data_systems[1:]: - assert data_sys.mixed_type, error_format_msg - self.mixed_type = True - else: - for data_sys in self.data_systems[1:]: - assert not data_sys.mixed_type, error_format_msg - self.mixed_type = False - # batch size - self.batch_size = batch_size - is_auto_bs = False - self.mixed_systems = False - if isinstance(self.batch_size, int): - self.batch_size = self.batch_size * np.ones(self.nsystems, dtype=int) - elif isinstance(self.batch_size, str): - words = self.batch_size.split(":") - if "auto" == words[0]: - is_auto_bs = True - rule = 32 - if len(words) == 2: - rule = int(words[1]) - self.batch_size = self._make_auto_bs(rule) - elif "mixed" == words[0]: - self.mixed_type = True - self.mixed_systems = True - if len(words) == 2: - rule = int(words[1]) - else: - raise RuntimeError("batch size must be specified for mixed systems") - self.batch_size = rule * np.ones(self.nsystems, dtype=int) - else: - raise RuntimeError("unknown batch_size rule " + words[0]) - elif isinstance(self.batch_size, list): - pass - else: - raise RuntimeError("invalid batch_size") - assert isinstance(self.batch_size, (list, np.ndarray)) - assert len(self.batch_size) == self.nsystems - - # natoms, nbatches - ntypes = [] - for ii in self.data_systems: - ntypes.append(ii.get_ntypes()) - self.sys_ntypes = max(ntypes) - self.natoms = [] - self.natoms_vec = [] - self.nbatches = [] - type_map_list = [] - for ii in range(self.nsystems): - self.natoms.append(self.data_systems[ii].get_natoms()) - self.natoms_vec.append( - self.data_systems[ii].get_natoms_vec(self.sys_ntypes).astype(int) - ) - self.nbatches.append( - self.data_systems[ii].get_sys_numb_batch(self.batch_size[ii]) - ) - type_map_list.append(self.data_systems[ii].get_type_map()) - self.type_map = self._check_type_map_consistency(type_map_list) - - # ! altered by Marián Rynik - # test size - # now test size can be set as a percentage of systems data or test size - # can be set for each system individualy in the same manner as batch - # size. 
This enables one to use systems with diverse number of - # structures and different number of atoms. - self.test_size = test_size - if isinstance(self.test_size, int): - self.test_size = self.test_size * np.ones(self.nsystems, dtype=int) - elif isinstance(self.test_size, str): - words = self.test_size.split("%") - try: - percent = int(words[0]) - except ValueError: - raise RuntimeError("unknown test_size rule " + words[0]) - self.test_size = self._make_auto_ts(percent) - elif isinstance(self.test_size, list): - pass - else: - raise RuntimeError("invalid test_size") - assert isinstance(self.test_size, (list, np.ndarray)) - assert len(self.test_size) == self.nsystems - - # prob of batch, init pick idx - self.prob_nbatches = [float(i) for i in self.nbatches] / np.sum(self.nbatches) - self.pick_idx = 0 - - # derive system probabilities - self.sys_probs = None - self.set_sys_probs(sys_probs, auto_prob_style) - - # check batch and test size - for ii in range(self.nsystems): - chk_ret = self.data_systems[ii].check_batch_size(self.batch_size[ii]) - if chk_ret is not None and not is_auto_bs and not self.mixed_systems: - warnings.warn( - "system %s required batch size is larger than the size of the dataset %s (%d > %d)" - % ( - self.system_dirs[ii], - chk_ret[0], - self.batch_size[ii], - chk_ret[1], - ) - ) - chk_ret = self.data_systems[ii].check_test_size(self.test_size[ii]) - if chk_ret is not None and not is_auto_bs and not self.mixed_systems: - warnings.warn( - "system %s required test size is larger than the size of the dataset %s (%d > %d)" - % (self.system_dirs[ii], chk_ret[0], self.test_size[ii], chk_ret[1]) - ) - - def _load_test(self, ntests=-1): - self.test_data = collections.defaultdict(list) - for ii in range(self.nsystems): - test_system_data = self.data_systems[ii].get_test(ntests=ntests) - for nn in test_system_data: - self.test_data[nn].append(test_system_data[nn]) - - @property - @lru_cache(maxsize=None) - def default_mesh(self) -> List[np.ndarray]: - """Mesh for each system.""" - return [ - make_default_mesh( - self.data_systems[ii].pbc, self.data_systems[ii].mixed_type - ) - for ii in range(self.nsystems) - ] - - def compute_energy_shift(self, rcond=None, key="energy"): - sys_ener = [] - for ss in self.data_systems: - sys_ener.append(ss.avg(key)) - sys_ener = np.concatenate(sys_ener) - sys_tynatom = np.array(self.natoms_vec, dtype=GLOBAL_NP_FLOAT_PRECISION) - sys_tynatom = np.reshape(sys_tynatom, [self.nsystems, -1]) - sys_tynatom = sys_tynatom[:, 2:] - energy_shift, resd, rank, s_value = np.linalg.lstsq( - sys_tynatom, sys_ener, rcond=rcond - ) - return energy_shift - - def add_dict(self, adict: dict) -> None: - """Add items to the data system by a `dict`. - `adict` should have items like - .. code-block:: python. - - adict[key] = { - "ndof": ndof, - "atomic": atomic, - "must": must, - "high_prec": high_prec, - "type_sel": type_sel, - "repeat": repeat, - } - - For the explaination of the keys see `add` - """ - for kk in adict: - self.add( - kk, - adict[kk]["ndof"], - atomic=adict[kk]["atomic"], - must=adict[kk]["must"], - high_prec=adict[kk]["high_prec"], - type_sel=adict[kk]["type_sel"], - repeat=adict[kk]["repeat"], - default=adict[kk]["default"], - ) - - def add( - self, - key: str, - ndof: int, - atomic: bool = False, - must: bool = False, - high_prec: bool = False, - type_sel: Optional[List[int]] = None, - repeat: int = 1, - default: float = 0.0, - ): - """Add a data item that to be loaded. - - Parameters - ---------- - key - The key of the item. 
The corresponding data is stored in `sys_path/set.*/key.npy` - ndof - The number of dof - atomic - The item is an atomic property. - If False, the size of the data should be nframes x ndof - If True, the size of data should be nframes x natoms x ndof - must - The data file `sys_path/set.*/key.npy` must exist. - If must is False and the data file does not exist, the `data_dict[find_key]` is set to 0.0 - high_prec - Load the data and store in float64, otherwise in float32 - type_sel - Select certain type of atoms - repeat - The data will be repeated `repeat` times. - default, default=0. - Default value of data - """ - for ii in self.data_systems: - ii.add( - key, - ndof, - atomic=atomic, - must=must, - high_prec=high_prec, - repeat=repeat, - type_sel=type_sel, - default=default, - ) - - def reduce(self, key_out, key_in): - """Generate a new item from the reduction of another atom. - - Parameters - ---------- - key_out - The name of the reduced item - key_in - The name of the data item to be reduced - """ - for ii in self.data_systems: - ii.reduce(key_out, key_in) - - def get_data_dict(self, ii: int = 0) -> dict: - return self.data_systems[ii].get_data_dict() - - def set_sys_probs(self, sys_probs=None, auto_prob_style: str = "prob_sys_size"): - if sys_probs is None: - if auto_prob_style == "prob_uniform": - prob_v = 1.0 / float(self.nsystems) - probs = [prob_v for ii in range(self.nsystems)] - elif auto_prob_style == "prob_sys_size": - probs = self.prob_nbatches - elif auto_prob_style[:14] == "prob_sys_size;": - probs = prob_sys_size_ext( - auto_prob_style, self.get_nsystems(), self.nbatches - ) - else: - raise RuntimeError("Unknown auto prob style: " + auto_prob_style) - else: - probs = process_sys_probs(sys_probs, self.nbatches) - self.sys_probs = probs - - def get_batch(self, sys_idx: Optional[int] = None) -> dict: - # batch generation style altered by Ziyao Li: - # one should specify the "sys_prob" and "auto_prob_style" params - # via set_sys_prob() function. The sys_probs this function uses is - # defined as a private variable, self.sys_probs, initialized in __init__(). - # This is to optimize the (vain) efforts in evaluating sys_probs every batch. - """Get a batch of data from the data systems. - - Parameters - ---------- - sys_idx : int - The index of system from which the batch is get. - If sys_idx is not None, `sys_probs` and `auto_prob_style` are ignored - If sys_idx is None, automatically determine the system according to `sys_probs` or `auto_prob_style`, see the following. - This option does not work for mixed systems. - - Returns - ------- - dict - The batch data - """ - if not self.mixed_systems: - b_data = self.get_batch_standard(sys_idx) - else: - b_data = self.get_batch_mixed() - return b_data - - def get_batch_standard(self, sys_idx: Optional[int] = None) -> dict: - """Get a batch of data from the data systems in the standard way. - - Parameters - ---------- - sys_idx : int - The index of system from which the batch is get. - If sys_idx is not None, `sys_probs` and `auto_prob_style` are ignored - If sys_idx is None, automatically determine the system according to `sys_probs` or `auto_prob_style`, see the following. 
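`get_batch_standard` draws the next system index from `self.sys_probs` through `dp_random.choice`; the snippet below is a plain-NumPy stand-in for that draw (the probabilities are made up) and only shows that systems are sampled in proportion to their weights.

    import numpy as np

    rng = np.random.default_rng(0)
    # hypothetical probabilities for three systems, e.g. proportional to their batch counts
    sys_probs = np.array([0.5, 0.3, 0.2])

    # each call to get_batch_standard() effectively performs one such draw
    picks = rng.choice(np.arange(len(sys_probs)), size=10000, p=sys_probs)
    print(np.bincount(picks) / len(picks))  # roughly [0.5 0.3 0.2]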
- - Returns - ------- - dict - The batch data - """ - if sys_idx is not None: - self.pick_idx = sys_idx - else: - # prob = self._get_sys_probs(sys_probs, auto_prob_style) - self.pick_idx = dp_random.choice(np.arange(self.nsystems), p=self.sys_probs) - b_data = self.data_systems[self.pick_idx].get_batch( - self.batch_size[self.pick_idx] - ) - b_data["natoms_vec"] = self.natoms_vec[self.pick_idx] - b_data["default_mesh"] = self.default_mesh[self.pick_idx] - return b_data - - def get_batch_mixed(self) -> dict: - """Get a batch of data from the data systems in the mixed way. - - Returns - ------- - dict - The batch data - """ - # mixed systems have a global batch size - batch_size = self.batch_size[0] - batch_data = [] - for _ in range(batch_size): - self.pick_idx = dp_random.choice(np.arange(self.nsystems), p=self.sys_probs) - bb_data = self.data_systems[self.pick_idx].get_batch(1) - bb_data["natoms_vec"] = self.natoms_vec[self.pick_idx] - bb_data["default_mesh"] = self.default_mesh[self.pick_idx] - batch_data.append(bb_data) - b_data = self._merge_batch_data(batch_data) - return b_data - - def _merge_batch_data(self, batch_data: List[dict]) -> dict: - """Merge batch data from different systems. - - Parameters - ---------- - batch_data : list of dict - A list of batch data from different systems. - - Returns - ------- - dict - The merged batch data. - """ - b_data = {} - max_natoms = max(bb["natoms_vec"][0] for bb in batch_data) - # natoms_vec - natoms_vec = np.zeros(2 + self.get_ntypes(), dtype=int) - natoms_vec[0:3] = max_natoms - b_data["natoms_vec"] = natoms_vec - # real_natoms_vec - real_natoms_vec = np.vstack([bb["natoms_vec"] for bb in batch_data]) - b_data["real_natoms_vec"] = real_natoms_vec - # type - type_vec = np.full((len(batch_data), max_natoms), -1, dtype=int) - for ii, bb in enumerate(batch_data): - type_vec[ii, : bb["type"].shape[1]] = bb["type"][0] - b_data["type"] = type_vec - # default_mesh - default_mesh = np.mean([bb["default_mesh"] for bb in batch_data], axis=0) - b_data["default_mesh"] = default_mesh - # other data - data_dict = self.get_data_dict(0) - for kk, vv in data_dict.items(): - if kk not in batch_data[0]: - continue - b_data["find_" + kk] = batch_data[0]["find_" + kk] - if not vv["atomic"]: - b_data[kk] = np.concatenate([bb[kk] for bb in batch_data], axis=0) - else: - b_data[kk] = np.zeros( - (len(batch_data), max_natoms * vv["ndof"] * vv["repeat"]), - dtype=batch_data[0][kk].dtype, - ) - for ii, bb in enumerate(batch_data): - b_data[kk][ii, : bb[kk].shape[1]] = bb[kk][0] - return b_data - - # ! altered by Marián Rynik - def get_test(self, sys_idx: Optional[int] = None, n_test: int = -1): # depreciated - """Get test data from the the data systems. - - Parameters - ---------- - sys_idx - The test dat of system with index `sys_idx` will be returned. - If is None, the currently selected system will be returned. - n_test - Number of test data. If set to -1 all test data will be get. - """ - if not hasattr(self, "test_data"): - self._load_test(ntests=n_test) - if sys_idx is not None: - idx = sys_idx - else: - idx = self.pick_idx - - test_system_data = {} - for nn in self.test_data: - test_system_data[nn] = self.test_data[nn][idx] - test_system_data["natoms_vec"] = self.natoms_vec[idx] - test_system_data["default_mesh"] = self.default_mesh[idx] - return test_system_data - - def get_sys_ntest(self, sys_idx=None): - """Get number of tests for the currently selected system, - or one defined by sys_idx. 
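A minimal sketch of the padding convention used by `_merge_batch_data` above: every single-frame batch is padded up to the largest frame in the mixed batch, with -1 marking virtual atoms in the type vector. The frames below are invented.

    import numpy as np

    # hypothetical single-frame type arrays from two systems (shape 1 x natoms)
    frames = [np.array([[0, 0, 1]]), np.array([[1, 0, 1, 1, 0]])]

    max_natoms = max(ff.shape[1] for ff in frames)
    # pad each per-frame type vector with -1 up to max_natoms, as _merge_batch_data does
    type_vec = np.full((len(frames), max_natoms), -1, dtype=int)
    for ii, ff in enumerate(frames):
        type_vec[ii, : ff.shape[1]] = ff[0]
    print(type_vec)
    # [[ 0  0  1 -1 -1]
    #  [ 1  0  1  1  0]]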
- """ - if sys_idx is not None: - return self.test_size[sys_idx] - else: - return self.test_size[self.pick_idx] - - def get_type_map(self) -> List[str]: - """Get the type map.""" - return self.type_map - - def get_nbatches(self) -> int: - """Get the total number of batches.""" - return self.nbatches - - def get_ntypes(self) -> int: - """Get the number of types.""" - return self.sys_ntypes - - def get_nsystems(self) -> int: - """Get the number of data systems.""" - return self.nsystems - - def get_sys(self, idx: int) -> DeepmdData: - """Get a certain data system.""" - return self.data_systems[idx] - - def get_batch_size(self) -> int: - """Get the batch size.""" - return self.batch_size - - def _format_name_length(self, name, width): - if len(name) <= width: - return "{: >{}}".format(name, width) - else: - name = name[-(width - 3) :] - name = "-- " + name - return name - - def print_summary(self, name): - # width 65 - sys_width = 42 - log.info( - f"---Summary of DataSystem: {name:13s}-----------------------------------------------" - ) - log.info("found %d system(s):" % self.nsystems) - log.info( - ("%s " % self._format_name_length("system", sys_width)) - + ("%6s %6s %6s %5s %3s" % ("natoms", "bch_sz", "n_bch", "prob", "pbc")) - ) - for ii in range(self.nsystems): - log.info( - "%s %6d %6d %6d %5.3f %3s" - % ( - self._format_name_length(self.system_dirs[ii], sys_width), - self.natoms[ii], - # TODO batch size * nbatches = number of structures - self.batch_size[ii], - self.nbatches[ii], - self.sys_probs[ii], - "T" if self.data_systems[ii].pbc else "F", - ) - ) - log.info( - "--------------------------------------------------------------------------------------" - ) - - def _make_auto_bs(self, rule): - bs = [] - for ii in self.data_systems: - ni = ii.get_natoms() - bsi = rule // ni - if bsi * ni < rule: - bsi += 1 - bs.append(bsi) - return bs - - # ! 
added by Marián Rynik - def _make_auto_ts(self, percent): - ts = [] - for ii in range(self.nsystems): - ni = self.batch_size[ii] * self.nbatches[ii] - tsi = int(ni * percent / 100) - ts.append(tsi) - - return ts - - def _check_type_map_consistency(self, type_map_list): - ret = [] - for ii in type_map_list: - if ii is not None: - min_len = min([len(ii), len(ret)]) - for idx in range(min_len): - if ii[idx] != ret[idx]: - raise RuntimeError(f"inconsistent type map: {ret!s} {ii!s}") - if len(ii) > len(ret): - ret = ii - return ret - - -def process_sys_probs(sys_probs, nbatch): - sys_probs = np.array(sys_probs) - type_filter = sys_probs >= 0 - assigned_sum_prob = np.sum(type_filter * sys_probs) - # 1e-8 is to handle floating point error; See #1917 - assert ( - assigned_sum_prob <= 1.0 + 1e-8 - ), "the sum of assigned probability should be less than 1" - rest_sum_prob = 1.0 - assigned_sum_prob - if not np.isclose(rest_sum_prob, 0): - rest_nbatch = (1 - type_filter) * nbatch - rest_prob = rest_sum_prob * rest_nbatch / np.sum(rest_nbatch) - ret_prob = rest_prob + type_filter * sys_probs - else: - ret_prob = sys_probs - assert np.isclose(np.sum(ret_prob), 1), "sum of probs should be 1" - return ret_prob - - -def prob_sys_size_ext(keywords, nsystems, nbatch): - block_str = keywords.split(";")[1:] - block_stt = [] - block_end = [] - block_weights = [] - for ii in block_str: - stt = int(ii.split(":")[0]) - end = int(ii.split(":")[1]) - weight = float(ii.split(":")[2]) - assert weight >= 0, "the weight of a block should be no less than 0" - block_stt.append(stt) - block_end.append(end) - block_weights.append(weight) - nblocks = len(block_str) - block_probs = np.array(block_weights) / np.sum(block_weights) - sys_probs = np.zeros([nsystems]) - for ii in range(nblocks): - nbatch_block = nbatch[block_stt[ii] : block_end[ii]] - tmp_prob = [float(i) for i in nbatch_block] / np.sum(nbatch_block) - sys_probs[block_stt[ii] : block_end[ii]] = tmp_prob * block_probs[ii] - return sys_probs +"""Alias for backward compatibility.""" +from deepmd_utils.utils.data_system import ( + DeepmdDataSystem, + prob_sys_size_ext, + process_sys_probs, +) + +__all__ = [ + "DeepmdDataSystem", + "process_sys_probs", + "prob_sys_size_ext", +] diff --git a/deepmd/utils/errors.py b/deepmd/utils/errors.py index 5d96fa0e6a..683131e48a 100644 --- a/deepmd/utils/errors.py +++ b/deepmd/utils/errors.py @@ -1,4 +1,9 @@ # SPDX-License-Identifier: LGPL-3.0-or-later +from deepmd_utils.utils.errors import ( + OutOfMemoryError, +) + + class GraphTooLargeError(Exception): """The graph is too large, exceeding protobuf's hard limit of 2GB.""" @@ -7,5 +12,8 @@ class GraphWithoutTensorError(Exception): pass -class OutOfMemoryError(Exception): - """This error is caused by out-of-memory (OOM).""" +__all__ = [ + "OutOfMemoryError", + "GraphTooLargeError", + "GraphWithoutTensorError", +] diff --git a/deepmd/utils/finetune.py b/deepmd/utils/finetune.py index 4e597b1e05..cc6c0224de 100644 --- a/deepmd/utils/finetune.py +++ b/deepmd/utils/finetune.py @@ -41,12 +41,14 @@ def replace_model_params_with_pretrained_model( pretrained_jdata = json.loads(t_jdata) # Check the model type - assert pretrained_jdata["model"]["descriptor"]["type"] in [ - "se_atten", - "se_atten_v2", - ] and pretrained_jdata["model"]["fitting_net"]["type"] in [ - "ener" - ], "The finetune process only supports models pretrained with 'se_atten' or 'se_atten_v2' descriptor and 'ener' fitting_net!" 
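A toy run of the block rule implemented by `prob_sys_size_ext` above, re-implemented standalone so it can be executed without the package; the batch counts and block weights are hypothetical.

    import numpy as np

    def toy_prob_sys_size_ext(keywords, nbatch):
        """Standalone re-implementation of the block rule, for illustration only."""
        sys_probs = np.zeros(len(nbatch))
        blocks = keywords.split(";")[1:]
        weights = np.array([float(b.split(":")[2]) for b in blocks])
        block_probs = weights / weights.sum()          # normalize the block weights
        for block, bp in zip(blocks, block_probs):
            stt, end = int(block.split(":")[0]), int(block.split(":")[1])
            nb = np.asarray(nbatch[stt:end], dtype=float)
            sys_probs[stt:end] = nb / nb.sum() * bp    # within a block: proportional to nbatch
        return sys_probs

    # four hypothetical systems with 10, 30, 20 and 20 batches
    print(toy_prob_sys_size_ext("prob_sys_size;0:2:0.6;2:4:0.4", [10, 30, 20, 20]))
    # -> [0.15 0.45 0.2  0.2 ]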
+ assert ( + pretrained_jdata["model"]["descriptor"]["type"] + in [ + "se_atten", + "se_atten_v2", + ] + and pretrained_jdata["model"]["fitting_net"]["type"] in ["ener"] + ), "The finetune process only supports models pretrained with 'se_atten' or 'se_atten_v2' descriptor and 'ener' fitting_net!" # Check the type map pretrained_type_map = pretrained_jdata["model"]["type_map"] diff --git a/deepmd/utils/graph.py b/deepmd/utils/graph.py index 2a795a45a2..ad4ee0224a 100644 --- a/deepmd/utils/graph.py +++ b/deepmd/utils/graph.py @@ -237,6 +237,91 @@ def get_embedding_net_variables_from_graph_def( return embedding_net_variables +def get_extra_embedding_net_suffix(type_one_side: bool): + """Get the extra embedding net suffix according to the value of type_one_side. + + Parameters + ---------- + type_one_side + The value of type_one_side + + Returns + ------- + str + The extra embedding net suffix + """ + if type_one_side: + extra_suffix = "_one_side_ebd" + else: + extra_suffix = "_two_side_ebd" + return extra_suffix + + +def get_variables_from_graph_def_as_numpy_array(graph_def: tf.GraphDef, pattern: str): + """Get variables from the given tf.GraphDef object, with numpy array returns. + + Parameters + ---------- + graph_def + The input tf.GraphDef object + pattern : str + The name of variable + + Returns + ------- + np.ndarray + The numpy array of the variable + """ + node = get_pattern_nodes_from_graph_def(graph_def, pattern)[pattern] + dtype = tf.as_dtype(node.dtype).as_numpy_dtype + tensor_shape = tf.TensorShape(node.tensor_shape).as_list() + if (len(tensor_shape) != 1) or (tensor_shape[0] != 1): + tensor_value = np.frombuffer( + node.tensor_content, + dtype=tf.as_dtype(node.dtype).as_numpy_dtype, + ) + else: + tensor_value = get_tensor_by_type(node, dtype) + return np.reshape(tensor_value, tensor_shape) + + +def get_extra_embedding_net_variables_from_graph_def( + graph_def: tf.GraphDef, suffix: str, extra_suffix: str, layer_size: int +): + """Get extra embedding net variables from the given tf.GraphDef object. + The "extra embedding net" means the embedding net with only type embeddings input, + which occurs in "se_atten_v2" and "se_a_ebd_v2" descriptor. + + Parameters + ---------- + graph_def + The input tf.GraphDef object + suffix : str + The "common" suffix in the descriptor + extra_suffix : str + This value depends on the value of "type_one_side". + It should always be "_one_side_ebd" or "_two_side_ebd" + layer_size : int + The layer size of the embedding net + + Returns + ------- + Dict + The extra embedding net variables within the given tf.GraphDef object + """ + extra_embedding_net_variables = {} + for i in range(1, layer_size + 1): + matrix_pattern = f"filter_type_all{suffix}/matrix_{i}{extra_suffix}" + extra_embedding_net_variables[ + matrix_pattern + ] = get_variables_from_graph_def_as_numpy_array(graph_def, matrix_pattern) + bias_pattern = f"filter_type_all{suffix}/bias_{i}{extra_suffix}" + extra_embedding_net_variables[ + bias_pattern + ] = get_variables_from_graph_def_as_numpy_array(graph_def, bias_pattern) + return extra_embedding_net_variables + + def get_embedding_net_variables(model_file: str, suffix: str = "") -> Dict: """Get the embedding net variables with the given frozen model(model_file). 
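The graph helpers above locate the extra embedding net purely by tensor-name patterns; the sketch below only prints the names that would be queried, assuming an empty common `suffix`, `type_one_side=False`, and a two-layer net.

    suffix = ""                      # illustrative common descriptor suffix
    extra_suffix = "_two_side_ebd"   # get_extra_embedding_net_suffix(type_one_side=False)
    layer_size = 2
    for i in range(1, layer_size + 1):
        print(f"filter_type_all{suffix}/matrix_{i}{extra_suffix}")
        print(f"filter_type_all{suffix}/bias_{i}{extra_suffix}")
    # filter_type_all/matrix_1_two_side_ebd
    # filter_type_all/bias_1_two_side_ebd
    # filter_type_all/matrix_2_two_side_ebd
    # filter_type_all/bias_2_two_side_ebd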
diff --git a/deepmd/utils/pair_tab.py b/deepmd/utils/pair_tab.py index 4451f53379..1a526ac5fc 100644 --- a/deepmd/utils/pair_tab.py +++ b/deepmd/utils/pair_tab.py @@ -1,91 +1,9 @@ -#!/usr/bin/env python3 - # SPDX-License-Identifier: LGPL-3.0-or-later -from typing import ( - Tuple, -) - -import numpy as np -from scipy.interpolate import ( - CubicSpline, +"""Alias for backward compatibility.""" +from deepmd_utils.utils.pair_tab import ( + PairTab, ) - -class PairTab: - """Pairwise tabulated potential. - - Parameters - ---------- - filename - File name for the short-range tabulated potential. - The table is a text data file with (N_t + 1) * N_t / 2 + 1 columes. - The first colume is the distance between atoms. - The second to the last columes are energies for pairs of certain types. - For example we have two atom types, 0 and 1. - The columes from 2nd to 4th are for 0-0, 0-1 and 1-1 correspondingly. - """ - - def __init__(self, filename: str) -> None: - """Constructor.""" - self.reinit(filename) - - def reinit(self, filename: str) -> None: - """Initialize the tabulated interaction. - - Parameters - ---------- - filename - File name for the short-range tabulated potential. - The table is a text data file with (N_t + 1) * N_t / 2 + 1 columes. - The first colume is the distance between atoms. - The second to the last columes are energies for pairs of certain types. - For example we have two atom types, 0 and 1. - The columes from 2nd to 4th are for 0-0, 0-1 and 1-1 correspondingly. - """ - self.vdata = np.loadtxt(filename) - self.rmin = self.vdata[0][0] - self.hh = self.vdata[1][0] - self.vdata[0][0] - self.nspline = self.vdata.shape[0] - 1 - ncol = self.vdata.shape[1] - 1 - n0 = (-1 + np.sqrt(1 + 8 * ncol)) * 0.5 - self.ntypes = int(n0 + 0.1) - assert self.ntypes * (self.ntypes + 1) // 2 == ncol, ( - "number of volumes provided in %s does not match guessed number of types %d" - % (filename, self.ntypes) - ) - self.tab_info = np.array([self.rmin, self.hh, self.nspline, self.ntypes]) - self.tab_data = self._make_data() - - def get(self) -> Tuple[np.array, np.array]: - """Get the serialized table.""" - return self.tab_info, self.tab_data - - def _make_data(self): - data = np.zeros([self.ntypes * self.ntypes * 4 * self.nspline]) - stride = 4 * self.nspline - idx_iter = 0 - xx = self.vdata[:, 0] - for t0 in range(self.ntypes): - for t1 in range(t0, self.ntypes): - vv = self.vdata[:, 1 + idx_iter] - cs = CubicSpline(xx, vv) - dd = cs(xx, 1) - dd *= self.hh - dtmp = np.zeros(stride) - for ii in range(self.nspline): - dtmp[ii * 4 + 0] = 2 * vv[ii] - 2 * vv[ii + 1] + dd[ii] + dd[ii + 1] - dtmp[ii * 4 + 1] = ( - -3 * vv[ii] + 3 * vv[ii + 1] - 2 * dd[ii] - dd[ii + 1] - ) - dtmp[ii * 4 + 2] = dd[ii] - dtmp[ii * 4 + 3] = vv[ii] - data[ - (t0 * self.ntypes + t1) * stride : (t0 * self.ntypes + t1) * stride - + stride - ] = dtmp - data[ - (t1 * self.ntypes + t0) * stride : (t1 * self.ntypes + t0) * stride - + stride - ] = dtmp - idx_iter += 1 - return data +__all__ = [ + "PairTab", +] diff --git a/deepmd/utils/path.py b/deepmd/utils/path.py index a8e4bc329f..780bc8cabf 100644 --- a/deepmd/utils/path.py +++ b/deepmd/utils/path.py @@ -1,358 +1,13 @@ # SPDX-License-Identifier: LGPL-3.0-or-later -import os -from abc import ( - ABC, - abstractmethod, -) -from functools import ( - lru_cache, -) -from pathlib import ( - Path, -) -from typing import ( - List, - Optional, -) - -import h5py -import numpy as np -from wcmatch.glob import ( - globfilter, -) - - -class DPPath(ABC): - """The path class to data system 
(DeepmdData). - - Parameters - ---------- - path : str - path - """ - - def __new__(cls, path: str): - if cls is DPPath: - if os.path.isdir(path): - return super().__new__(DPOSPath) - elif os.path.isfile(path.split("#")[0]): - # assume h5 if it is not dir - # TODO: check if it is a real h5? or just check suffix? - return super().__new__(DPH5Path) - raise FileNotFoundError("%s not found" % path) - return super().__new__(cls) - - @abstractmethod - def load_numpy(self) -> np.ndarray: - """Load NumPy array. - - Returns - ------- - np.ndarray - loaded NumPy array - """ - - @abstractmethod - def load_txt(self, **kwargs) -> np.ndarray: - """Load NumPy array from text. - - Returns - ------- - np.ndarray - loaded NumPy array - """ - - @abstractmethod - def glob(self, pattern: str) -> List["DPPath"]: - """Search path using the glob pattern. - - Parameters - ---------- - pattern : str - glob pattern - - Returns - ------- - List[DPPath] - list of paths - """ - - @abstractmethod - def rglob(self, pattern: str) -> List["DPPath"]: - """This is like calling :meth:`DPPath.glob()` with `**/` added in front - of the given relative pattern. - - Parameters - ---------- - pattern : str - glob pattern - - Returns - ------- - List[DPPath] - list of paths - """ - - @abstractmethod - def is_file(self) -> bool: - """Check if self is file.""" - - @abstractmethod - def is_dir(self) -> bool: - """Check if self is directory.""" - - @abstractmethod - def __truediv__(self, key: str) -> "DPPath": - """Used for / operator.""" - - @abstractmethod - def __lt__(self, other: "DPPath") -> bool: - """Whether this DPPath is less than other for sorting.""" - - @abstractmethod - def __str__(self) -> str: - """Represent string.""" - - def __repr__(self) -> str: - return f"{type(self)} ({self!s})" - - def __eq__(self, other) -> bool: - return str(self) == str(other) - - def __hash__(self): - return hash(str(self)) - - -class DPOSPath(DPPath): - """The OS path class to data system (DeepmdData) for real directories. - - Parameters - ---------- - path : str - path - """ - - def __init__(self, path: str) -> None: - super().__init__() - if isinstance(path, Path): - self.path = path - else: - self.path = Path(path) - - def load_numpy(self) -> np.ndarray: - """Load NumPy array. - - Returns - ------- - np.ndarray - loaded NumPy array - """ - return np.load(str(self.path)) - - def load_txt(self, **kwargs) -> np.ndarray: - """Load NumPy array from text. - - Returns - ------- - np.ndarray - loaded NumPy array - """ - return np.loadtxt(str(self.path), **kwargs) - - def glob(self, pattern: str) -> List["DPPath"]: - """Search path using the glob pattern. - - Parameters - ---------- - pattern : str - glob pattern - - Returns - ------- - List[DPPath] - list of paths - """ - # currently DPOSPath will only derivative DPOSPath - # TODO: discuss if we want to mix DPOSPath and DPH5Path? - return [type(self)(p) for p in self.path.glob(pattern)] - - def rglob(self, pattern: str) -> List["DPPath"]: - """This is like calling :meth:`DPPath.glob()` with `**/` added in front - of the given relative pattern. 
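A condensed sketch of the `DPPath.__new__` dispatch above: directories become `DPOSPath`, existing files (with an optional `#`-separated internal path) become `DPH5Path`. The helper is illustrative only and, like the original, merely checks existence rather than validating the HDF5 format.

    import os
    import tempfile

    def classify_dp_path(path: str) -> str:
        """Sketch of the dispatch rule used by DPPath.__new__."""
        if os.path.isdir(path):
            return "DPOSPath"
        if os.path.isfile(path.split("#")[0]):  # "#" separates the file from the in-file path
            return "DPH5Path"
        raise FileNotFoundError(f"{path} not found")

    with tempfile.TemporaryDirectory() as d:
        h5file = os.path.join(d, "data.hdf5")
        open(h5file, "w").close()                      # empty stand-in for an HDF5 file
        print(classify_dp_path(d))                     # -> DPOSPath
        print(classify_dp_path(h5file + "#/set.000"))  # -> DPH5Path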
- - Parameters - ---------- - pattern : str - glob pattern - - Returns - ------- - List[DPPath] - list of paths - """ - return [type(self)(p) for p in self.path.rglob(pattern)] - - def is_file(self) -> bool: - """Check if self is file.""" - return self.path.is_file() - - def is_dir(self) -> bool: - """Check if self is directory.""" - return self.path.is_dir() - - def __truediv__(self, key: str) -> "DPPath": - """Used for / operator.""" - return type(self)(self.path / key) - - def __lt__(self, other: "DPOSPath") -> bool: - """Whether this DPPath is less than other for sorting.""" - return self.path < other.path - - def __str__(self) -> str: - """Represent string.""" - return str(self.path) - - -class DPH5Path(DPPath): - """The path class to data system (DeepmdData) for HDF5 files. - - Notes - ----- - OS - HDF5 relationship: - directory - Group - file - Dataset - - Parameters - ---------- - path : str - path - """ - - def __init__(self, path: str) -> None: - super().__init__() - # we use "#" to split path - # so we do not support file names containing #... - s = path.split("#") - self.root_path = s[0] - self.root = self._load_h5py(s[0]) - # h5 path: default is the root path - self.name = s[1] if len(s) > 1 else "/" - - @classmethod - @lru_cache(None) - def _load_h5py(cls, path: str) -> h5py.File: - """Load hdf5 file. - - Parameters - ---------- - path : str - path to hdf5 file - """ - # this method has cache to avoid duplicated - # loading from different DPH5Path - # However the file will be never closed? - return h5py.File(path, "r") - - def load_numpy(self) -> np.ndarray: - """Load NumPy array. - - Returns - ------- - np.ndarray - loaded NumPy array - """ - return self.root[self.name][:] - - def load_txt(self, dtype: Optional[np.dtype] = None, **kwargs) -> np.ndarray: - """Load NumPy array from text. - - Returns - ------- - np.ndarray - loaded NumPy array - """ - arr = self.load_numpy() - if dtype: - arr = arr.astype(dtype) - return arr - - def glob(self, pattern: str) -> List["DPPath"]: - """Search path using the glob pattern. - - Parameters - ---------- - pattern : str - glob pattern - - Returns - ------- - List[DPPath] - list of paths - """ - # got paths starts with current path first, which is faster - subpaths = [ii for ii in self._keys if ii.startswith(self.name)] - return [ - type(self)(f"{self.root_path}#{pp}") - for pp in globfilter(subpaths, self._connect_path(pattern)) - ] - - def rglob(self, pattern: str) -> List["DPPath"]: - """This is like calling :meth:`DPPath.glob()` with `**/` added in front - of the given relative pattern. 
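The `file#internal/path` addressing handled by `DPH5Path` above can be reproduced with `h5py` directly; the file and dataset names below are made up.

    import h5py
    import numpy as np

    with h5py.File("example.hdf5", "w") as f:  # hypothetical file name
        f.create_dataset("system1/set.000/coord", data=np.arange(6.0))

    address = "example.hdf5#/system1/set.000/coord"
    fname, inner = address.split("#")
    with h5py.File(fname, "r") as f:
        print(f[inner][:])  # -> [0. 1. 2. 3. 4. 5.]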
- - Parameters - ---------- - pattern : str - glob pattern - - Returns - ------- - List[DPPath] - list of paths - """ - return self.glob("**" + pattern) - - @property - def _keys(self) -> List[str]: - """Walk all groups and dataset.""" - return self._file_keys(self.root) - - @classmethod - @lru_cache(None) - def _file_keys(cls, file: h5py.File) -> List[str]: - """Walk all groups and dataset.""" - l = [] - file.visit(lambda x: l.append("/" + x)) - return l - - def is_file(self) -> bool: - """Check if self is file.""" - if self.name not in self._keys: - return False - return isinstance(self.root[self.name], h5py.Dataset) - - def is_dir(self) -> bool: - """Check if self is directory.""" - if self.name not in self._keys: - return False - return isinstance(self.root[self.name], h5py.Group) - - def __truediv__(self, key: str) -> "DPPath": - """Used for / operator.""" - return type(self)(f"{self.root_path}#{self._connect_path(key)}") - - def _connect_path(self, path: str) -> str: - """Connect self with path.""" - if self.name.endswith("/"): - return f"{self.name}{path}" - return f"{self.name}/{path}" - - def __lt__(self, other: "DPH5Path") -> bool: - """Whether this DPPath is less than other for sorting.""" - if self.root_path == other.root_path: - return self.name < other.name - return self.root_path < other.root_path - - def __str__(self) -> str: - """Returns path of self.""" - return f"{self.root_path}#{self.name}" +"""Alias for backward compatibility.""" +from deepmd_utils.utils.path import ( + DPH5Path, + DPOSPath, + DPPath, +) + +__all__ = [ + "DPPath", + "DPOSPath", + "DPH5Path", +] diff --git a/deepmd/utils/plugin.py b/deepmd/utils/plugin.py index 2a77b744c5..3b5b297304 100644 --- a/deepmd/utils/plugin.py +++ b/deepmd/utils/plugin.py @@ -1,95 +1,15 @@ # SPDX-License-Identifier: LGPL-3.0-or-later -"""Base of plugin systems.""" -# copied from https://github.com/deepmodeling/dpdata/blob/a3e76d75de53f6076254de82d18605a010dc3b00/dpdata/plugin.py - -from abc import ( - ABCMeta, -) -from typing import ( - Callable, +"""Alias for backward compatibility.""" +from deepmd_utils.utils.plugin import ( + Plugin, + PluginVariant, + VariantABCMeta, + VariantMeta, ) - -class Plugin: - """A class to register and restore plugins. - - Attributes - ---------- - plugins : Dict[str, object] - plugins - - Examples - -------- - >>> plugin = Plugin() - >>> @plugin.register("xx") - def xxx(): - pass - >>> print(plugin.plugins['xx']) - """ - - def __init__(self): - self.plugins = {} - - def __add__(self, other) -> "Plugin": - self.plugins.update(other.plugins) - return self - - def register(self, key: str) -> Callable[[object], object]: - """Register a plugin. - - Parameters - ---------- - key : str - key of the plugin - - Returns - ------- - Callable[[object], object] - decorator - """ - - def decorator(object: object) -> object: - self.plugins[key] = object - return object - - return decorator - - def get_plugin(self, key) -> object: - """Visit a plugin by key. 
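A standalone miniature of the `Plugin` registry above, mirroring its docstring example; the "se_e2_a" key is purely illustrative.

    class MiniPlugin:
        """Toy registry with the same register/get_plugin pattern."""

        def __init__(self):
            self.plugins = {}

        def register(self, key):
            def decorator(obj):
                self.plugins[key] = obj
                return obj
            return decorator

        def get_plugin(self, key):
            return self.plugins[key]

    descriptor_registry = MiniPlugin()

    @descriptor_registry.register("se_e2_a")
    class SeE2A:
        pass

    print(descriptor_registry.get_plugin("se_e2_a"))  # -> <class '__main__.SeE2A'>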
- - Parameters - ---------- - key : str - key of the plugin - - Returns - ------- - object - the plugin - """ - return self.plugins[key] - - -class VariantMeta: - def __call__(cls, *args, **kwargs): - """Remove `type` and keys that starts with underline.""" - obj = cls.__new__(cls, *args, **kwargs) - kwargs.pop("type", None) - to_pop = [] - for kk in kwargs: - if kk[0] == "_": - to_pop.append(kk) - for kk in to_pop: - kwargs.pop(kk, None) - obj.__init__(*args, **kwargs) - return obj - - -class VariantABCMeta(VariantMeta, ABCMeta): - pass - - -class PluginVariant(metaclass=VariantABCMeta): - """A class to remove `type` from input arguments.""" - - pass +__all__ = [ + "Plugin", + "VariantMeta", + "VariantABCMeta", + "PluginVariant", +] diff --git a/deepmd/utils/random.py b/deepmd/utils/random.py index 8944419412..09547eeac9 100644 --- a/deepmd/utils/random.py +++ b/deepmd/utils/random.py @@ -1,67 +1,15 @@ # SPDX-License-Identifier: LGPL-3.0-or-later -from typing import ( - Optional, +"""Alias for backward compatibility.""" +from deepmd_utils.utils.random import ( + choice, + random, + seed, + shuffle, ) -import numpy as np - -_RANDOM_GENERATOR = np.random.RandomState() - - -def choice(a: np.ndarray, p: Optional[np.ndarray] = None): - """Generates a random sample from a given 1-D array. - - Parameters - ---------- - a : np.ndarray - A random sample is generated from its elements. - p : np.ndarray - The probabilities associated with each entry in a. - - Returns - ------- - np.ndarray - arrays with results and their shapes - """ - return _RANDOM_GENERATOR.choice(a, p=p) - - -def random(size=None): - """Return random floats in the half-open interval [0.0, 1.0). - - Parameters - ---------- - size - Output shape. - - Returns - ------- - np.ndarray - Arrays with results and their shapes. - """ - return _RANDOM_GENERATOR.random_sample(size) - - -def seed(val: Optional[int] = None): - """Seed the generator. - - Parameters - ---------- - val : int - Seed. - """ - _RANDOM_GENERATOR.seed(val) - - -def shuffle(x: np.ndarray): - """Modify a sequence in-place by shuffling its contents. - - Parameters - ---------- - x : np.ndarray - The array or list to be shuffled. 
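A standalone sketch of the `VariantMeta` behaviour above: `type` and any underscore-prefixed keyword arguments are dropped before `__init__` runs, so variant selectors and comment fields in an input dict need not be declared as constructor parameters.

    class SketchVariantMeta(type):
        """Illustrative metaclass mirroring VariantMeta.__call__."""

        def __call__(cls, *args, **kwargs):
            obj = cls.__new__(cls, *args, **kwargs)
            kwargs.pop("type", None)
            for kk in [k for k in kwargs if k.startswith("_")]:
                kwargs.pop(kk, None)
            obj.__init__(*args, **kwargs)
            return obj

    class Variant(metaclass=SketchVariantMeta):
        def __init__(self, rcut):
            self.rcut = rcut

    v = Variant(rcut=6.0, type="se_e2_a", _comment="silently dropped")
    print(v.rcut)  # -> 6.0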
- """ - _RANDOM_GENERATOR.shuffle(x) - - -__all__ = ["choice", "random", "seed", "shuffle"] +__all__ = [ + "choice", + "random", + "seed", + "shuffle", +] diff --git a/deepmd/utils/tabulate.py b/deepmd/utils/tabulate.py index d0a167f1dc..2b270b1dbc 100644 --- a/deepmd/utils/tabulate.py +++ b/deepmd/utils/tabulate.py @@ -85,7 +85,10 @@ def __init__( # functype if activation_fn == ACTIVATION_FN_DICT["tanh"]: self.functype = 1 - elif activation_fn == ACTIVATION_FN_DICT["gelu"]: + elif activation_fn in ( + ACTIVATION_FN_DICT["gelu"], + ACTIVATION_FN_DICT["gelu_tf"], + ): self.functype = 2 elif activation_fn == ACTIVATION_FN_DICT["relu"]: self.functype = 3 @@ -330,8 +333,7 @@ def _build_lower( elif isinstance(self.descrpt, deepmd.descriptor.DescrptSeT): tt = np.full((nspline, self.last_layer_size), stride1) tt[ - int((lower - extrapolate * lower) / stride1) - + 1 : ( + int((lower - extrapolate * lower) / stride1) + 1 : ( int((lower - extrapolate * lower) / stride1) + int((upper - lower) / stride0) ), diff --git a/deepmd/utils/type_embed.py b/deepmd/utils/type_embed.py index aadbb3c6e0..c8ab01f7f5 100644 --- a/deepmd/utils/type_embed.py +++ b/deepmd/utils/type_embed.py @@ -16,7 +16,6 @@ nvnmd_cfg, ) from deepmd.utils.graph import ( - get_tensor_by_name_from_graph, get_type_embedding_net_variables_from_graph_def, ) from deepmd.utils.network import ( @@ -109,7 +108,6 @@ def __init__( self.trainable = trainable self.uniform_seed = uniform_seed self.type_embedding_net_variables = None - self.type_embedding_from_graph = None self.padding = padding self.model_type = None @@ -135,8 +133,6 @@ def build( embedded_types The computational graph for embedded types """ - if self.model_type is not None and self.model_type == "compressed_model": - return self.type_embedding_from_graph types = tf.convert_to_tensor(list(range(ntypes)), dtype=tf.int32) ebd_type = tf.cast( tf.one_hot(tf.cast(types, dtype=tf.int32), int(ntypes)), @@ -166,7 +162,7 @@ def build( if self.padding: last_type = tf.cast(tf.zeros([1, self.neuron[-1]]), self.filter_precision) ebd_type = tf.concat([ebd_type, last_type], 0) # (ntypes + 1) * neuron[-1] - self.ebd_type = tf.identity(ebd_type, name="t_typeebd") + self.ebd_type = tf.identity(ebd_type, name="t_typeebd" + suffix) return self.ebd_type def init_variables( @@ -193,5 +189,3 @@ def init_variables( self.type_embedding_net_variables = ( get_type_embedding_net_variables_from_graph_def(graph_def, suffix=suffix) ) - type_embedding = get_tensor_by_name_from_graph(graph, "t_typeebd") - self.type_embedding_from_graph = tf.convert_to_tensor(type_embedding) diff --git a/deepmd/utils/weight_avg.py b/deepmd/utils/weight_avg.py index b344d3bb75..267f89ed28 100644 --- a/deepmd/utils/weight_avg.py +++ b/deepmd/utils/weight_avg.py @@ -1,48 +1,9 @@ # SPDX-License-Identifier: LGPL-3.0-or-later -from collections import ( - defaultdict, +"""Alias for backward compatibility.""" +from deepmd_utils.utils.weight_avg import ( + weighted_average, ) -from typing import ( - Dict, - List, - Tuple, -) - -import numpy as np - - -def weighted_average(errors: List[Dict[str, Tuple[float, float]]]) -> Dict: - """Compute wighted average of prediction errors (MAE or RMSE) for model. 
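The rule described here (implementation follows below) averages `mae*` entries linearly and `rmse*` entries in quadrature, weighting both by the supplied sizes; a quick check with invented numbers:

    import numpy as np

    errors = [
        {"mae_e": (0.10, 100), "rmse_f": (0.20, 100)},  # (error, weight) for system 1
        {"mae_e": (0.30, 300), "rmse_f": (0.40, 300)},  # (error, weight) for system 2
    ]

    mae_num = sum(ee * ss for ee, ss in (err["mae_e"] for err in errors))
    mae_den = sum(ss for _, ss in (err["mae_e"] for err in errors))
    rmse_num = sum(ee * ee * ss for ee, ss in (err["rmse_f"] for err in errors))
    rmse_den = sum(ss for _, ss in (err["rmse_f"] for err in errors))

    print(mae_num / mae_den)             # -> 0.25
    print(np.sqrt(rmse_num / rmse_den))  # -> ~0.361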
- - Parameters - ---------- - errors : List[Dict[str, Tuple[float, float]]] - List: the error of systems - Dict: the error of quantities, name given by the key - str: the name of the quantity, must starts with 'mae' or 'rmse' - Tuple: (error, weight) - Returns - ------- - Dict - weighted averages - """ - sum_err = defaultdict(float) - sum_siz = defaultdict(int) - for err in errors: - for kk, (ee, ss) in err.items(): - if kk.startswith("mae"): - sum_err[kk] += ee * ss - elif kk.startswith("rmse"): - sum_err[kk] += ee * ee * ss - else: - raise RuntimeError("unknown error type") - sum_siz[kk] += ss - for kk in sum_err.keys(): - if kk.startswith("mae"): - sum_err[kk] = sum_err[kk] / sum_siz[kk] - elif kk.startswith("rmse"): - sum_err[kk] = np.sqrt(sum_err[kk] / sum_siz[kk]) - else: - raise RuntimeError("unknown error type") - return sum_err +__all__ = [ + "weighted_average", +] diff --git a/deepmd_cli/__init__.py b/deepmd_cli/__init__.py deleted file mode 100644 index d295053965..0000000000 --- a/deepmd_cli/__init__.py +++ /dev/null @@ -1,6 +0,0 @@ -# SPDX-License-Identifier: LGPL-3.0-or-later -"""This module contains the entry points for DeePMD-kit. - -If only printing the help message, this module does not call -the main DeePMD-kit module to avoid the slow import of TensorFlow. -""" diff --git a/deepmd_utils/__init__.py b/deepmd_utils/__init__.py new file mode 100644 index 0000000000..1c5314bb7e --- /dev/null +++ b/deepmd_utils/__init__.py @@ -0,0 +1,6 @@ +# SPDX-License-Identifier: LGPL-3.0-or-later +"""Untilization methods for DeePMD-kit. + +The __init__ module should not import any modules +for performance. +""" diff --git a/deepmd_utils/common.py b/deepmd_utils/common.py new file mode 100644 index 0000000000..b594c54030 --- /dev/null +++ b/deepmd_utils/common.py @@ -0,0 +1,270 @@ +# SPDX-License-Identifier: LGPL-3.0-or-later +import json +import warnings +from pathlib import ( + Path, +) +from typing import ( + TYPE_CHECKING, + Any, + Dict, + List, + Optional, + TypeVar, + Union, +) + +try: + from typing import Literal # python >=3.8 +except ImportError: + from typing_extensions import Literal # type: ignore + +import numpy as np +import yaml + +from deepmd_utils.env import ( + GLOBAL_NP_FLOAT_PRECISION, +) +from deepmd_utils.utils.path import ( + DPPath, +) + +__all__ = [ + "data_requirement", + "add_data_requirement", + "select_idx_map", + "make_default_mesh", + "j_must_have", + "j_loader", + "expand_sys_str", + "get_np_precision", +] + + +if TYPE_CHECKING: + _DICT_VAL = TypeVar("_DICT_VAL") + _PRECISION = Literal["default", "float16", "float32", "float64"] + _ACTIVATION = Literal[ + "relu", "relu6", "softplus", "sigmoid", "tanh", "gelu", "gelu_tf" + ] + __all__.extend( + [ + "_DICT_VAL", + "_PRECISION", + "_ACTIVATION", + ] + ) + + +# TODO this is not a good way to do things. This is some global variable to which +# TODO anyone can write and there is no good way to keep track of the changes +data_requirement = {} + + +def add_data_requirement( + key: str, + ndof: int, + atomic: bool = False, + must: bool = False, + high_prec: bool = False, + type_sel: Optional[bool] = None, + repeat: int = 1, + default: float = 0.0, + dtype: Optional[np.dtype] = None, +): + """Specify data requirements for training. + + Parameters + ---------- + key : str + type of data stored in corresponding `*.npy` file e.g. `forces` or `energy` + ndof : int + number of the degrees of freedom, this is tied to `atomic` parameter e.g. 
forces + have `atomic=True` and `ndof=3` + atomic : bool, optional + specifies whwther the `ndof` keyworrd applies to per atom quantity or not, + by default False + must : bool, optional + specifi if the `*.npy` data file must exist, by default False + high_prec : bool, optional + if true load data to `np.float64` else `np.float32`, by default False + type_sel : bool, optional + select only certain type of atoms, by default None + repeat : int, optional + if specify repaeat data `repeat` times, by default 1 + default : float, optional, default=0. + default value of data + dtype : np.dtype, optional + the dtype of data, overwrites `high_prec` if provided + """ + data_requirement[key] = { + "ndof": ndof, + "atomic": atomic, + "must": must, + "high_prec": high_prec, + "type_sel": type_sel, + "repeat": repeat, + "default": default, + "dtype": dtype, + } + + +def select_idx_map(atom_types: np.ndarray, select_types: np.ndarray) -> np.ndarray: + """Build map of indices for element supplied element types from all atoms list. + + Parameters + ---------- + atom_types : np.ndarray + array specifing type for each atoms as integer + select_types : np.ndarray + types of atoms you want to find indices for + + Returns + ------- + np.ndarray + indices of types of atoms defined by `select_types` in `atom_types` array + + Warnings + -------- + `select_types` array will be sorted before finding indices in `atom_types` + """ + sort_select_types = np.sort(select_types) + idx_map = [] + for ii in sort_select_types: + idx_map.append(np.where(atom_types == ii)[0]) + return np.concatenate(idx_map) + + +def make_default_mesh(pbc: bool, mixed_type: bool) -> np.ndarray: + """Make mesh. + + Only the size of mesh matters, not the values: + * 6 for PBC, no mixed types + * 0 for no PBC, no mixed types + * 7 for PBC, mixed types + * 1 for no PBC, mixed types + + Parameters + ---------- + pbc : bool + if True, the mesh will be made for periodic boundary conditions + mixed_type : bool + if True, the mesh will be made for mixed types + + Returns + ------- + np.ndarray + mesh + """ + mesh_size = int(pbc) * 6 + int(mixed_type) + default_mesh = np.zeros(mesh_size, dtype=np.int32) + return default_mesh + + +# TODO maybe rename this to j_deprecated and only warn about deprecated keys, +# TODO if the deprecated_key argument is left empty function puppose is only custom +# TODO error since dict[key] already raises KeyError when the key is missing +def j_must_have( + jdata: Dict[str, "_DICT_VAL"], key: str, deprecated_key: List[str] = [] +) -> "_DICT_VAL": + """Assert that supplied dictionary conaines specified key. + + Returns + ------- + _DICT_VAL + value that was store unde supplied key + + Raises + ------ + RuntimeError + if the key is not present + """ + if key not in jdata.keys(): + for ii in deprecated_key: + if ii in jdata.keys(): + warnings.warn(f"the key {ii} is deprecated, please use {key} instead") + return jdata[ii] + else: + raise RuntimeError(f"json database must provide key {key}") + else: + return jdata[key] + + +def j_loader(filename: Union[str, Path]) -> Dict[str, Any]: + """Load yaml or json settings file. 
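The `make_default_mesh` sizes listed above can be verified with a two-line stand-in; only the array length carries information.

    import numpy as np

    def sketch_default_mesh(pbc: bool, mixed_type: bool) -> np.ndarray:
        # 6 entries for PBC plus 1 entry for mixed types, all zeros
        return np.zeros(int(pbc) * 6 + int(mixed_type), dtype=np.int32)

    for pbc in (True, False):
        for mixed in (True, False):
            print(pbc, mixed, sketch_default_mesh(pbc, mixed).size)
    # True True 7 / True False 6 / False True 1 / False False 0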
+ + Parameters + ---------- + filename : Union[str, Path] + path to file + + Returns + ------- + Dict[str, Any] + loaded dictionary + + Raises + ------ + TypeError + if the supplied file is of unsupported type + """ + filepath = Path(filename) + if filepath.suffix.endswith("json"): + with filepath.open() as fp: + return json.load(fp) + elif filepath.suffix.endswith(("yml", "yaml")): + with filepath.open() as fp: + return yaml.safe_load(fp) + else: + raise TypeError("config file must be json, or yaml/yml") + + +# TODO port completely to pathlib when all callers are ported +def expand_sys_str(root_dir: Union[str, Path]) -> List[str]: + """Recursively iterate over directories taking those that contain `type.raw` file. + + Parameters + ---------- + root_dir : Union[str, Path] + starting directory + + Returns + ------- + List[str] + list of string pointing to system directories + """ + root_dir = DPPath(root_dir) + matches = [str(d) for d in root_dir.rglob("*") if (d / "type.raw").is_file()] + if (root_dir / "type.raw").is_file(): + matches.append(str(root_dir)) + return matches + + +def get_np_precision(precision: "_PRECISION") -> np.dtype: + """Get numpy precision constant from string. + + Parameters + ---------- + precision : _PRECISION + string name of numpy constant or default + + Returns + ------- + np.dtype + numpy presicion constant + + Raises + ------ + RuntimeError + if string is invalid + """ + if precision == "default": + return GLOBAL_NP_FLOAT_PRECISION + elif precision == "float16": + return np.float16 + elif precision == "float32": + return np.float32 + elif precision == "float64": + return np.float64 + else: + raise RuntimeError(f"{precision} is not a valid precision") diff --git a/deepmd_utils/entrypoints/__init__.py b/deepmd_utils/entrypoints/__init__.py new file mode 100644 index 0000000000..6ceb116d85 --- /dev/null +++ b/deepmd_utils/entrypoints/__init__.py @@ -0,0 +1 @@ +# SPDX-License-Identifier: LGPL-3.0-or-later diff --git a/deepmd_utils/entrypoints/doc.py b/deepmd_utils/entrypoints/doc.py new file mode 100644 index 0000000000..9f1fd39095 --- /dev/null +++ b/deepmd_utils/entrypoints/doc.py @@ -0,0 +1,20 @@ +# SPDX-License-Identifier: LGPL-3.0-or-later +"""Module that prints train input arguments docstrings.""" + +from deepmd_utils.utils.argcheck import ( + gen_doc, + gen_json, +) + +__all__ = ["doc_train_input"] + + +def doc_train_input(*, out_type: str = "rst", **kwargs): + """Print out trining input arguments to console.""" + if out_type == "rst": + doc_str = gen_doc(make_anchor=True) + elif out_type == "json": + doc_str = gen_json() + else: + raise RuntimeError("Unsupported out type %s" % out_type) + print(doc_str) diff --git a/deepmd_utils/entrypoints/gui.py b/deepmd_utils/entrypoints/gui.py new file mode 100644 index 0000000000..8b6b9e0a09 --- /dev/null +++ b/deepmd_utils/entrypoints/gui.py @@ -0,0 +1,31 @@ +# SPDX-License-Identifier: LGPL-3.0-or-later +"""DP-GUI entrypoint.""" + + +def start_dpgui(*, port: int, bind_all: bool, **kwargs): + """Host DP-GUI server. + + Parameters + ---------- + port : int + The port to serve DP-GUI on. + bind_all : bool + Serve on all public interfaces. This will expose your DP-GUI instance + to the network on both IPv4 and IPv6 (where available). 
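A plain-`pathlib` sketch of what `expand_sys_str` above does: collect every directory (including the root itself) that contains a `type.raw` file. The directory layout is fabricated inside a temporary folder so the snippet is self-contained.

    import tempfile
    from pathlib import Path

    def sketch_expand_sys_str(root: Path):
        matches = [str(d) for d in root.rglob("*") if (d / "type.raw").is_file()]
        if (root / "type.raw").is_file():
            matches.append(str(root))
        return matches

    with tempfile.TemporaryDirectory() as tmp:
        root = Path(tmp)
        for sysdir in ("water/sys.000", "water/sys.001"):
            (root / sysdir).mkdir(parents=True)
            (root / sysdir / "type.raw").write_text("0 0 1\n")
        print(sketch_expand_sys_str(root))  # the two sys.* directories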
+ **kwargs + additional arguments + + Raises + ------ + ModuleNotFoundError + The dpgui package is not installed + """ + try: + from dpgui import ( + start_dpgui, + ) + except ModuleNotFoundError as e: + raise ModuleNotFoundError( + "To use DP-GUI, please install the dpgui package:\npip install dpgui" + ) from e + start_dpgui(port=port, bind_all=bind_all) diff --git a/deepmd_utils/env.py b/deepmd_utils/env.py new file mode 100644 index 0000000000..b1d4958ed8 --- /dev/null +++ b/deepmd_utils/env.py @@ -0,0 +1,28 @@ +# SPDX-License-Identifier: LGPL-3.0-or-later +import os + +import numpy as np + +__all__ = [ + "GLOBAL_NP_FLOAT_PRECISION", + "GLOBAL_ENER_FLOAT_PRECISION", + "global_float_prec", +] + +# FLOAT_PREC +dp_float_prec = os.environ.get("DP_INTERFACE_PREC", "high").lower() +if dp_float_prec in ("high", ""): + # default is high + GLOBAL_NP_FLOAT_PRECISION = np.float64 + GLOBAL_ENER_FLOAT_PRECISION = np.float64 + global_float_prec = "double" +elif dp_float_prec == "low": + GLOBAL_NP_FLOAT_PRECISION = np.float32 + GLOBAL_ENER_FLOAT_PRECISION = np.float64 + global_float_prec = "float" +else: + raise RuntimeError( + "Unsupported float precision option: %s. Supported: high," + "low. Please set precision with environmental variable " + "DP_INTERFACE_PREC." % dp_float_prec + ) diff --git a/deepmd_utils/loggers/__init__.py b/deepmd_utils/loggers/__init__.py new file mode 100644 index 0000000000..39aa76139d --- /dev/null +++ b/deepmd_utils/loggers/__init__.py @@ -0,0 +1,8 @@ +# SPDX-License-Identifier: LGPL-3.0-or-later +"""Module taking care of logging duties.""" + +from .loggers import ( + set_log_handles, +) + +__all__ = ["set_log_handles"] diff --git a/deepmd_utils/loggers/loggers.py b/deepmd_utils/loggers/loggers.py new file mode 100644 index 0000000000..015581f6bd --- /dev/null +++ b/deepmd_utils/loggers/loggers.py @@ -0,0 +1,277 @@ +# SPDX-License-Identifier: LGPL-3.0-or-later +"""Logger initialization for package.""" + +import logging +import os +from typing import ( + TYPE_CHECKING, + Optional, +) + +if TYPE_CHECKING: + from pathlib import ( + Path, + ) + + from mpi4py import ( + MPI, + ) + + _MPI_APPEND_MODE = MPI.MODE_CREATE | MPI.MODE_APPEND + +logging.getLogger(__name__) + +__all__ = ["set_log_handles"] + +# logger formater +FFORMATTER = logging.Formatter( + "[%(asctime)s] %(app_name)s %(levelname)-7s %(name)-45s %(message)s" +) +CFORMATTER = logging.Formatter( + # "%(app_name)s %(levelname)-7s |-> %(name)-45s %(message)s" + "%(app_name)s %(levelname)-7s %(message)s" +) +FFORMATTER_MPI = logging.Formatter( + "[%(asctime)s] %(app_name)s rank:%(rank)-2s %(levelname)-7s %(name)-45s %(message)s" +) +CFORMATTER_MPI = logging.Formatter( + # "%(app_name)s rank:%(rank)-2s %(levelname)-7s |-> %(name)-45s %(message)s" + "%(app_name)s rank:%(rank)-2s %(levelname)-7s %(message)s" +) + + +class _AppFilter(logging.Filter): + """Add field `app_name` to log messages.""" + + def filter(self, record): + record.app_name = "DEEPMD" + return True + + +class _MPIRankFilter(logging.Filter): + """Add MPI rank number to log messages, adds field `rank`.""" + + def __init__(self, rank: int) -> None: + super().__init__(name="MPI_rank_id") + self.mpi_rank = str(rank) + + def filter(self, record): + record.rank = self.mpi_rank + return True + + +class _MPIMasterFilter(logging.Filter): + """Filter that lets through only messages emited from rank==0.""" + + def __init__(self, rank: int) -> None: + super().__init__(name="MPI_master_log") + self.mpi_rank = rank + + def filter(self, record): + if self.mpi_rank == 0: 
+ return True + else: + return False + + +class _MPIFileStream: + """Wrap MPI.File` so it has the same API as python file streams. + + Parameters + ---------- + filename : Path + disk location of the file stream + MPI : MPI + MPI communicator object + mode : str, optional + file write mode, by default _MPI_APPEND_MODE + """ + + def __init__( + self, filename: "Path", MPI: "MPI", mode: str = "_MPI_APPEND_MODE" + ) -> None: + self.stream = MPI.File.Open(MPI.COMM_WORLD, filename, mode) + self.stream.Set_atomicity(True) + self.name = "MPIfilestream" + + def write(self, msg: str): + """Write to MPI shared file stream. + + Parameters + ---------- + msg : str + message to write + """ + b = bytearray() + b.extend(map(ord, msg)) + self.stream.Write_shared(b) + + def close(self): + """Synchronize and close MPI file stream.""" + self.stream.Sync() + self.stream.Close() + + +class _MPIHandler(logging.FileHandler): + """Emulate `logging.FileHandler` with MPI shared File that all ranks can write to. + + Parameters + ---------- + filename : Path + file path + MPI : MPI + MPI communicator object + mode : str, optional + file access mode, by default "_MPI_APPEND_MODE" + """ + + def __init__( + self, + filename: "Path", + MPI: "MPI", + mode: str = "_MPI_APPEND_MODE", + ) -> None: + self.MPI = MPI + super().__init__(filename, mode=mode, encoding=None, delay=False) + + def _open(self): + return _MPIFileStream(self.baseFilename, self.MPI, self.mode) + + def setStream(self, stream): + """Stream canot be reasigned in MPI mode.""" + raise NotImplementedError("Unable to do for MPI file handler!") + + +def set_log_handles( + level: int, log_path: Optional["Path"] = None, mpi_log: Optional[str] = None +): + """Set desired level for package loggers and add file handlers. + + Parameters + ---------- + level : int + logging level + log_path : Optional[str] + path to log file, if None logs will be send only to console. If the parent + directory does not exist it will be automatically created, by default None + mpi_log : Optional[str], optional + mpi log type. Has three options. `master` will output logs to file and console + only from rank==0. `collect` will write messages from all ranks to one file + opened under rank==0 and to console. `workers` will open one log file for each + worker designated by its rank, console behaviour is the same as for `collect`. + If this argument is specified, package 'mpi4py' must be already installed. + by default None + + Raises + ------ + RuntimeError + If the argument `mpi_log` is specified, package `mpi4py` is not installed. 
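Typical non-MPI usage of `set_log_handles`, assuming deepmd-kit (and therefore `deepmd_utils`) is installed; the log file name is arbitrary.

    import logging
    from pathlib import Path

    from deepmd_utils.loggers import set_log_handles

    # console handler plus a plain file handler; mpi_log is left unset
    set_log_handles(logging.INFO, log_path=Path("train.log"), mpi_log=None)
    logging.getLogger("deepmd").info("logging configured")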
+ + References + ---------- + https://groups.google.com/g/mpi4py/c/SaNzc8bdj6U + https://stackoverflow.com/questions/35869137/avoid-tensorflow-print-on-standard-error + https://stackoverflow.com/questions/56085015/suppress-openmp-debug-messages-when-running-tensorflow-on-cpu + + Notes + ----- + Logging levels: + + +---------+--------------+----------------+----------------+----------------+ + | | our notation | python logging | tensorflow cpp | OpenMP | + +=========+==============+================+================+================+ + | debug | 10 | 10 | 0 | 1/on/true/yes | + +---------+--------------+----------------+----------------+----------------+ + | info | 20 | 20 | 1 | 0/off/false/no | + +---------+--------------+----------------+----------------+----------------+ + | warning | 30 | 30 | 2 | 0/off/false/no | + +---------+--------------+----------------+----------------+----------------+ + | error | 40 | 40 | 3 | 0/off/false/no | + +---------+--------------+----------------+----------------+----------------+ + + """ + # silence logging for OpenMP when running on CPU if level is any other than debug + if level <= 10: + os.environ["KMP_WARNINGS"] = "FALSE" + + # set TF cpp internal logging level + os.environ["TF_CPP_MIN_LOG_LEVEL"] = str(int((level / 10) - 1)) + + # get root logger + root_log = logging.getLogger("deepmd") + root_log.propagate = False + + root_log.setLevel(level) + + # check if arguments are present + MPI = None + if mpi_log: + try: + from mpi4py import ( + MPI, + ) + except ImportError as e: + raise RuntimeError( + "You cannot specify 'mpi_log' when mpi4py not installed" + ) from e + + # * add console handler ************************************************************ + ch = logging.StreamHandler() + if MPI: + rank = MPI.COMM_WORLD.Get_rank() + if mpi_log == "master": + ch.setFormatter(CFORMATTER) + ch.addFilter(_MPIMasterFilter(rank)) + else: + ch.setFormatter(CFORMATTER_MPI) + ch.addFilter(_MPIRankFilter(rank)) + else: + ch.setFormatter(CFORMATTER) + + ch.setLevel(level) + ch.addFilter(_AppFilter()) + # clean old handlers before adding new one + root_log.handlers.clear() + root_log.addHandler(ch) + + # * add file handler *************************************************************** + if log_path: + # create directory + log_path.parent.mkdir(exist_ok=True, parents=True) + + fh = None + + if mpi_log == "master": + rank = MPI.COMM_WORLD.Get_rank() + if rank == 0: + fh = logging.FileHandler(log_path, mode="w") + fh.addFilter(_MPIMasterFilter(rank)) + fh.setFormatter(FFORMATTER) + elif mpi_log == "collect": + rank = MPI.COMM_WORLD.Get_rank() + fh = _MPIHandler(log_path, MPI, mode=MPI.MODE_WRONLY | MPI.MODE_CREATE) + fh.addFilter(_MPIRankFilter(rank)) + fh.setFormatter(FFORMATTER_MPI) + elif mpi_log == "workers": + rank = MPI.COMM_WORLD.Get_rank() + # if file has suffix than inser rank number before suffix + # e.g deepmd.log -> deepmd_.log + # if no suffix is present, insert rank as suffix + # e.g. deepmdlog -> deepmdlog. 
+ if log_path.suffix: + worker_log = (log_path.parent / f"{log_path.stem}_{rank}").with_suffix( + log_path.suffix + ) + else: + worker_log = log_path.with_suffix(f".{rank}") + + fh = logging.FileHandler(worker_log, mode="w") + fh.setFormatter(FFORMATTER) + else: + fh = logging.FileHandler(log_path, mode="w") + fh.setFormatter(FFORMATTER) + + if fh: + fh.setLevel(level) + fh.addFilter(_AppFilter()) + root_log.addHandler(fh) diff --git a/deepmd_cli/main.py b/deepmd_utils/main.py similarity index 96% rename from deepmd_cli/main.py rename to deepmd_utils/main.py index bffc1c6911..19afaeee1f 100644 --- a/deepmd_cli/main.py +++ b/deepmd_utils/main.py @@ -1,4 +1,9 @@ # SPDX-License-Identifier: LGPL-3.0-or-later +"""The entry points for DeePMD-kit. + +If only printing the help message, this module does not call +the main DeePMD-kit module to avoid the slow import of TensorFlow. +""" import argparse import logging import textwrap @@ -8,7 +13,7 @@ ) try: - from deepmd_cli._version import version as __version__ + from deepmd_utils._version import version as __version__ except ImportError: __version__ = "unknown" @@ -547,10 +552,26 @@ def main_parser() -> argparse.ArgumentParser: parents=[parser_log], help="train nvnmd model", formatter_class=argparse.ArgumentDefaultsHelpFormatter, + epilog=textwrap.dedent( + """\ + examples: + dp train-nvnmd input_cnn.json -s s1 + dp train-nvnmd input_qnn.json -s s2 + dp train-nvnmd input_cnn.json -s s1 --restart model.ckpt + dp train-nvnmd input_cnn.json -s s2 --init-model model.ckpt + """ + ), ) parser_train_nvnmd.add_argument( "INPUT", help="the input parameter file in json format" ) + parser_train_nvnmd.add_argument( + "-i", + "--init-model", + type=str, + default=None, + help="Initialize the model by the provided path prefix of checkpoint files.", + ) parser_train_nvnmd.add_argument( "-r", "--restart", diff --git a/deepmd_utils/model_format/__init__.py b/deepmd_utils/model_format/__init__.py new file mode 100644 index 0000000000..253bca3507 --- /dev/null +++ b/deepmd_utils/model_format/__init__.py @@ -0,0 +1,60 @@ +# SPDX-License-Identifier: LGPL-3.0-or-later +from .common import ( + DEFAULT_PRECISION, + PRECISION_DICT, + NativeOP, +) +from .env_mat import ( + EnvMat, +) +from .network import ( + EmbeddingNet, + FittingNet, + NativeLayer, + NativeNet, + NetworkCollection, + load_dp_model, + make_embedding_network, + make_fitting_network, + make_multilayer_network, + save_dp_model, + traverse_model_dict, +) +from .output_def import ( + FittingOutputDef, + ModelOutputDef, + OutputVariableDef, + fitting_check_output, + get_deriv_name, + get_reduce_name, + model_check_output, +) +from .se_e2_a import ( + DescrptSeA, +) + +__all__ = [ + "DescrptSeA", + "EnvMat", + "make_multilayer_network", + "make_embedding_network", + "make_fitting_network", + "EmbeddingNet", + "FittingNet", + "NativeLayer", + "NativeNet", + "NetworkCollection", + "NativeOP", + "load_dp_model", + "save_dp_model", + "traverse_model_dict", + "PRECISION_DICT", + "DEFAULT_PRECISION", + "ModelOutputDef", + "FittingOutputDef", + "OutputVariableDef", + "model_check_output", + "fitting_check_output", + "get_reduce_name", + "get_deriv_name", +] diff --git a/deepmd_utils/model_format/common.py b/deepmd_utils/model_format/common.py new file mode 100644 index 0000000000..d032e5d5df --- /dev/null +++ b/deepmd_utils/model_format/common.py @@ -0,0 +1,28 @@ +# SPDX-License-Identifier: LGPL-3.0-or-later +from abc import ( + ABC, +) + +import numpy as np + +PRECISION_DICT = { + "float16": np.float16, + "float32": 
np.float32, + "float64": np.float64, + "half": np.float16, + "single": np.float32, + "double": np.float64, +} +DEFAULT_PRECISION = "float64" + + +class NativeOP(ABC): + """The unit operation of a native model.""" + + def call(self, *args, **kwargs): + """Forward pass in NumPy implementation.""" + raise NotImplementedError + + def __call__(self, *args, **kwargs): + """Forward pass in NumPy implementation.""" + return self.call(*args, **kwargs) diff --git a/deepmd_utils/model_format/env_mat.py b/deepmd_utils/model_format/env_mat.py new file mode 100644 index 0000000000..7822bd7d0c --- /dev/null +++ b/deepmd_utils/model_format/env_mat.py @@ -0,0 +1,129 @@ +# SPDX-License-Identifier: LGPL-3.0-or-later +from typing import ( + Optional, + Union, +) + +import numpy as np + +from .common import ( + NativeOP, +) + + +def compute_smooth_weight( + distance: np.ndarray, + rmin: float, + rmax: float, +): + """Compute smooth weight for descriptor elements.""" + min_mask = distance <= rmin + max_mask = distance >= rmax + mid_mask = np.logical_not(np.logical_or(min_mask, max_mask)) + uu = (distance - rmin) / (rmax - rmin) + vv = uu * uu * uu * (-6.0 * uu * uu + 15.0 * uu - 10.0) + 1.0 + return vv * mid_mask + min_mask + + +def _make_env_mat( + nlist, + coord, + rcut: float, + ruct_smth: float, +): + """Make smooth environment matrix.""" + nf, nloc, nnei = nlist.shape + # nf x nall x 3 + coord = coord.reshape(nf, -1, 3) + mask = nlist >= 0 + nlist = nlist * mask + # nf x (nloc x nnei) x 3 + index = np.tile(nlist.reshape(nf, -1, 1), (1, 1, 3)) + coord_r = np.take_along_axis(coord, index, 1) + # nf x nloc x nnei x 3 + coord_r = coord_r.reshape(nf, nloc, nnei, 3) + # nf x nloc x 1 x 3 + coord_l = coord[:, :nloc].reshape(nf, -1, 1, 3) + # nf x nloc x nnei x 3 + diff = coord_r - coord_l + # nf x nloc x nnei + length = np.linalg.norm(diff, axis=-1, keepdims=True) + # for index 0 nloc atom + length = length + ~np.expand_dims(mask, -1) + t0 = 1 / length + t1 = diff / length**2 + weight = compute_smooth_weight(length, ruct_smth, rcut) + env_mat_se_a = np.concatenate([t0, t1], axis=-1) * weight * np.expand_dims(mask, -1) + return env_mat_se_a, diff * np.expand_dims(mask, -1), weight + + +class EnvMat(NativeOP): + def __init__( + self, + rcut, + rcut_smth, + ): + self.rcut = rcut + self.rcut_smth = rcut_smth + + def call( + self, + coord_ext: np.ndarray, + atype_ext: np.ndarray, + nlist: np.ndarray, + davg: Optional[np.ndarray] = None, + dstd: Optional[np.ndarray] = None, + ) -> Union[np.ndarray, np.ndarray]: + """Compute the environment matrix. + + Parameters + ---------- + nlist + The neighbor list. shape: nf x nloc x nnei + coord_ext + The extended coordinates of atoms. shape: nf x (nallx3) + atype_ext + The extended aotm types. shape: nf x nall + davg + The data avg. shape: nt x nnei x 4 + dstd + The inverse of data std. shape: nt x nnei x 4 + + Returns + ------- + env_mat + The environment matrix. shape: nf x nloc x nnei x 4 + switch + The value of switch function. 
shape: nf x nloc x nnei + """ + em, sw = self._call(nlist, coord_ext) + nf, nloc, nnei = nlist.shape + atype = atype_ext[:, :nloc] + if davg is not None: + em -= davg[atype] + if dstd is not None: + em /= dstd[atype] + return em, sw + + def _call( + self, + nlist, + coord_ext, + ): + em, diff, ww = _make_env_mat(nlist, coord_ext, self.rcut, self.rcut_smth) + return em, ww + + def serialize( + self, + ) -> dict: + return { + "rcut": self.rcut, + "rcut_smth": self.rcut_smth, + } + + @classmethod + def deserialize( + cls, + data: dict, + ) -> "EnvMat": + return cls(**data) diff --git a/deepmd_utils/model_format/network.py b/deepmd_utils/model_format/network.py new file mode 100644 index 0000000000..71ed659787 --- /dev/null +++ b/deepmd_utils/model_format/network.py @@ -0,0 +1,692 @@ +# SPDX-License-Identifier: LGPL-3.0-or-later +"""Native DP model format for multiple backends. + +See issue #2982 for more information. +""" +import copy +import itertools +import json +from typing import ( + ClassVar, + Dict, + List, + Optional, + Union, +) + +import h5py +import numpy as np + +try: + from deepmd_utils._version import version as __version__ +except ImportError: + __version__ = "unknown" + +from .common import ( + DEFAULT_PRECISION, + PRECISION_DICT, + NativeOP, +) + + +def traverse_model_dict(model_obj, callback: callable, is_variable: bool = False): + """Traverse a model dict and call callback on each variable. + + Parameters + ---------- + model_obj : object + The model object to traverse. + callback : callable + The callback function to call on each variable. + is_variable : bool, optional + Whether the current node is a variable. + + Returns + ------- + object + The model object after traversing. + """ + if isinstance(model_obj, dict): + for kk, vv in model_obj.items(): + model_obj[kk] = traverse_model_dict( + vv, callback, is_variable=is_variable or kk == "@variables" + ) + elif isinstance(model_obj, list): + for ii, vv in enumerate(model_obj): + model_obj[ii] = traverse_model_dict(vv, callback, is_variable=is_variable) + elif is_variable: + model_obj = callback(model_obj) + return model_obj + + +class Counter: + """A callable counter. + + Examples + -------- + >>> counter = Counter() + >>> counter() + 0 + >>> counter() + 1 + """ + + def __init__(self): + self.count = -1 + + def __call__(self): + self.count += 1 + return self.count + + +def save_dp_model(filename: str, model_dict: dict, extra_info: Optional[dict] = None): + """Save a DP model to a file in the native format. + + Parameters + ---------- + filename : str + The filename to save to. + model_dict : dict + The model dict to save. + extra_info : dict, optional + Extra meta information to save. + """ + model_dict = model_dict.copy() + variable_counter = Counter() + if extra_info is not None: + extra_info = extra_info.copy() + else: + extra_info = {} + with h5py.File(filename, "w") as f: + model_dict = traverse_model_dict( + model_dict, + lambda x: f.create_dataset( + f"variable_{variable_counter():04d}", data=x + ).name, + ) + save_dict = { + "model": model_dict, + "software": "deepmd-kit", + "version": __version__, + **extra_info, + } + f.attrs["json"] = json.dumps(save_dict, separators=(",", ":")) + + +def load_dp_model(filename: str) -> dict: + """Load a DP model from a file in the native format. + + Parameters + ---------- + filename : str + The filename to load from. + + Returns + ------- + dict + The loaded model dict, including meta information. 
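+
+    Examples
+    --------
+    A minimal save/load round-trip sketch; the file name and the nested
+    keys under ``@variables`` are illustrative only:
+
+    >>> model = {"descriptor": {"@variables": {"w": np.zeros((2, 3))}}}
+    >>> save_dp_model("example.dp", model)
+    >>> loaded = load_dp_model("example.dp")
+    >>> loaded["model"]["descriptor"]["@variables"]["w"].shape
+    (2, 3)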
+    """
+    with h5py.File(filename, "r") as f:
+        model_dict = json.loads(f.attrs["json"])
+        model_dict = traverse_model_dict(model_dict, lambda x: f[x][()].copy())
+    return model_dict
+
+
+class NativeLayer(NativeOP):
+    """Native representation of a layer.
+
+    Parameters
+    ----------
+    num_in : int
+        The input dimension of the layer.
+    num_out : int
+        The output dimension of the layer.
+    bias : bool, optional
+        Whether the layer has a bias term.
+    use_timestep : bool, optional
+        Whether to use a timestep (idt) parameter, as in the resnet architecture.
+    activation_function : str, optional
+        The activation function of the layer.
+    resnet : bool, optional
+        Whether the layer is a residual layer.
+    precision : str, optional
+        Floating point precision of the layer parameters.
+    """
+
+    def __init__(
+        self,
+        num_in,
+        num_out,
+        bias: bool = True,
+        use_timestep: bool = False,
+        activation_function: Optional[str] = None,
+        resnet: bool = False,
+        precision: str = DEFAULT_PRECISION,
+    ) -> None:
+        prec = PRECISION_DICT[precision.lower()]
+        self.precision = precision
+        rng = np.random.default_rng()
+        self.w = rng.normal(size=(num_in, num_out)).astype(prec)
+        self.b = rng.normal(size=(num_out,)).astype(prec) if bias else None
+        self.idt = rng.normal(size=(num_out,)).astype(prec) if use_timestep else None
+        self.activation_function = (
+            activation_function if activation_function is not None else "none"
+        )
+        self.resnet = resnet
+        self.check_type_consistency()
+        self.check_shape_consistency()
+
+    def serialize(self) -> dict:
+        """Serialize the layer to a dict.
+
+        Returns
+        -------
+        dict
+            The serialized layer.
+        """
+        data = {
+            "w": self.w,
+            "b": self.b,
+            "idt": self.idt,
+        }
+        return {
+            "bias": self.b is not None,
+            "use_timestep": self.idt is not None,
+            "activation_function": self.activation_function,
+            "resnet": self.resnet,
+            "precision": self.precision,
+            "@variables": data,
+        }
+
+    @classmethod
+    def deserialize(cls, data: dict) -> "NativeLayer":
+        """Deserialize the layer from a dict.
+
+        Parameters
+        ----------
+        data : dict
+            The dict to deserialize from.
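+
+        Examples
+        --------
+        A minimal round-trip sketch; the layer sizes are illustrative and
+        the weights are randomly initialized:
+
+        >>> layer = NativeLayer(2, 3)
+        >>> restored = NativeLayer.deserialize(layer.serialize())
+        >>> np.allclose(restored.w, layer.w)
+        True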
+ """ + data = copy.deepcopy(data) + variables = data.pop("@variables") + assert variables["w"] is not None and len(variables["w"].shape) == 2 + num_in, num_out = variables["w"].shape + obj = cls( + num_in, + num_out, + **data, + ) + obj.w, obj.b, obj.idt = ( + variables["w"], + variables.get("b", None), + variables.get("idt", None), + ) + obj.check_shape_consistency() + return obj + + def check_shape_consistency(self): + if self.b is not None and self.w.shape[1] != self.b.shape[0]: + raise ValueError( + f"dim 1 of w {self.w.shape[1]} is not equal to shape " + f"of b {self.b.shape[0]}", + ) + if self.idt is not None and self.w.shape[1] != self.idt.shape[0]: + raise ValueError( + f"dim 1 of w {self.w.shape[1]} is not equal to shape " + f"of idt {self.idt.shape[0]}", + ) + + def check_type_consistency(self): + precision = self.precision + + def check_var(var): + if var is not None: + # assertion "float64" == "double" would fail + assert PRECISION_DICT[var.dtype.name] is PRECISION_DICT[precision] + + check_var(self.w) + check_var(self.b) + check_var(self.idt) + + def __setitem__(self, key, value): + if key in ("w", "matrix"): + self.w = value + elif key in ("b", "bias"): + self.b = value + elif key == "idt": + self.idt = value + elif key == "activation_function": + self.activation_function = value + elif key == "resnet": + self.resnet = value + elif key == "precision": + self.precision = value + else: + raise KeyError(key) + + def __getitem__(self, key): + if key in ("w", "matrix"): + return self.w + elif key in ("b", "bias"): + return self.b + elif key == "idt": + return self.idt + elif key == "activation_function": + return self.activation_function + elif key == "resnet": + return self.resnet + elif key == "precision": + return self.precision + else: + raise KeyError(key) + + def dim_in(self) -> int: + return self.w.shape[0] + + def dim_out(self) -> int: + return self.w.shape[1] + + def call(self, x: np.ndarray) -> np.ndarray: + """Forward pass. + + Parameters + ---------- + x : np.ndarray + The input. + + Returns + ------- + np.ndarray + The output. + """ + if self.w is None or self.activation_function is None: + raise ValueError("w, b, and activation_function must be set") + if self.activation_function == "tanh": + fn = np.tanh + elif self.activation_function.lower() == "none": + + def fn(x): + return x + else: + raise NotImplementedError(self.activation_function) + y = ( + np.matmul(x, self.w) + self.b + if self.b is not None + else np.matmul(x, self.w) + ) + y = fn(y) + if self.idt is not None: + y *= self.idt + if self.resnet and self.w.shape[1] == self.w.shape[0]: + y += x + elif self.resnet and self.w.shape[1] == 2 * self.w.shape[0]: + y += np.concatenate([x, x], axis=-1) + return y + + +def make_multilayer_network(T_NetworkLayer, ModuleBase): + class NN(ModuleBase): + """Native representation of a neural network. + + Parameters + ---------- + layers : list[NativeLayer], optional + The layers of the network. + """ + + def __init__(self, layers: Optional[List[dict]] = None) -> None: + super().__init__() + if layers is None: + layers = [] + self.layers = [T_NetworkLayer.deserialize(layer) for layer in layers] + self.check_shape_consistency() + + def serialize(self) -> dict: + """Serialize the network to a dict. + + Returns + ------- + dict + The serialized network. + """ + return {"layers": [layer.serialize() for layer in self.layers]} + + @classmethod + def deserialize(cls, data: dict) -> "NN": + """Deserialize the network from a dict. 
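+
+            Each entry of ``data["layers"]`` is deserialized with
+            ``T_NetworkLayer.deserialize``, in order.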
+ + Parameters + ---------- + data : dict + The dict to deserialize from. + """ + return cls(data["layers"]) + + def __getitem__(self, key): + assert isinstance(key, int) + return self.layers[key] + + def __setitem__(self, key, value): + assert isinstance(key, int) + self.layers[key] = value + + def check_shape_consistency(self): + for ii in range(len(self.layers) - 1): + if self.layers[ii].dim_out() != self.layers[ii + 1].dim_in(): + raise ValueError( + f"the dim of layer {ii} output {self.layers[ii].dim_out} ", + f"does not match the dim of layer {ii+1} ", + f"output {self.layers[ii].dim_out}", + ) + + def call(self, x): + """Forward pass. + + Parameters + ---------- + x : np.ndarray + The input. + + Returns + ------- + np.ndarray + The output. + """ + for layer in self.layers: + x = layer(x) + return x + + return NN + + +NativeNet = make_multilayer_network(NativeLayer, NativeOP) + + +def make_embedding_network(T_Network, T_NetworkLayer): + class EN(T_Network): + """The embedding network. + + Parameters + ---------- + in_dim + Input dimension. + neuron + The number of neurons in each layer. The output dimension + is the same as the dimension of the last layer. + activation_function + The activation function. + resnet_dt + Use time step at the resnet architecture. + precision + Floating point precision for the model paramters. + + """ + + def __init__( + self, + in_dim, + neuron: List[int] = [24, 48, 96], + activation_function: str = "tanh", + resnet_dt: bool = False, + precision: str = DEFAULT_PRECISION, + ): + layers = [] + i_in = in_dim + for idx, ii in enumerate(neuron): + i_ot = ii + layers.append( + T_NetworkLayer( + i_in, + i_ot, + bias=True, + use_timestep=resnet_dt, + activation_function=activation_function, + resnet=True, + precision=precision, + ).serialize() + ) + i_in = i_ot + super().__init__(layers) + self.in_dim = in_dim + self.neuron = neuron + self.activation_function = activation_function + self.resnet_dt = resnet_dt + self.precision = precision + + def serialize(self) -> dict: + """Serialize the network to a dict. + + Returns + ------- + dict + The serialized network. + """ + return { + "in_dim": self.in_dim, + "neuron": self.neuron.copy(), + "activation_function": self.activation_function, + "resnet_dt": self.resnet_dt, + "precision": self.precision, + "layers": [layer.serialize() for layer in self.layers], + } + + @classmethod + def deserialize(cls, data: dict) -> "EmbeddingNet": + """Deserialize the network from a dict. + + Parameters + ---------- + data : dict + The dict to deserialize from. + """ + data = copy.deepcopy(data) + layers = data.pop("layers") + obj = cls(**data) + super(EN, obj).__init__(layers) + return obj + + return EN + + +EmbeddingNet = make_embedding_network(NativeNet, NativeLayer) + + +def make_fitting_network(T_EmbeddingNet, T_Network, T_NetworkLayer): + class FN(T_EmbeddingNet): + """The fitting network. It may be implemented as an embedding + net connected with a linear output layer. + + Parameters + ---------- + in_dim + Input dimension. + out_dim + Output dimension + neuron + The number of neurons in each hidden layer. + activation_function + The activation function. + resnet_dt + Use time step at the resnet architecture. + precision + Floating point precision for the model paramters. + bias_out + The last linear layer has bias. 
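+
+        Examples
+        --------
+        A minimal sketch mapping a 4-dimensional input to a single output;
+        the sizes are illustrative and the weights are randomly initialized:
+
+        >>> net = FittingNet(4, 1, neuron=[8, 8])
+        >>> net(np.zeros((5, 4))).shape
+        (5, 1)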
+ + """ + + def __init__( + self, + in_dim, + out_dim, + neuron: List[int] = [24, 48, 96], + activation_function: str = "tanh", + resnet_dt: bool = False, + precision: str = DEFAULT_PRECISION, + bias_out: bool = True, + ): + super().__init__( + in_dim, + neuron=neuron, + activation_function=activation_function, + resnet_dt=resnet_dt, + precision=precision, + ) + i_in, i_ot = neuron[-1], out_dim + self.layers.append( + T_NetworkLayer( + i_in, + i_ot, + bias=bias_out, + use_timestep=False, + activation_function=None, + resnet=False, + precision=precision, + ) + ) + self.out_dim = out_dim + self.bias_out = bias_out + + def serialize(self) -> dict: + """Serialize the network to a dict. + + Returns + ------- + dict + The serialized network. + """ + return { + "in_dim": self.in_dim, + "out_dim": self.out_dim, + "neuron": self.neuron.copy(), + "activation_function": self.activation_function, + "resnet_dt": self.resnet_dt, + "precision": self.precision, + "bias_out": self.bias_out, + "layers": [layer.serialize() for layer in self.layers], + } + + @classmethod + def deserialize(cls, data: dict) -> "FittingNet": + """Deserialize the network from a dict. + + Parameters + ---------- + data : dict + The dict to deserialize from. + """ + data = copy.deepcopy(data) + layers = data.pop("layers") + obj = cls(**data) + T_Network.__init__(obj, layers) + return obj + + return FN + + +FittingNet = make_fitting_network(EmbeddingNet, NativeNet, NativeLayer) + + +class NetworkCollection: + """A collection of networks for multiple elements. + + The number of dimesions for types might be 0, 1, or 2. + - 0: embedding or fitting with type embedding, in () + - 1: embedding with type_one_side, or fitting, in (type_i) + - 2: embedding without type_one_side, in (type_i, type_j) + + Parameters + ---------- + ndim : int + The number of dimensions. + network_type : str, optional + The type of the network. + networks : dict, optional + The networks to initialize with. + """ + + # subclass may override this + NETWORK_TYPE_MAP: ClassVar[Dict[str, type]] = { + "network": NativeNet, + "embedding_network": EmbeddingNet, + "fitting_network": FittingNet, + } + + def __init__( + self, + ndim: int, + ntypes: int, + network_type: str = "network", + networks: List[Union[NativeNet, dict]] = [], + ): + self.ndim = ndim + self.ntypes = ntypes + self.network_type = self.NETWORK_TYPE_MAP[network_type] + self._networks = [None for ii in range(ntypes**ndim)] + for ii, network in enumerate(networks): + self[ii] = network + if len(networks): + self.check_completeness() + + def check_completeness(self): + """Check whether the collection is complete. + + Raises + ------ + RuntimeError + If the collection is incomplete. 
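+
+        Examples
+        --------
+        A minimal sketch with one type dimension and two atom types; the
+        empty ``NativeNet`` placeholders are for illustration only:
+
+        >>> nc = NetworkCollection(1, 2, networks=[NativeNet(), NativeNet()])
+        >>> nc.check_completeness()  # complete, so no error is raised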
+ """ + for tt in itertools.product(range(self.ntypes), repeat=self.ndim): + if self[tuple(tt)] is None: + raise RuntimeError(f"network for {tt} not found") + + def _convert_key(self, key): + if isinstance(key, int): + idx = key + else: + if isinstance(key, tuple): + pass + elif isinstance(key, str): + key = tuple([int(tt) for tt in key.split("_")[1:]]) + else: + raise TypeError(key) + assert isinstance(key, tuple) + assert len(key) == self.ndim + idx = sum([tt * self.ntypes**ii for ii, tt in enumerate(key)]) + return idx + + def __getitem__(self, key): + return self._networks[self._convert_key(key)] + + def __setitem__(self, key, value): + if isinstance(value, self.network_type): + pass + elif isinstance(value, dict): + value = self.network_type.deserialize(value) + else: + raise TypeError(value) + self._networks[self._convert_key(key)] = value + + def serialize(self) -> dict: + """Serialize the networks to a dict. + + Returns + ------- + dict + The serialized networks. + """ + network_type_map_inv = {v: k for k, v in self.NETWORK_TYPE_MAP.items()} + network_type_name = network_type_map_inv[self.network_type] + return { + "ndim": self.ndim, + "ntypes": self.ntypes, + "network_type": network_type_name, + "networks": [nn.serialize() for nn in self._networks], + } + + @classmethod + def deserialize(cls, data: dict) -> "NetworkCollection": + """Deserialize the networks from a dict. + + Parameters + ---------- + data : dict + The dict to deserialize from. + """ + return cls(**data) diff --git a/deepmd_utils/model_format/output_def.py b/deepmd_utils/model_format/output_def.py new file mode 100644 index 0000000000..268dc21ea6 --- /dev/null +++ b/deepmd_utils/model_format/output_def.py @@ -0,0 +1,281 @@ +# SPDX-License-Identifier: LGPL-3.0-or-later +from typing import ( + Dict, + List, + Tuple, +) + + +def check_shape( + shape: List[int], + def_shape: List[int], +): + """Check if the shape satisfies the defined shape.""" + assert len(shape) == len(def_shape) + if def_shape[-1] == -1: + if list(shape[:-1]) != def_shape[:-1]: + raise ValueError(f"{shape[:-1]} shape not matching def {def_shape[:-1]}") + else: + if list(shape) != def_shape: + raise ValueError(f"{shape} shape not matching def {def_shape}") + + +def check_var(var, var_def): + if var_def.atomic: + # var.shape == [nf, nloc, *var_def.shape] + if len(var.shape) != len(var_def.shape) + 2: + raise ValueError(f"{var.shape[2:]} length not matching def {var_def.shape}") + check_shape(list(var.shape[2:]), var_def.shape) + else: + # var.shape == [nf, *var_def.shape] + if len(var.shape) != len(var_def.shape) + 1: + raise ValueError(f"{var.shape[1:]} length not matching def {var_def.shape}") + check_shape(list(var.shape[1:]), var_def.shape) + + +def model_check_output(cls): + """Check if the output of the Model is consistent with the definition. + + Two methods are assumed to be provided by the Model: + 1. Model.output_def that gives the output definition. + 2. Model.__call__ that defines the forward path of the model. 
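+
+    Examples
+    --------
+    A minimal sketch of a conforming model; the class name, the output
+    name ``energy`` and the shapes below are illustrative only:
+
+    >>> import numpy as np
+    >>> @model_check_output
+    ... class DummyModel:
+    ...     def output_def(self):
+    ...         return ModelOutputDef(
+    ...             FittingOutputDef(
+    ...                 [OutputVariableDef("energy", [1], reduciable=True)]
+    ...             )
+    ...         )
+    ...     def __call__(self):
+    ...         return {
+    ...             "energy": np.zeros((2, 3, 1)),
+    ...             "energy_redu": np.zeros((2, 1)),
+    ...         }
+    >>> sorted(DummyModel()().keys())
+    ['energy', 'energy_redu']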
+ + """ + + class wrapper(cls): + def __init__( + self, + *args, + **kwargs, + ): + super().__init__(*args, **kwargs) + self.md = self.output_def() + + def __call__( + self, + *args, + **kwargs, + ): + ret = cls.__call__(self, *args, **kwargs) + for kk in self.md.keys_outp(): + dd = self.md[kk] + check_var(ret[kk], dd) + if dd.reduciable: + rk = get_reduce_name(kk) + check_var(ret[rk], self.md[rk]) + if dd.differentiable: + dnr, dnc = get_deriv_name(kk) + check_var(ret[dnr], self.md[dnr]) + check_var(ret[dnc], self.md[dnc]) + return ret + + return wrapper + + +def fitting_check_output(cls): + """Check if the output of the Fitting is consistent with the definition. + + Two methods are assumed to be provided by the Fitting: + 1. Fitting.output_def that gives the output definition. + 2. Fitting.__call__ defines the forward path of the fitting. + + """ + + class wrapper(cls): + def __init__( + self, + *args, + **kwargs, + ): + super().__init__(*args, **kwargs) + self.md = self.output_def() + + def __call__( + self, + *args, + **kwargs, + ): + ret = cls.__call__(self, *args, **kwargs) + for kk in self.md.keys(): + dd = self.md[kk] + check_var(ret[kk], dd) + return ret + + return wrapper + + +class OutputVariableDef: + """Defines the shape and other properties of the one output variable. + + It is assume that the fitting network output variables for each + local atom. This class defines one output variable, including its + name, shape, reducibility and differentiability. + + Parameters + ---------- + name + Name of the output variable. Notice that the xxxx_redu, + xxxx_derv_c, xxxx_derv_r are reserved names that should + not be used to define variables. + shape + The shape of the variable. e.g. energy should be [1], + dipole should be [3], polarizabilty should be [3,3]. + reduciable + If the variable is reduced. + differentiable + If the variable is differentiated with respect to coordinates + of atoms and cell tensor (pbc case). Only reduciable variable + are differentiable. + + """ + + def __init__( + self, + name: str, + shape: List[int], + reduciable: bool = False, + differentiable: bool = False, + atomic: bool = True, + ): + self.name = name + self.shape = list(shape) + self.atomic = atomic + self.reduciable = reduciable + self.differentiable = differentiable + if not self.reduciable and self.differentiable: + raise ValueError("only reduciable variable are differentiable") + + +class FittingOutputDef: + """Defines the shapes and other properties of the fitting network outputs. + + It is assume that the fitting network output variables for each + local atom. This class defines all the outputs. + + Parameters + ---------- + var_defs + List of output variable definitions. + + """ + + def __init__( + self, + var_defs: List[OutputVariableDef], + ): + self.var_defs = {vv.name: vv for vv in var_defs} + + def __getitem__( + self, + key: str, + ) -> OutputVariableDef: + return self.var_defs[key] + + def get_data(self) -> Dict[str, OutputVariableDef]: + return self.var_defs + + def keys(self): + return self.var_defs.keys() + + +class ModelOutputDef: + """Defines the shapes and other properties of the model outputs. + + The model reduce and differentiate fitting outputs if applicable. + If a variable is named by foo, then the reduced variable is called + foo_redu, the derivative w.r.t. coordinates is called foo_derv_r + and the derivative w.r.t. cell is called foo_derv_c. 
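+    For example, a fitting output named ``energy`` gives rise to
+    ``energy_redu``, ``energy_derv_r`` and ``energy_derv_c`` in the model
+    output.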
+ + Parameters + ---------- + fit_defs + Definition for the fitting net output + + """ + + def __init__( + self, + fit_defs: FittingOutputDef, + ): + self.def_outp = fit_defs + self.def_redu = do_reduce(self.def_outp) + self.def_derv_r, self.def_derv_c = do_derivative(self.def_outp) + self.var_defs: Dict[str, OutputVariableDef] = {} + for ii in [ + self.def_outp.get_data(), + self.def_redu, + self.def_derv_c, + self.def_derv_r, + ]: + self.var_defs.update(ii) + + def __getitem__( + self, + key: str, + ) -> OutputVariableDef: + return self.var_defs[key] + + def get_data( + self, + key: str, + ) -> Dict[str, OutputVariableDef]: + return self.var_defs + + def keys(self): + return self.var_defs.keys() + + def keys_outp(self): + return self.def_outp.keys() + + def keys_redu(self): + return self.def_redu.keys() + + def keys_derv_r(self): + return self.def_derv_r.keys() + + def keys_derv_c(self): + return self.def_derv_c.keys() + + +def get_reduce_name(name: str) -> str: + return name + "_redu" + + +def get_deriv_name(name: str) -> Tuple[str, str]: + return name + "_derv_r", name + "_derv_c" + + +def do_reduce( + def_outp: FittingOutputDef, +) -> Dict[str, OutputVariableDef]: + def_redu: Dict[str, OutputVariableDef] = {} + for kk, vv in def_outp.get_data().items(): + if vv.reduciable: + rk = get_reduce_name(kk) + def_redu[rk] = OutputVariableDef( + rk, vv.shape, reduciable=False, differentiable=False, atomic=False + ) + return def_redu + + +def do_derivative( + def_outp: FittingOutputDef, +) -> Tuple[Dict[str, OutputVariableDef], Dict[str, OutputVariableDef]]: + def_derv_r: Dict[str, OutputVariableDef] = {} + def_derv_c: Dict[str, OutputVariableDef] = {} + for kk, vv in def_outp.get_data().items(): + if vv.differentiable: + rkr, rkc = get_deriv_name(kk) + def_derv_r[rkr] = OutputVariableDef( + rkr, + vv.shape + [3], # noqa: RUF005 + reduciable=False, + differentiable=False, + ) + def_derv_c[rkc] = OutputVariableDef( + rkc, + vv.shape + [3, 3], # noqa: RUF005 + reduciable=True, + differentiable=False, + ) + return def_derv_r, def_derv_c diff --git a/deepmd_utils/model_format/se_e2_a.py b/deepmd_utils/model_format/se_e2_a.py new file mode 100644 index 0000000000..b9143ee360 --- /dev/null +++ b/deepmd_utils/model_format/se_e2_a.py @@ -0,0 +1,284 @@ +# SPDX-License-Identifier: LGPL-3.0-or-later +import numpy as np + +try: + from deepmd_utils._version import version as __version__ +except ImportError: + __version__ = "unknown" + +import copy +from typing import ( + Any, + List, + Optional, +) + +from .common import ( + DEFAULT_PRECISION, + NativeOP, +) +from .env_mat import ( + EnvMat, +) +from .network import ( + EmbeddingNet, + NetworkCollection, +) + + +class DescrptSeA(NativeOP): + r"""DeepPot-SE constructed from all information (both angular and radial) of + atomic configurations. The embedding takes the distance between atoms as input. + + The descriptor :math:`\mathcal{D}^i \in \mathcal{R}^{M_1 \times M_2}` is given by [1]_ + + .. math:: + \mathcal{D}^i = (\mathcal{G}^i)^T \mathcal{R}^i (\mathcal{R}^i)^T \mathcal{G}^i_< + + where :math:`\mathcal{R}^i \in \mathbb{R}^{N \times 4}` is the coordinate + matrix, and each row of :math:`\mathcal{R}^i` can be constructed as follows + + .. 
math:: + (\mathcal{R}^i)_j = [ + \begin{array}{c} + s(r_{ji}) & \frac{s(r_{ji})x_{ji}}{r_{ji}} & \frac{s(r_{ji})y_{ji}}{r_{ji}} & \frac{s(r_{ji})z_{ji}}{r_{ji}} + \end{array} + ] + + where :math:`\mathbf{R}_{ji}=\mathbf{R}_j-\mathbf{R}_i = (x_{ji}, y_{ji}, z_{ji})` is + the relative coordinate and :math:`r_{ji}=\lVert \mathbf{R}_{ji} \lVert` is its norm. + The switching function :math:`s(r)` is defined as: + + .. math:: + s(r)= + \begin{cases} + \frac{1}{r}, & r