diff --git a/.github/workflows/build_cc.yml b/.github/workflows/build_cc.yml index 775b88cfd3..a1ac032891 100644 --- a/.github/workflows/build_cc.yml +++ b/.github/workflows/build_cc.yml @@ -11,7 +11,7 @@ name: Build C++ jobs: buildcc: name: Build C++ - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 strategy: matrix: include: diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml index c912ece8d5..583e7785d9 100644 --- a/.github/workflows/codeql.yml +++ b/.github/workflows/codeql.yml @@ -13,7 +13,7 @@ concurrency: jobs: analyze: name: Analyze - runs-on: ${{ (matrix.language == 'swift' && 'macos-latest') || 'ubuntu-latest' }} + runs-on: ${{ (matrix.language == 'swift' && 'macos-latest') || 'ubuntu-22.04' }} timeout-minutes: ${{ (matrix.language == 'swift' && 120) || 360 }} permissions: actions: read diff --git a/.github/workflows/test_cc.yml b/.github/workflows/test_cc.yml index ebbfc4d960..768590980f 100644 --- a/.github/workflows/test_cc.yml +++ b/.github/workflows/test_cc.yml @@ -11,7 +11,7 @@ name: Test C++ jobs: testcc: name: Test C++ - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 strategy: matrix: check_memleak: [true, false] diff --git a/.github/workflows/test_python.yml b/.github/workflows/test_python.yml index 8274921909..87d7266e03 100644 --- a/.github/workflows/test_python.yml +++ b/.github/workflows/test_python.yml @@ -16,7 +16,7 @@ jobs: fail-fast: false matrix: group: [1, 2, 3, 4, 5, 6] - python: ["3.8", "3.12"] + python: ["3.9", "3.12"] steps: - uses: actions/checkout@v4 @@ -34,7 +34,7 @@ jobs: # existing TensorFlow package. Currently, it uses # TensorFlow in the build dependency, but if it # changes, setting `TENSORFLOW_ROOT`. - TENSORFLOW_VERSION: ${{ matrix.python == '3.8' && '2.13.1' || '2.16.1' }} + TENSORFLOW_VERSION: 2.16.1 DP_ENABLE_PYTORCH: 1 DP_BUILD_TESTING: 1 UV_EXTRA_INDEX_URL: "https://pypi.anaconda.org/njzjz/simple https://pypi.anaconda.org/mpi4py/simple" @@ -69,7 +69,7 @@ jobs: strategy: fail-fast: false matrix: - python: ["3.8", "3.12"] + python: ["3.9", "3.12"] needs: testpython steps: - name: Get durations from cache diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 5d34f39752..6a1d303f64 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -2,7 +2,7 @@ # See https://pre-commit.com/hooks.html for more hooks repos: - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v4.6.0 + rev: v5.0.0 hooks: - id: trailing-whitespace exclude: "^.+\\.pbtxt$" @@ -29,7 +29,7 @@ repos: exclude: ^source/3rdparty - repo: https://github.com/astral-sh/ruff-pre-commit # Ruff version. 
- rev: v0.6.7 + rev: v0.6.9 hooks: - id: ruff args: ["--fix"] @@ -47,15 +47,15 @@ repos: exclude: ^source/3rdparty # Python inside docs - repo: https://github.com/asottile/blacken-docs - rev: 1.18.0 + rev: 1.19.0 hooks: - id: blacken-docs # C++ - repo: https://github.com/pre-commit/mirrors-clang-format - rev: v18.1.8 + rev: v19.1.1 hooks: - id: clang-format - exclude: ^source/3rdparty|source/lib/src/gpu/cudart/.+\.inc + exclude: ^(source/3rdparty|source/lib/src/gpu/cudart/.+\.inc|.+\.ipynb$) # markdown, yaml, CSS, javascript - repo: https://github.com/pre-commit/mirrors-prettier rev: v4.0.0-alpha.8 @@ -146,7 +146,7 @@ repos: exclude: .pre-commit-config.yaml|source/lmp # customized pylint rules - repo: https://github.com/pylint-dev/pylint/ - rev: v3.3.0 + rev: v3.3.1 hooks: - id: pylint entry: env PYTHONPATH=source/checker pylint diff --git a/backend/dp_backend.py b/backend/dp_backend.py index dbd2d2a52b..81c3f20f19 100644 --- a/backend/dp_backend.py +++ b/backend/dp_backend.py @@ -1,10 +1,6 @@ # SPDX-License-Identifier: LGPL-3.0-or-later """A PEP-517 backend to find TensorFlow.""" -from typing import ( - List, -) - from scikit_build_core import build as _orig from .find_pytorch import ( @@ -26,7 +22,7 @@ ] -def __dir__() -> List[str]: +def __dir__() -> list[str]: return __all__ @@ -42,7 +38,7 @@ def __dir__() -> List[str]: def get_requires_for_build_wheel( config_settings: dict, -) -> List[str]: +) -> list[str]: return ( _orig.get_requires_for_build_wheel(config_settings) + find_tensorflow()[1] @@ -52,7 +48,7 @@ def get_requires_for_build_wheel( def get_requires_for_build_editable( config_settings: dict, -) -> List[str]: +) -> list[str]: return ( _orig.get_requires_for_build_editable(config_settings) + find_tensorflow()[1] diff --git a/backend/dynamic_metadata.py b/backend/dynamic_metadata.py index 83123e6e41..a66e9a2759 100644 --- a/backend/dynamic_metadata.py +++ b/backend/dynamic_metadata.py @@ -4,8 +4,6 @@ Path, ) from typing import ( - Dict, - List, Optional, ) @@ -27,13 +25,13 @@ __all__ = ["dynamic_metadata"] -def __dir__() -> List[str]: +def __dir__() -> list[str]: return __all__ def dynamic_metadata( field: str, - settings: Optional[Dict[str, object]] = None, + settings: Optional[dict[str, object]] = None, ): assert field in ["optional-dependencies", "entry-points", "scripts"] _, _, find_libpython_requires, extra_scripts, tf_version, pt_version = ( diff --git a/backend/find_pytorch.py b/backend/find_pytorch.py index 04f297a963..e01f4e84fe 100644 --- a/backend/find_pytorch.py +++ b/backend/find_pytorch.py @@ -18,9 +18,7 @@ get_path, ) from typing import ( - List, Optional, - Tuple, Union, ) @@ -30,7 +28,7 @@ @lru_cache -def find_pytorch() -> Tuple[Optional[str], List[str]]: +def find_pytorch() -> tuple[Optional[str], list[str]]: """Find PyTorch library. Tries to find PyTorch in the order of: diff --git a/backend/find_tensorflow.py b/backend/find_tensorflow.py index 514490a926..5b0de0b2dd 100644 --- a/backend/find_tensorflow.py +++ b/backend/find_tensorflow.py @@ -17,9 +17,7 @@ get_path, ) from typing import ( - List, Optional, - Tuple, Union, ) @@ -29,7 +27,7 @@ @lru_cache -def find_tensorflow() -> Tuple[Optional[str], List[str]]: +def find_tensorflow() -> tuple[Optional[str], list[str]]: """Find TensorFlow library. 
Tries to find TensorFlow in the order of: @@ -156,18 +154,15 @@ def get_tf_requirement(tf_version: str = "") -> dict: "tensorflow; platform_machine=='aarch64' or (platform_machine=='arm64' and platform_system == 'Darwin')", # https://github.com/tensorflow/tensorflow/issues/61830 "tensorflow-cpu!=2.15.*; platform_system=='Windows'", - # TODO: build(wheel): unpin h5py on aarch64 - # Revert after https://github.com/h5py/h5py/issues/2408 is fixed; - # or set UV_PREFER_BINARY when https://github.com/astral-sh/uv/issues/1794 is resolved. - # 3.6.0 is the first version to have aarch64 wheels. - "h5py>=3.6.0,<3.11.0; platform_system=='Linux' and platform_machine=='aarch64'", + # https://github.com/h5py/h5py/issues/2408 + "h5py>=3.6.0,!=3.11.0; platform_system=='Linux' and platform_machine=='aarch64'", *extra_requires, ], "gpu": [ "tensorflow", "tensorflow-metal; platform_machine=='arm64' and platform_system == 'Darwin'", # See above. - "h5py>=3.6.0,<3.11.0; platform_system=='Linux' and platform_machine=='aarch64'", + "h5py>=3.6.0,!=3.11.0; platform_system=='Linux' and platform_machine=='aarch64'", *extra_requires, ], **extra_select, diff --git a/backend/read_env.py b/backend/read_env.py index ae82778f4e..edc3600115 100644 --- a/backend/read_env.py +++ b/backend/read_env.py @@ -5,9 +5,6 @@ from functools import ( lru_cache, ) -from typing import ( - Tuple, -) from packaging.version import ( Version, @@ -24,7 +21,7 @@ @lru_cache -def get_argument_from_env() -> Tuple[str, list, list, dict, str, str]: +def get_argument_from_env() -> tuple[str, list, list, dict, str, str]: """Get the arguments from environment variables. The environment variables are assumed to be not changed during the build. diff --git a/deepmd/backend/backend.py b/deepmd/backend/backend.py index 8f7bca319e..3263169f6f 100644 --- a/deepmd/backend/backend.py +++ b/deepmd/backend/backend.py @@ -10,9 +10,6 @@ TYPE_CHECKING, Callable, ClassVar, - Dict, - List, - Type, ) from deepmd.utils.plugin import ( @@ -45,7 +42,7 @@ class Backend(PluginVariant, make_plugin_registry("backend")): """ @staticmethod - def get_backend(key: str) -> Type["Backend"]: + def get_backend(key: str) -> type["Backend"]: """Get the backend by key. Parameters @@ -61,7 +58,7 @@ def get_backend(key: str) -> Type["Backend"]: return Backend.get_class_by_type(key) @staticmethod - def get_backends() -> Dict[str, Type["Backend"]]: + def get_backends() -> dict[str, type["Backend"]]: """Get all the registered backend names. Returns @@ -74,7 +71,7 @@ def get_backends() -> Dict[str, Type["Backend"]]: @staticmethod def get_backends_by_feature( feature: "Backend.Feature", - ) -> Dict[str, Type["Backend"]]: + ) -> dict[str, type["Backend"]]: """Get all the registered backend names with a specific feature. Parameters @@ -94,7 +91,7 @@ def get_backends_by_feature( } @staticmethod - def detect_backend_by_model(filename: str) -> Type["Backend"]: + def detect_backend_by_model(filename: str) -> type["Backend"]: """Detect the backend of the given model file. Parameters @@ -128,7 +125,7 @@ class Feature(Flag): features: ClassVar[Feature] = Feature(0) """The features of the backend.""" - suffixes: ClassVar[List[str]] = [] + suffixes: ClassVar[list[str]] = [] """The supported suffixes of the saved model. 
The first element is considered as the default suffix.""" @@ -157,7 +154,7 @@ def entry_point_hook(self) -> Callable[["Namespace"], None]: @property @abstractmethod - def deep_eval(self) -> Type["DeepEvalBackend"]: + def deep_eval(self) -> type["DeepEvalBackend"]: """The Deep Eval backend of the backend. Returns @@ -169,7 +166,7 @@ def deep_eval(self) -> Type["DeepEvalBackend"]: @property @abstractmethod - def neighbor_stat(self) -> Type["NeighborStat"]: + def neighbor_stat(self) -> type["NeighborStat"]: """The neighbor statistics of the backend. Returns diff --git a/deepmd/backend/dpmodel.py b/deepmd/backend/dpmodel.py index c51d097d5a..7c21b256ae 100644 --- a/deepmd/backend/dpmodel.py +++ b/deepmd/backend/dpmodel.py @@ -3,8 +3,6 @@ TYPE_CHECKING, Callable, ClassVar, - List, - Type, ) from deepmd.backend.backend import ( @@ -37,7 +35,7 @@ class DPModelBackend(Backend): Backend.Feature.DEEP_EVAL | Backend.Feature.NEIGHBOR_STAT | Backend.Feature.IO ) """The features of the backend.""" - suffixes: ClassVar[List[str]] = [".dp", ".yaml", ".yml"] + suffixes: ClassVar[list[str]] = [".dp", ".yaml", ".yml"] """The suffixes of the backend.""" def is_available(self) -> bool: @@ -62,7 +60,7 @@ def entry_point_hook(self) -> Callable[["Namespace"], None]: raise NotImplementedError(f"Unsupported backend: {self.name}") @property - def deep_eval(self) -> Type["DeepEvalBackend"]: + def deep_eval(self) -> type["DeepEvalBackend"]: """The Deep Eval backend of the backend. Returns @@ -77,7 +75,7 @@ def deep_eval(self) -> Type["DeepEvalBackend"]: return DeepEval @property - def neighbor_stat(self) -> Type["NeighborStat"]: + def neighbor_stat(self) -> type["NeighborStat"]: """The neighbor statistics of the backend. Returns diff --git a/deepmd/backend/jax.py b/deepmd/backend/jax.py index ece0761772..db92d6bed1 100644 --- a/deepmd/backend/jax.py +++ b/deepmd/backend/jax.py @@ -6,8 +6,6 @@ TYPE_CHECKING, Callable, ClassVar, - List, - Type, ) from deepmd.backend.backend import ( @@ -41,7 +39,7 @@ class JAXBackend(Backend): # | Backend.Feature.IO ) """The features of the backend.""" - suffixes: ClassVar[List[str]] = [] + suffixes: ClassVar[list[str]] = [] """The suffixes of the backend.""" def is_available(self) -> bool: @@ -66,7 +64,7 @@ def entry_point_hook(self) -> Callable[["Namespace"], None]: raise NotImplementedError @property - def deep_eval(self) -> Type["DeepEvalBackend"]: + def deep_eval(self) -> type["DeepEvalBackend"]: """The Deep Eval backend of the backend. Returns @@ -77,7 +75,7 @@ def deep_eval(self) -> Type["DeepEvalBackend"]: raise NotImplementedError @property - def neighbor_stat(self) -> Type["NeighborStat"]: + def neighbor_stat(self) -> type["NeighborStat"]: """The neighbor statistics of the backend. Returns diff --git a/deepmd/backend/pytorch.py b/deepmd/backend/pytorch.py index fb7d30e994..f5b0dd92b2 100644 --- a/deepmd/backend/pytorch.py +++ b/deepmd/backend/pytorch.py @@ -6,8 +6,6 @@ TYPE_CHECKING, Callable, ClassVar, - List, - Type, ) from deepmd.backend.backend import ( @@ -41,7 +39,7 @@ class PyTorchBackend(Backend): | Backend.Feature.IO ) """The features of the backend.""" - suffixes: ClassVar[List[str]] = [".pth", ".pt"] + suffixes: ClassVar[list[str]] = [".pth", ".pt"] """The suffixes of the backend.""" def is_available(self) -> bool: @@ -68,7 +66,7 @@ def entry_point_hook(self) -> Callable[["Namespace"], None]: return deepmd_main @property - def deep_eval(self) -> Type["DeepEvalBackend"]: + def deep_eval(self) -> type["DeepEvalBackend"]: """The Deep Eval backend of the backend. 
Returns @@ -81,7 +79,7 @@ def deep_eval(self) -> Type["DeepEvalBackend"]: return DeepEvalPT @property - def neighbor_stat(self) -> Type["NeighborStat"]: + def neighbor_stat(self) -> type["NeighborStat"]: """The neighbor statistics of the backend. Returns diff --git a/deepmd/backend/suffix.py b/deepmd/backend/suffix.py index 273fbc0951..d694b43488 100644 --- a/deepmd/backend/suffix.py +++ b/deepmd/backend/suffix.py @@ -6,7 +6,6 @@ ) from typing import ( Optional, - Type, Union, ) @@ -18,7 +17,7 @@ def format_model_suffix( filename: str, feature: Optional[Backend.Feature] = None, - preferred_backend: Optional[Union[str, Type["Backend"]]] = None, + preferred_backend: Optional[Union[str, type["Backend"]]] = None, strict_prefer: Optional[bool] = None, ) -> str: """Check and format the suffixes of a filename. diff --git a/deepmd/backend/tensorflow.py b/deepmd/backend/tensorflow.py index 15b03ee7c8..6b73d7c469 100644 --- a/deepmd/backend/tensorflow.py +++ b/deepmd/backend/tensorflow.py @@ -6,8 +6,6 @@ TYPE_CHECKING, Callable, ClassVar, - List, - Type, ) from deepmd.backend.backend import ( @@ -41,7 +39,7 @@ class TensorFlowBackend(Backend): | Backend.Feature.IO ) """The features of the backend.""" - suffixes: ClassVar[List[str]] = [".pb"] + suffixes: ClassVar[list[str]] = [".pb"] """The suffixes of the backend.""" def is_available(self) -> bool: @@ -77,7 +75,7 @@ def entry_point_hook(self) -> Callable[["Namespace"], None]: return deepmd_main @property - def deep_eval(self) -> Type["DeepEvalBackend"]: + def deep_eval(self) -> type["DeepEvalBackend"]: """The Deep Eval backend of the backend. Returns @@ -90,7 +88,7 @@ def deep_eval(self) -> Type["DeepEvalBackend"]: return DeepEvalTF @property - def neighbor_stat(self) -> Type["NeighborStat"]: + def neighbor_stat(self) -> type["NeighborStat"]: """The neighbor statistics of the backend. Returns diff --git a/deepmd/calculator.py b/deepmd/calculator.py index 2d3e7ce831..032fa2bcfa 100644 --- a/deepmd/calculator.py +++ b/deepmd/calculator.py @@ -7,8 +7,6 @@ from typing import ( TYPE_CHECKING, ClassVar, - Dict, - List, Optional, Union, ) @@ -42,7 +40,7 @@ class DP(Calculator): path to the model label : str, optional calculator label, by default "DP" - type_dict : Dict[str, int], optional + type_dict : dict[str, int], optional mapping of element types and their numbers, best left None and the calculator will infer this information from model, by default None neighbor_list : ase.neighborlist.NeighborList, optional @@ -72,7 +70,7 @@ class DP(Calculator): """ name = "DP" - implemented_properties: ClassVar[List[str]] = [ + implemented_properties: ClassVar[list[str]] = [ "energy", "free_energy", "forces", @@ -84,7 +82,7 @@ def __init__( self, model: Union[str, "Path"], label: str = "DP", - type_dict: Optional[Dict[str, int]] = None, + type_dict: Optional[dict[str, int]] = None, neighbor_list=None, **kwargs, ) -> None: @@ -100,8 +98,8 @@ def __init__( def calculate( self, atoms: Optional["Atoms"] = None, - properties: List[str] = ["energy", "forces", "virial"], - system_changes: List[str] = all_changes, + properties: list[str] = ["energy", "forces", "virial"], + system_changes: list[str] = all_changes, ): """Run calculation with deepmd model. 
@@ -109,10 +107,10 @@ def calculate( ---------- atoms : Optional[Atoms], optional atoms object to run the calculation on, by default None - properties : List[str], optional + properties : list[str], optional unused, only for function signature compatibility, by default ["energy", "forces", "stress"] - system_changes : List[str], optional + system_changes : list[str], optional unused, only for function signature compatibility, by default all_changes """ if atoms is not None: diff --git a/deepmd/common.py b/deepmd/common.py index f58634f224..fdfeef0e6d 100644 --- a/deepmd/common.py +++ b/deepmd/common.py @@ -14,9 +14,6 @@ from typing import ( TYPE_CHECKING, Any, - Dict, - List, - Set, TypeVar, Union, get_args, @@ -60,8 +57,8 @@ "linear", ] # get_args is new in py38 -VALID_PRECISION: Set[_PRECISION] = set(get_args(_PRECISION)) -VALID_ACTIVATION: Set[_ACTIVATION] = set(get_args(_ACTIVATION)) +VALID_PRECISION: set[_PRECISION] = set(get_args(_PRECISION)) +VALID_ACTIVATION: set[_ACTIVATION] = set(get_args(_ACTIVATION)) if TYPE_CHECKING: _DICT_VAL = TypeVar("_DICT_VAL") @@ -127,17 +124,17 @@ def make_default_mesh(pbc: bool, mixed_type: bool) -> np.ndarray: def j_deprecated( - jdata: Dict[str, "_DICT_VAL"], key: str, deprecated_key: List[str] = [] + jdata: dict[str, "_DICT_VAL"], key: str, deprecated_key: list[str] = [] ) -> "_DICT_VAL": """Assert that supplied dictionary contains specified key. Parameters ---------- - jdata : Dict[str, _DICT_VAL] + jdata : dict[str, _DICT_VAL] dictionary to check key : str key to check - deprecated_key : List[str], optional + deprecated_key : list[str], optional list of deprecated keys, by default [] Returns ------- @@ -161,7 +158,7 @@ def j_deprecated( return jdata[key] -def j_loader(filename: Union[str, Path]) -> Dict[str, Any]: +def j_loader(filename: Union[str, Path]) -> dict[str, Any]: """Load yaml or json settings file. Parameters ---------- @@ -171,7 +168,7 @@ def j_loader(filename: Union[str, Path]) -> Dict[str, Any]: Returns ------- - Dict[str, Any] + dict[str, Any] loaded dictionary Raises @@ -190,7 +187,7 @@ def j_loader(filename: Union[str, Path]) -> Dict[str, Any]: raise TypeError("config file must be json, or yaml/yml") -def expand_sys_str(root_dir: Union[str, Path]) -> List[str]: +def expand_sys_str(root_dir: Union[str, Path]) -> list[str]: """Recursively iterate over directories taking those that contain `type.raw` file. Parameters ---------- @@ -200,7 +197,7 @@ def expand_sys_str(root_dir: Union[str, Path]) -> List[str]: Returns ------- - List[str] + list[str] list of string pointing to system directories """ root_dir = DPPath(root_dir) diff --git a/deepmd/dpmodel/array_api.py b/deepmd/dpmodel/array_api.py index e4af2ad627..360df78a7b 100644 --- a/deepmd/dpmodel/array_api.py +++ b/deepmd/dpmodel/array_api.py @@ -1,6 +1,8 @@ # SPDX-License-Identifier: LGPL-3.0-or-later """Utilities for the array API.""" +import array_api_compat + def support_array_api(version: str) -> callable: """Mark a function as supporting the specific version of the array API.
@@ -27,3 +29,41 @@ def set_version(func: callable) -> callable: return func return set_version + + +# array api adds take_along_axis in https://github.com/data-apis/array-api/pull/816 +# but it hasn't been released yet +# below is a pure Python implementation of take_along_axis +# https://github.com/data-apis/array-api/issues/177#issuecomment-2093630595 +def xp_swapaxes(a, axis1, axis2): + xp = array_api_compat.array_namespace(a) + axes = list(range(a.ndim)) + axes[axis1], axes[axis2] = axes[axis2], axes[axis1] + a = xp.permute_dims(a, axes) + return a + + +def xp_take_along_axis(arr, indices, axis): + xp = array_api_compat.array_namespace(arr) + arr = xp_swapaxes(arr, axis, -1) + indices = xp_swapaxes(indices, axis, -1) + + m = arr.shape[-1] + n = indices.shape[-1] + + shape = list(arr.shape) + shape.pop(-1) + shape = [*shape, n] + + arr = xp.reshape(arr, (-1,)) + if n != 0: + indices = xp.reshape(indices, (-1, n)) + else: + indices = xp.reshape(indices, (0, 0)) + + offset = (xp.arange(indices.shape[0]) * m)[:, xp.newaxis] + indices = xp.reshape(offset + indices, (-1,)) + + out = xp.take(arr, indices) + out = xp.reshape(out, shape) + return xp_swapaxes(out, axis, -1) diff --git a/deepmd/dpmodel/atomic_model/base_atomic_model.py b/deepmd/dpmodel/atomic_model/base_atomic_model.py index 5ea65a9d73..c29a76b3f1 100644 --- a/deepmd/dpmodel/atomic_model/base_atomic_model.py +++ b/deepmd/dpmodel/atomic_model/base_atomic_model.py @@ -1,10 +1,7 @@ # SPDX-License-Identifier: LGPL-3.0-or-later import copy from typing import ( - Dict, - List, Optional, - Tuple, ) import numpy as np @@ -36,11 +33,11 @@ class BaseAtomicModel(BaseAtomicModel_, NativeOP): def __init__( self, - type_map: List[str], - atom_exclude_types: List[int] = [], - pair_exclude_types: List[Tuple[int, int]] = [], + type_map: list[str], + atom_exclude_types: list[int] = [], + pair_exclude_types: list[tuple[int, int]] = [], rcond: Optional[float] = None, - preset_out_bias: Optional[Dict[str, np.ndarray]] = None, + preset_out_bias: Optional[dict[str, np.ndarray]] = None, ): super().__init__() self.type_map = type_map @@ -52,7 +49,7 @@ def __init__( def init_out_stat(self): """Initialize the output bias.""" ntypes = self.get_ntypes() - self.bias_keys: List[str] = list(self.fitting_output_def().keys()) + self.bias_keys: list[str] = list(self.fitting_output_def().keys()) self.max_out_size = max( [self.atomic_output_def()[kk].size for kk in self.bias_keys] ) @@ -78,13 +75,13 @@ def __getitem__(self, key): else: raise KeyError(key) - def get_type_map(self) -> List[str]: + def get_type_map(self) -> list[str]: """Get the type map.""" return self.type_map def reinit_atom_exclude( self, - exclude_types: List[int] = [], + exclude_types: list[int] = [], ): self.atom_exclude_types = exclude_types if exclude_types == []: @@ -94,7 +91,7 @@ def reinit_atom_exclude( def reinit_pair_exclude( self, - exclude_types: List[Tuple[int, int]] = [], + exclude_types: list[tuple[int, int]] = [], ): self.pair_exclude_types = exclude_types if exclude_types == []: @@ -119,7 +116,7 @@ def atomic_output_def(self) -> FittingOutputDef: ) def change_type_map( - self, type_map: List[str], model_with_new_type_stat=None + self, type_map: list[str], model_with_new_type_stat=None ) -> None: """Change the type related params to new ones, according to `type_map` and the original one in the model. If there are new types in `type_map`, statistics will be updated accordingly to `model_with_new_type_stat` for these new types. 
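Note: the pure-Python `xp_take_along_axis` fallback introduced above can be sanity-checked against NumPy's native routine. A minimal sketch, assuming the helper is importable from `deepmd.dpmodel.array_api` exactly as added in this diff:

import numpy as np

from deepmd.dpmodel.array_api import xp_take_along_axis

rng = np.random.default_rng(0)
arr = rng.standard_normal((2, 5, 4))
# integer indices along axis 1; the remaining axes match arr
idx = rng.integers(0, 5, size=(2, 3, 4))

# the fallback routes through array_api_compat, so plain NumPy inputs work,
# and the result should agree with numpy.take_along_axis
np.testing.assert_allclose(
    xp_take_along_axis(arr, idx, axis=1),
    np.take_along_axis(arr, idx, axis=1),
)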
@@ -143,7 +140,7 @@ def forward_common_atomic( mapping: Optional[np.ndarray] = None, fparam: Optional[np.ndarray] = None, aparam: Optional[np.ndarray] = None, - ) -> Dict[str, np.ndarray]: + ) -> dict[str, np.ndarray]: """Common interface for atomic inference. This method accepts extended coordinates, extended atom types, neighbor list, @@ -217,7 +214,7 @@ def call( mapping: Optional[np.ndarray] = None, fparam: Optional[np.ndarray] = None, aparam: Optional[np.ndarray] = None, - ) -> Dict[str, np.ndarray]: + ) -> dict[str, np.ndarray]: return self.forward_common_atomic( extended_coord, extended_atype, @@ -251,7 +248,7 @@ def deserialize(cls, data: dict) -> "BaseAtomicModel": def apply_out_stat( self, - ret: Dict[str, np.ndarray], + ret: dict[str, np.ndarray], atype: np.ndarray, ): """Apply the stat to each atomic output. @@ -274,7 +271,7 @@ def apply_out_stat( def _varsize( self, - shape: List[int], + shape: list[int], ) -> int: output_size = 1 len_shape = len(shape) @@ -286,7 +283,7 @@ def _get_bias_index( self, kk: str, ) -> int: - res: List[int] = [] + res: list[int] = [] for i, e in enumerate(self.bias_keys): if e == kk: res.append(i) @@ -295,8 +292,8 @@ def _get_bias_index( def _fetch_out_stat( self, - keys: List[str], - ) -> Tuple[Dict[str, np.ndarray], Dict[str, np.ndarray]]: + keys: list[str], + ) -> tuple[dict[str, np.ndarray], dict[str, np.ndarray]]: ret_bias = {} ret_std = {} ntypes = self.get_ntypes() diff --git a/deepmd/dpmodel/atomic_model/dp_atomic_model.py b/deepmd/dpmodel/atomic_model/dp_atomic_model.py index a446bde06f..7e576eb484 100644 --- a/deepmd/dpmodel/atomic_model/dp_atomic_model.py +++ b/deepmd/dpmodel/atomic_model/dp_atomic_model.py @@ -1,8 +1,6 @@ # SPDX-License-Identifier: LGPL-3.0-or-later import copy from typing import ( - Dict, - List, Optional, ) @@ -46,7 +44,7 @@ def __init__( self, descriptor, fitting, - type_map: List[str], + type_map: list[str], **kwargs, ): super().__init__(type_map, **kwargs) @@ -64,7 +62,7 @@ def get_rcut(self) -> float: """Get the cut-off radius.""" return self.descriptor.get_rcut() - def get_sel(self) -> List[int]: + def get_sel(self) -> list[int]: """Get the neighbor selection.""" return self.descriptor.get_sel() @@ -96,7 +94,7 @@ def forward_atomic( mapping: Optional[np.ndarray] = None, fparam: Optional[np.ndarray] = None, aparam: Optional[np.ndarray] = None, - ) -> Dict[str, np.ndarray]: + ) -> dict[str, np.ndarray]: """Models' atomic predictions. Parameters @@ -140,7 +138,7 @@ def forward_atomic( return ret def change_type_map( - self, type_map: List[str], model_with_new_type_stat=None + self, type_map: list[str], model_with_new_type_stat=None ) -> None: """Change the type related params to new ones, according to `type_map` and the original one in the model. If there are new types in `type_map`, statistics will be updated accordingly to `model_with_new_type_stat` for these new types. @@ -192,7 +190,7 @@ def get_dim_aparam(self) -> int: """Get the number (dimension) of atomic parameters of this atomic model.""" return self.fitting.get_dim_aparam() - def get_sel_type(self) -> List[int]: + def get_sel_type(self) -> list[int]: """Get the selected atom types of this model.
Only atoms with selected atom types have atomic contribution diff --git a/deepmd/dpmodel/atomic_model/linear_atomic_model.py b/deepmd/dpmodel/atomic_model/linear_atomic_model.py index d522347f41..79a51635d2 100644 --- a/deepmd/dpmodel/atomic_model/linear_atomic_model.py +++ b/deepmd/dpmodel/atomic_model/linear_atomic_model.py @@ -1,10 +1,7 @@ # SPDX-License-Identifier: LGPL-3.0-or-later import copy from typing import ( - Dict, - List, Optional, - Tuple, Union, ) @@ -48,8 +45,8 @@ class LinearEnergyAtomicModel(BaseAtomicModel): def __init__( self, - models: List[BaseAtomicModel], - type_map: List[str], + models: list[BaseAtomicModel], + type_map: list[str], **kwargs, ): super().__init__(type_map, **kwargs) @@ -104,12 +101,12 @@ def get_rcut(self) -> float: """Get the cut-off radius.""" return max(self.get_model_rcuts()) - def get_type_map(self) -> List[str]: + def get_type_map(self) -> list[str]: """Get the type map.""" return self.type_map def change_type_map( - self, type_map: List[str], model_with_new_type_stat=None + self, type_map: list[str], model_with_new_type_stat=None ) -> None: """Change the type related params to new ones, according to `type_map` and the original one in the model. If there are new types in `type_map`, statistics will be updated accordingly to `model_with_new_type_stat` for these new types. @@ -125,22 +122,22 @@ def change_type_map( else None, ) - def get_model_rcuts(self) -> List[float]: + def get_model_rcuts(self) -> list[float]: """Get the cut-off radius for each individual model.""" return [model.get_rcut() for model in self.models] - def get_sel(self) -> List[int]: + def get_sel(self) -> list[int]: return [max([model.get_nsel() for model in self.models])] - def get_model_nsels(self) -> List[int]: + def get_model_nsels(self) -> list[int]: """Get the processed sels for each individual model. Not distinguishing types.""" return [model.get_nsel() for model in self.models] - def get_model_sels(self) -> List[Union[int, List[int]]]: + def get_model_sels(self) -> list[Union[int, list[int]]]: """Get the sels for each individual model.""" return [model.get_sel() for model in self.models] - def _sort_rcuts_sels(self) -> Tuple[List[float], List[int]]: + def _sort_rcuts_sels(self) -> tuple[list[float], list[int]]: # sort the pair of rcut and sels in ascending order, first based on sel, then on rcut. zipped = sorted( zip(self.get_model_rcuts(), self.get_model_nsels()), @@ -156,7 +153,7 @@ def forward_atomic( mapping: Optional[np.ndarray] = None, fparam: Optional[np.ndarray] = None, aparam: Optional[np.ndarray] = None, - ) -> Dict[str, np.ndarray]: + ) -> dict[str, np.ndarray]: """Return atomic prediction. Parameters @@ -219,16 +216,16 @@ def forward_atomic( return fit_ret @staticmethod - def remap_atype(ori_map: List[str], new_map: List[str]) -> np.ndarray: + def remap_atype(ori_map: list[str], new_map: list[str]) -> np.ndarray: """ This method is used to map the atype from the common type_map to the original type_map of individual AtomicModels. Parameters ---------- - ori_map : List[str] + ori_map : list[str] The original type map of an AtomicModel. - new_map : List[str] + new_map : list[str] The common type map of the DPZBLLinearEnergyAtomicModel, created by the `get_type_map` method, must be a subset of the ori_map.
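Note: the hunk above shows only the signature and docstring of `remap_atype`, not its body. A hypothetical re-implementation of the documented contract (the real one lives outside this hunk) would map each type in the common `new_map` to its index in a model's own `ori_map`:

import numpy as np

def remap_atype_sketch(ori_map: list[str], new_map: list[str]) -> np.ndarray:
    # hypothetical sketch for illustration, not the actual implementation:
    # entry i holds the position of new_map[i] inside ori_map, so an atype
    # array written in new_map indices converts back to a model's own
    # convention by fancy indexing: atype_ori = mapper[atype_new]
    assert set(new_map).issubset(set(ori_map)), "new_map must be a subset of ori_map"
    return np.array([ori_map.index(t) for t in new_map], dtype=np.int64)

mapper = remap_atype_sketch(ori_map=["O", "H", "Zn"], new_map=["H", "O"])
atype_new = np.array([0, 1, 1, 0])  # indices into ["H", "O"]
atype_ori = mapper[atype_new]  # array([1, 0, 0, 1]), indices into ["O", "H", "Zn"]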
@@ -284,8 +281,8 @@ def _compute_weight( self, extended_coord: np.ndarray, extended_atype: np.ndarray, - nlists_: List[np.ndarray], - ) -> List[np.ndarray]: + nlists_: list[np.ndarray], + ) -> list[np.ndarray]: """This should be a list of user defined weights that matches the number of models to be combined.""" nmodels = len(self.models) nframes, nloc, _ = nlists_[0].shape @@ -300,7 +297,7 @@ def get_dim_aparam(self) -> int: """Get the number (dimension) of atomic parameters of this atomic model.""" return max([model.get_dim_aparam() for model in self.models]) - def get_sel_type(self) -> List[int]: + def get_sel_type(self) -> list[int]: """Get the selected atom types of this model. Only atoms with selected atom types have atomic contribution @@ -347,7 +344,7 @@ def __init__( zbl_model: PairTabAtomicModel, sw_rmin: float, sw_rmax: float, - type_map: List[str], + type_map: list[str], smin_alpha: Optional[float] = 0.1, **kwargs, ): @@ -391,13 +388,13 @@ def _compute_weight( self, extended_coord: np.ndarray, extended_atype: np.ndarray, - nlists_: List[np.ndarray], - ) -> List[np.ndarray]: + nlists_: list[np.ndarray], + ) -> list[np.ndarray]: """ZBL weight. Returns ------- - List[np.ndarray] + list[np.ndarray] the atomic ZBL weight for interpolation. (nframes, nloc, 1) """ assert ( diff --git a/deepmd/dpmodel/atomic_model/make_base_atomic_model.py b/deepmd/dpmodel/atomic_model/make_base_atomic_model.py index bf345eaa12..6c0fc88e2c 100644 --- a/deepmd/dpmodel/atomic_model/make_base_atomic_model.py +++ b/deepmd/dpmodel/atomic_model/make_base_atomic_model.py @@ -4,8 +4,6 @@ abstractmethod, ) from typing import ( - Dict, - List, Optional, ) @@ -57,7 +55,7 @@ def get_rcut(self) -> float: pass @abstractmethod - def get_type_map(self) -> List[str]: + def get_type_map(self) -> list[str]: """Get the type map.""" pass @@ -66,7 +64,7 @@ def get_ntypes(self) -> int: return len(self.get_type_map()) @abstractmethod - def get_sel(self) -> List[int]: + def get_sel(self) -> list[int]: """Returns the number of selected atoms for each type.""" pass @@ -87,7 +85,7 @@ def get_dim_aparam(self) -> int: """Get the number (dimension) of atomic parameters of this atomic model.""" @abstractmethod - def get_sel_type(self) -> List[int]: + def get_sel_type(self) -> list[int]: """Get the selected atom types of this model. 
Only atoms with selected atom types have atomic contribution @@ -132,7 +130,7 @@ def fwd( mapping: Optional[t_tensor] = None, fparam: Optional[t_tensor] = None, aparam: Optional[t_tensor] = None, - ) -> Dict[str, t_tensor]: + ) -> dict[str, t_tensor]: pass @abstractmethod @@ -146,7 +144,7 @@ def deserialize(cls, data: dict): @abstractmethod def change_type_map( - self, type_map: List[str], model_with_new_type_stat=None + self, type_map: list[str], model_with_new_type_stat=None ) -> None: pass @@ -182,7 +180,7 @@ def do_grad_r( """ odef = self.fitting_output_def() if var_name is None: - require: List[bool] = [] + require: list[bool] = [] for vv in odef.keys(): require.append(self.do_grad_(vv, "r")) return any(require) @@ -199,7 +197,7 @@ def do_grad_c( """ odef = self.fitting_output_def() if var_name is None: - require: List[bool] = [] + require: list[bool] = [] for vv in odef.keys(): require.append(self.do_grad_(vv, "c")) return any(require) diff --git a/deepmd/dpmodel/atomic_model/pairtab_atomic_model.py b/deepmd/dpmodel/atomic_model/pairtab_atomic_model.py index 4218c24e3e..22471d3f32 100644 --- a/deepmd/dpmodel/atomic_model/pairtab_atomic_model.py +++ b/deepmd/dpmodel/atomic_model/pairtab_atomic_model.py @@ -1,8 +1,6 @@ # SPDX-License-Identifier: LGPL-3.0-or-later import copy from typing import ( - Dict, - List, Optional, Union, ) @@ -57,10 +55,10 @@ def __init__( self, tab_file: str, rcut: float, - sel: Union[int, List[int]], - type_map: List[str], + sel: Union[int, list[int]], + type_map: list[str], rcond: Optional[float] = None, - atom_ener: Optional[List[float]] = None, + atom_ener: Optional[list[float]] = None, **kwargs, ): super().__init__(type_map, **kwargs) @@ -109,10 +107,10 @@ def fitting_output_def(self) -> FittingOutputDef: def get_rcut(self) -> float: return self.rcut - def get_type_map(self) -> List[str]: + def get_type_map(self) -> list[str]: return self.type_map - def get_sel(self) -> List[int]: + def get_sel(self) -> list[int]: return [self.sel] def get_nsel(self) -> int: @@ -140,7 +138,7 @@ def need_sorted_nlist_for_lower(self) -> bool: return False def change_type_map( - self, type_map: List[str], model_with_new_type_stat=None + self, type_map: list[str], model_with_new_type_stat=None ) -> None: """Change the type related params to new ones, according to `type_map` and the original one in the model. If there are new types in `type_map`, statistics will be updated accordingly to `model_with_new_type_stat` for these new types. @@ -190,7 +188,7 @@ def forward_atomic( mapping: Optional[np.ndarray] = None, fparam: Optional[np.ndarray] = None, aparam: Optional[np.ndarray] = None, - ) -> Dict[str, np.ndarray]: + ) -> dict[str, np.ndarray]: nframes, nloc, nnei = nlist.shape extended_coord = extended_coord.reshape(nframes, -1, 3) @@ -394,7 +392,7 @@ def get_dim_aparam(self) -> int: """Get the number (dimension) of atomic parameters of this atomic model.""" return 0 - def get_sel_type(self) -> List[int]: + def get_sel_type(self) -> list[int]: """Get the selected atom types of this model. 
Only atoms with selected atom types have atomic contribution diff --git a/deepmd/dpmodel/descriptor/descriptor.py b/deepmd/dpmodel/descriptor/descriptor.py index e48479cca8..6d0644f856 100644 --- a/deepmd/dpmodel/descriptor/descriptor.py +++ b/deepmd/dpmodel/descriptor/descriptor.py @@ -6,8 +6,6 @@ ) from typing import ( Callable, - Dict, - List, Optional, Union, ) @@ -57,7 +55,7 @@ def get_nsel(self) -> int: pass @abstractmethod - def get_sel(self) -> List[int]: + def get_sel(self) -> list[int]: """Returns the number of selected atoms for each type.""" pass @@ -83,7 +81,7 @@ def get_dim_emb(self) -> int: def compute_input_stats( self, - merged: Union[Callable[[], List[dict]], List[dict]], + merged: Union[Callable[[], list[dict]], list[dict]], path: Optional[DPPath] = None, ): """ @@ -91,11 +89,11 @@ def compute_input_stats( Parameters ---------- - merged : Union[Callable[[], List[dict]], List[dict]] - - List[dict]: A list of data samples from various data systems. + merged : Union[Callable[[], list[dict]], list[dict]] + - list[dict]: A list of data samples from various data systems. Each element, `merged[i]`, is a data dictionary containing `keys`: `torch.Tensor` originating from the `i`-th data system. - - Callable[[], List[dict]]: A lazy function that returns data samples in the above format + - Callable[[], list[dict]]: A lazy function that returns data samples in the above format only when needed. Since the sampling process can be slow and memory-intensive, the lazy function helps by only sampling once. path : Optional[DPPath] @@ -104,7 +102,7 @@ def compute_input_stats( """ raise NotImplementedError - def get_stats(self) -> Dict[str, StatItem]: + def get_stats(self) -> dict[str, StatItem]: """Get the statistics of the descriptor.""" raise NotImplementedError @@ -152,7 +150,7 @@ def extend_descrpt_stat(des, type_map, des_with_stat=None): ---------- des : DescriptorBlock The descriptor block to be extended. - type_map : List[str] + type_map : list[str] The name of each type of atoms to be extended. des_with_stat : DescriptorBlock, Optional The descriptor block has additional statistics of types from newly provided `type_map`. 
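Note: the `compute_input_stats` signatures in this diff accept either a ready list of per-system samples or a zero-argument callable that produces one lazily, so the slow, memory-intensive sampling described in the docstring runs at most once and only when statistics are actually recomputed. A minimal sketch of how a consumer might resolve that union (the helper names are hypothetical):

from typing import Callable, Union

def resolve_merged(
    merged: Union[Callable[[], list[dict]], list[dict]],
) -> list[dict]:
    # hypothetical helper: invoke the lazy sampler only if one was passed;
    # a plain list of per-system sample dicts passes through untouched
    return merged() if callable(merged) else merged

def lazy_sampler() -> list[dict]:
    # stand-in for the expensive sampling of data systems
    return [{"coord": [[0.0, 0.0, 0.0]], "atype": [0]}]

samples = resolve_merged(lazy_sampler)  # sampling happens here, exactly once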
diff --git a/deepmd/dpmodel/descriptor/dpa1.py b/deepmd/dpmodel/descriptor/dpa1.py index 70cb818eef..add9cb9f71 100644 --- a/deepmd/dpmodel/descriptor/dpa1.py +++ b/deepmd/dpmodel/descriptor/dpa1.py @@ -2,12 +2,11 @@ from typing import ( Any, Callable, - List, Optional, - Tuple, Union, ) +import array_api_compat import numpy as np from deepmd.dpmodel import ( @@ -15,6 +14,9 @@ PRECISION_DICT, NativeOP, ) +from deepmd.dpmodel.array_api import ( + xp_take_along_axis, +) from deepmd.dpmodel.utils import ( EmbeddingNet, EnvMat, @@ -34,9 +36,6 @@ from deepmd.dpmodel.utils.update_sel import ( UpdateSel, ) -from deepmd.env import ( - GLOBAL_NP_FLOAT_PRECISION, -) from deepmd.utils.data_system import ( DeepmdDataSystem, ) @@ -61,13 +60,16 @@ def np_softmax(x, axis=-1): - x = np.nan_to_num(x) # to avoid value warning - e_x = np.exp(x - np.max(x, axis=axis, keepdims=True)) - return e_x / np.sum(e_x, axis=axis, keepdims=True) + xp = array_api_compat.array_namespace(x) + # x = xp.nan_to_num(x) # to avoid value warning + x = xp.where(xp.isnan(x), xp.zeros_like(x), x) + e_x = xp.exp(x - xp.max(x, axis=axis, keepdims=True)) + return e_x / xp.sum(e_x, axis=axis, keepdims=True) def np_normalize(x, axis=-1): - return x / np.linalg.norm(x, axis=axis, keepdims=True) + xp = array_api_compat.array_namespace(x) + return x / xp.linalg.vector_norm(x, axis=axis, keepdims=True) @BaseDescriptor.register("se_atten") @@ -171,7 +173,7 @@ class DescrptDPA1(NativeOP, BaseDescriptor): (Only support False to keep consistent with other backend references.) (Not used in this version. True option is not implemented.) If mask the diagonal of attention weights - exclude_types : List[List[int]] + exclude_types : list[list[int]] The excluded pairs of types which have no interaction with each other. For example, `[[0, 1]]` means no interaction between type 0 and type 1. env_protection: float @@ -203,7 +205,7 @@ class DescrptDPA1(NativeOP, BaseDescriptor): Whether to use electronic configuration type embedding. use_tebd_bias : bool, Optional Whether to use bias in the type embedding layer. - type_map: List[str], Optional + type_map: list[str], Optional A list of strings. Give the name to each type of atoms. spin (Only support None to keep consistent with other backend references.) @@ -227,9 +229,9 @@ def __init__( self, rcut: float, rcut_smth: float, - sel: Union[List[int], int], + sel: Union[list[int], int], ntypes: int, - neuron: List[int] = [25, 50, 100], + neuron: list[int] = [25, 50, 100], axis_neuron: int = 8, tebd_dim: int = 8, tebd_input_mode: str = "concat", @@ -240,7 +242,7 @@ def __init__( attn_layer: int = 2, attn_dotr: bool = True, attn_mask: bool = False, - exclude_types: List[Tuple[int, int]] = [], + exclude_types: list[tuple[int, int]] = [], env_protection: float = 0.0, set_davg_zero: bool = False, activation_function: str = "tanh", @@ -256,9 +258,9 @@ def __init__( stripped_type_embedding: Optional[bool] = None, use_econf_tebd: bool = False, use_tebd_bias: bool = False, - type_map: Optional[List[str]] = None, + type_map: Optional[list[str]] = None, # consistent with argcheck, not used though - seed: Optional[Union[int, List[int]]] = None, + seed: Optional[Union[int, list[int]]] = None, ) -> None: ## seed, uniform_seed, not included. # Ensure compatibility with the deprecated stripped_type_embedding option. 
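Note: the rewritten `np_softmax` above replaces `np.nan_to_num` with an `xp.where` over `xp.isnan` so the function stays within the array-API surface while keeping the NaN guard. A quick check, assuming the function is importable from `deepmd.dpmodel.descriptor.dpa1` as in this diff:

import numpy as np

from deepmd.dpmodel.descriptor.dpa1 import np_softmax

x = np.array([[1.0, 2.0, float("nan")], [0.0, 0.0, 0.0]])
p = np_softmax(x)
# NaNs are zeroed before the exp, so each row is still a valid distribution
np.testing.assert_allclose(np.sum(p, axis=-1), np.ones(2))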
@@ -333,7 +335,7 @@ def get_nsel(self) -> int: """Returns the number of selected atoms in the cut-off radius.""" return self.se_atten.get_nsel() - def get_sel(self) -> List[int]: + def get_sel(self) -> list[int]: """Returns the number of selected atoms for each type.""" return self.se_atten.get_sel() @@ -341,7 +343,7 @@ def get_ntypes(self) -> int: """Returns the number of element types.""" return self.se_atten.get_ntypes() - def get_type_map(self) -> List[str]: + def get_type_map(self) -> list[str]: """Get the name to each type of atoms.""" return self.type_map @@ -395,7 +397,7 @@ def dim_out(self): def dim_emb(self): return self.get_dim_emb() - def compute_input_stats(self, merged: List[dict], path: Optional[DPPath] = None): + def compute_input_stats(self, merged: list[dict], path: Optional[DPPath] = None): """Update mean and stddev for descriptor elements.""" raise NotImplementedError @@ -408,12 +410,12 @@ def set_stat_mean_and_stddev( self.se_atten.mean = mean self.se_atten.stddev = stddev - def get_stat_mean_and_stddev(self) -> Tuple[np.ndarray, np.ndarray]: + def get_stat_mean_and_stddev(self) -> tuple[np.ndarray, np.ndarray]: """Get mean and stddev for descriptor.""" return self.se_atten.mean, self.se_atten.stddev def change_type_map( - self, type_map: List[str], model_with_new_type_stat=None + self, type_map: list[str], model_with_new_type_stat=None ) -> None: """Change the type related params to new ones, according to `type_map` and the original one in the model. If there are new types in `type_map`, statistics will be updated accordingly to `model_with_new_type_stat` for these new types. @@ -476,10 +478,14 @@ def call( The smooth switch function. """ del mapping + xp = array_api_compat.array_namespace(coord_ext, atype_ext, nlist) nf, nloc, nnei = nlist.shape - nall = coord_ext.reshape(nf, -1).shape[1] // 3 + nall = xp.reshape(coord_ext, (nf, -1)).shape[1] // 3 # nf x nall x tebd_dim - atype_embd_ext = self.type_embedding.call()[atype_ext] + atype_embd_ext = xp.reshape( + xp.take(self.type_embedding.call(), xp.reshape(atype_ext, [-1]), axis=0), + (nf, nall, self.tebd_dim), + ) # nfnl x tebd_dim atype_embd = atype_embd_ext[:, :nloc, :] grrg, g2, h2, rot_mat, sw = self.se_atten( @@ -491,8 +497,8 @@ def call( ) # nf x nloc x (ng x ng1 + tebd_dim) if self.concat_output_tebd: - grrg = np.concatenate( - [grrg, atype_embd.reshape(nf, nloc, self.tebd_dim)], axis=-1 + grrg = xp.concat( + [grrg, xp.reshape(atype_embd, (nf, nloc, self.tebd_dim))], axis=-1 ) return grrg, rot_mat, None, None, sw @@ -538,8 +544,8 @@ def serialize(self) -> dict: "exclude_types": obj.exclude_types, "env_protection": obj.env_protection, "@variables": { - "davg": obj["davg"], - "dstd": obj["dstd"], + "davg": np.array(obj["davg"]), + "dstd": np.array(obj["dstd"]), }, ## to be updated when the options are supported. "trainable": self.trainable, @@ -588,9 +594,9 @@ def deserialize(cls, data: dict) -> "DescrptDPA1": def update_sel( cls, train_data: DeepmdDataSystem, - type_map: Optional[List[str]], + type_map: Optional[list[str]], local_jdata: dict, - ) -> Tuple[dict, Optional[float]]: + ) -> tuple[dict, Optional[float]]: """Update the selection and perform neighbor statistics. 
Parameters @@ -623,9 +629,9 @@ def __init__( self, rcut: float, rcut_smth: float, - sel: Union[List[int], int], + sel: Union[list[int], int], ntypes: int, - neuron: List[int] = [25, 50, 100], + neuron: list[int] = [25, 50, 100], axis_neuron: int = 8, tebd_dim: int = 8, tebd_input_mode: str = "concat", @@ -635,7 +641,7 @@ def __init__( attn_layer: int = 2, attn_dotr: bool = True, attn_mask: bool = False, - exclude_types: List[Tuple[int, int]] = [], + exclude_types: list[tuple[int, int]] = [], env_protection: float = 0.0, set_davg_zero: bool = False, activation_function: str = "tanh", @@ -646,7 +652,7 @@ def __init__( trainable_ln: bool = True, ln_eps: Optional[float] = 1e-5, smooth: bool = True, - seed: Optional[Union[int, List[int]]] = None, + seed: Optional[Union[int, list[int]]] = None, ) -> None: self.rcut = rcut self.rcut_smth = rcut_smth @@ -685,12 +691,12 @@ def __init__( self.embd_input_dim = 1 + self.tebd_dim_input else: self.embd_input_dim = 1 - self.embeddings = NetworkCollection( + embeddings = NetworkCollection( ndim=0, ntypes=self.ntypes, network_type="embedding_network", ) - self.embeddings[0] = EmbeddingNet( + embeddings[0] = EmbeddingNet( self.embd_input_dim, self.neuron, self.activation_function, @@ -698,13 +704,14 @@ def __init__( self.precision, seed=child_seed(seed, 0), ) + self.embeddings = embeddings if self.tebd_input_mode in ["strip"]: - self.embeddings_strip = NetworkCollection( + embeddings_strip = NetworkCollection( ndim=0, ntypes=self.ntypes, network_type="embedding_network", ) - self.embeddings_strip[0] = EmbeddingNet( + embeddings_strip[0] = EmbeddingNet( self.tebd_dim_input, self.neuron, self.activation_function, @@ -712,6 +719,7 @@ def __init__( self.precision, seed=child_seed(seed, 1), ) + self.embeddings_strip = embeddings_strip else: self.embeddings_strip = None self.dpa1_attention = NeighborGatedAttention( @@ -748,7 +756,7 @@ def get_nsel(self) -> int: """Returns the number of selected atoms in the cut-off radius.""" return sum(self.sel) - def get_sel(self) -> List[int]: + def get_sel(self) -> list[int]: """Returns the number of selected atoms for each type.""" return self.sel @@ -817,7 +825,7 @@ def dim_emb(self): def compute_input_stats( self, - merged: Union[Callable[[], List[dict]], List[dict]], + merged: Union[Callable[[], list[dict]], list[dict]], path: Optional[DPPath] = None, ): """Compute the input statistics (e.g. 
mean and stddev) for the descriptors from packed data.""" @@ -829,7 +837,7 @@ def get_stats(self): def reinit_exclude( self, - exclude_types: List[Tuple[int, int]] = [], + exclude_types: list[tuple[int, int]] = [], ): self.exclude_types = exclude_types self.emask = PairExcludeMask(self.ntypes, exclude_types=exclude_types) @@ -839,9 +847,10 @@ def cal_g( ss, embedding_idx, ): + xp = array_api_compat.array_namespace(ss) nfnl, nnei = ss.shape[0:2] - shape2 = np.prod(ss.shape[2:]) - ss = ss.reshape(nfnl, nnei, shape2) + shape2 = xp.prod(xp.asarray(ss.shape[2:])) + ss = xp.reshape(ss, (nfnl, nnei, shape2)) # nfnl x nnei x ng gg = self.embeddings[embedding_idx].call(ss) return gg @@ -852,9 +861,10 @@ def cal_g_strip( embedding_idx, ): assert self.embeddings_strip is not None + xp = array_api_compat.array_namespace(ss) nfnl, nnei = ss.shape[0:2] - shape2 = np.prod(ss.shape[2:]) - ss = ss.reshape(nfnl, nnei, shape2) + shape2 = xp.prod(xp.asarray(ss.shape[2:])) + ss = xp.reshape(ss, (nfnl, nnei, shape2)) # nfnl x nnei x ng gg = self.embeddings_strip[embedding_idx].call(ss) return gg @@ -867,6 +877,7 @@ def call( atype_embd_ext: Optional[np.ndarray] = None, mapping: Optional[np.ndarray] = None, ): + xp = array_api_compat.array_namespace(nlist, coord_ext, atype_ext) # nf x nloc x nnei x 4 dmatrix, diff, sw = self.env_mat.call( coord_ext, atype_ext, nlist, self.mean, self.stddev @@ -874,41 +885,42 @@ def call( nf, nloc, nnei, _ = dmatrix.shape exclude_mask = self.emask.build_type_exclude_mask(nlist, atype_ext) # nfnl x nnei - exclude_mask = exclude_mask.reshape(nf * nloc, nnei) + exclude_mask = xp.reshape(exclude_mask, (nf * nloc, nnei)) # nfnl x nnei - nlist = nlist.reshape(nf * nloc, nnei) - nlist = np.where(exclude_mask, nlist, -1) + nlist = xp.reshape(nlist, (nf * nloc, nnei)) + nlist = xp.where(exclude_mask, nlist, xp.full_like(nlist, -1)) # nfnl x nnei x 4 - dmatrix = dmatrix.reshape(nf * nloc, nnei, 4) + dmatrix = xp.reshape(dmatrix, (nf * nloc, nnei, 4)) # nfnl x nnei x 1 - sw = sw.reshape(nf * nloc, nnei, 1) + sw = xp.reshape(sw, (nf * nloc, nnei, 1)) # nfnl x tebd_dim - atype_embd = atype_embd_ext[:, :nloc, :].reshape(nf * nloc, self.tebd_dim) + atype_embd = xp.reshape(atype_embd_ext[:, :nloc, :], (nf * nloc, self.tebd_dim)) # nfnl x nnei x tebd_dim - atype_embd_nnei = np.tile(atype_embd[:, np.newaxis, :], (1, nnei, 1)) + atype_embd_nnei = xp.tile(atype_embd[:, xp.newaxis, :], (1, nnei, 1)) # nfnl x nnei nlist_mask = nlist != -1 # nfnl x nnei x 1 - sw = np.where(nlist_mask[:, :, None], sw, 0.0) - nlist_masked = np.where(nlist_mask, nlist, 0) - index = np.tile(nlist_masked.reshape(nf, -1, 1), (1, 1, self.tebd_dim)) + sw = xp.where(nlist_mask[:, :, None], sw, xp.full_like(sw, 0.0)) + nlist_masked = xp.where(nlist_mask, nlist, xp.zeros_like(nlist)) + index = xp.tile(xp.reshape(nlist_masked, (nf, -1, 1)), (1, 1, self.tebd_dim)) # nfnl x nnei x tebd_dim - atype_embd_nlist = np.take_along_axis(atype_embd_ext, index, axis=1).reshape( - nf * nloc, nnei, self.tebd_dim + atype_embd_nlist = xp_take_along_axis(atype_embd_ext, index, axis=1) + atype_embd_nlist = xp.reshape( + atype_embd_nlist, (nf * nloc, nnei, self.tebd_dim) ) ng = self.neuron[-1] # nfnl x nnei x 4 - rr = dmatrix.reshape(nf * nloc, nnei, 4) - rr = rr * exclude_mask[:, :, None] + rr = xp.reshape(dmatrix, (nf * nloc, nnei, 4)) + rr = rr * xp.astype(exclude_mask[:, :, None], rr.dtype) # nfnl x nnei x 1 ss = rr[..., 0:1] if self.tebd_input_mode in ["concat"]: if not self.type_one_side: # nfnl x nnei x (1 + 2 * tebd_dim) - ss = 
np.concatenate([ss, atype_embd_nlist, atype_embd_nnei], axis=-1) + ss = xp.concat([ss, atype_embd_nlist, atype_embd_nnei], axis=-1) else: # nfnl x nnei x (1 + tebd_dim) - ss = np.concatenate([ss, atype_embd_nlist], axis=-1) + ss = xp.concat([ss, atype_embd_nlist], axis=-1) # calculate gg # nfnl x nnei x ng gg = self.cal_g(ss, 0) @@ -918,42 +930,47 @@ def call( assert self.embeddings_strip is not None if not self.type_one_side: # nfnl x nnei x (tebd_dim * 2) - tt = np.concatenate([atype_embd_nlist, atype_embd_nnei], axis=-1) + tt = xp.concat([atype_embd_nlist, atype_embd_nnei], axis=-1) else: # nfnl x nnei x tebd_dim tt = atype_embd_nlist # nfnl x nnei x ng gg_t = self.cal_g_strip(tt, 0) if self.smooth: - gg_t = gg_t * sw.reshape(-1, self.nnei, 1) + gg_t = gg_t * xp.reshape(sw, (-1, self.nnei, 1)) # nfnl x nnei x ng gg = gg_s * gg_t + gg_s else: raise NotImplementedError - input_r = rr.reshape(-1, nnei, 4)[:, :, 1:4] / np.maximum( - np.linalg.norm(rr.reshape(-1, nnei, 4)[:, :, 1:4], axis=-1, keepdims=True), - 1e-12, + normed = xp.linalg.vector_norm( + xp.reshape(rr, (-1, nnei, 4))[:, :, 1:4], axis=-1, keepdims=True + ) + input_r = xp.reshape(rr, (-1, nnei, 4))[:, :, 1:4] / xp.maximum( + normed, + xp.full_like(normed, 1e-12), ) gg = self.dpa1_attention( gg, nlist_mask, input_r=input_r, sw=sw ) # shape is [nframes*nloc, self.neei, out_size] # nfnl x ng x 4 - gr = np.einsum("lni,lnj->lij", gg, rr) + # gr = xp.einsum("lni,lnj->lij", gg, rr) + gr = xp.sum(gg[:, :, :, None] * rr[:, :, None, :], axis=1) gr /= self.nnei gr1 = gr[:, : self.axis_neuron, :] # nfnl x ng x ng1 - grrg = np.einsum("lid,ljd->lij", gr, gr1) + # grrg = xp.einsum("lid,ljd->lij", gr, gr1) + grrg = xp.sum(gr[:, :, None, :] * gr1[:, None, :, :], axis=3) # nf x nloc x (ng x ng1) - grrg = grrg.reshape(nf, nloc, ng * self.axis_neuron).astype( - GLOBAL_NP_FLOAT_PRECISION + grrg = xp.astype( + xp.reshape(grrg, (nf, nloc, ng * self.axis_neuron)), coord_ext.dtype ) return ( - grrg.reshape(nf, nloc, self.filter_neuron[-1] * self.axis_neuron), - gg.reshape(nf, nloc, self.nnei, self.filter_neuron[-1]), - dmatrix.reshape(nf, nloc, self.nnei, 4)[..., 1:], - gr[..., 1:].reshape(nf, nloc, self.filter_neuron[-1], 3), - sw, + xp.reshape(grrg, (nf, nloc, self.filter_neuron[-1] * self.axis_neuron)), + xp.reshape(gg, (nf, nloc, self.nnei, self.filter_neuron[-1])), + xp.reshape(dmatrix, (nf, nloc, self.nnei, 4))[..., 1:], + xp.reshape(gr[..., 1:], (nf, nloc, self.filter_neuron[-1], 3)), + xp.reshape(sw, (nf, nloc, nnei, 1)), ) def has_message_passing(self) -> bool: @@ -964,6 +981,77 @@ def need_sorted_nlist_for_lower(self) -> bool: """Returns whether the descriptor block needs sorted nlist when using `forward_lower`.""" return False + def serialize(self) -> dict: + """Serialize the descriptor to dict.""" + obj = self + data = { + "@class": "DescriptorBlock", + "type": "dpa1", + "@version": 1, + "rcut": obj.rcut, + "rcut_smth": obj.rcut_smth, + "sel": obj.sel, + "ntypes": obj.ntypes, + "neuron": obj.neuron, + "axis_neuron": obj.axis_neuron, + "tebd_dim": obj.tebd_dim, + "tebd_input_mode": obj.tebd_input_mode, + "set_davg_zero": obj.set_davg_zero, + "attn": obj.attn, + "attn_layer": obj.attn_layer, + "attn_dotr": obj.attn_dotr, + "attn_mask": obj.attn_mask, + "activation_function": obj.activation_function, + "resnet_dt": obj.resnet_dt, + "scaling_factor": obj.scaling_factor, + "normalize": obj.normalize, + "temperature": obj.temperature, + "trainable_ln": obj.trainable_ln, + "ln_eps": obj.ln_eps, + "smooth": obj.smooth, + "type_one_side": 
obj.type_one_side, + # make deterministic + "precision": np.dtype(PRECISION_DICT[obj.precision]).name, + "embeddings": obj.embeddings.serialize(), + "attention_layers": obj.dpa1_attention.serialize(), + "env_mat": obj.env_mat.serialize(), + "exclude_types": obj.exclude_types, + "env_protection": obj.env_protection, + "@variables": { + "davg": np.array(obj["davg"]), + "dstd": np.array(obj["dstd"]), + }, + } + if obj.tebd_input_mode in ["strip"]: + data.update({"embeddings_strip": obj.embeddings_strip.serialize()}) + return data + + @classmethod + def deserialize(cls, data: dict) -> "DescrptDPA1": + """Deserialize from dict.""" + data = data.copy() + check_version_compatibility(data.pop("@version"), 1, 1) + data.pop("@class") + data.pop("type") + variables = data.pop("@variables") + embeddings = data.pop("embeddings") + attention_layers = data.pop("attention_layers") + env_mat = data.pop("env_mat") + tebd_input_mode = data["tebd_input_mode"] + if tebd_input_mode in ["strip"]: + embeddings_strip = data.pop("embeddings_strip") + else: + embeddings_strip = None + obj = cls(**data) + + obj["davg"] = variables["davg"] + obj["dstd"] = variables["dstd"] + obj.embeddings = NetworkCollection.deserialize(embeddings) + if tebd_input_mode in ["strip"]: + obj.embeddings_strip = NetworkCollection.deserialize(embeddings_strip) + obj.dpa1_attention = NeighborGatedAttention.deserialize(attention_layers) + return obj + class NeighborGatedAttention(NativeOP): def __init__( @@ -981,7 +1069,7 @@ def __init__( ln_eps: float = 1e-5, smooth: bool = True, precision: str = DEFAULT_PRECISION, - seed: Optional[Union[int, List[int]]] = None, + seed: Optional[Union[int, list[int]]] = None, ): """Construct a neighbor-wise attention net.""" super().__init__() @@ -1109,7 +1197,7 @@ def __init__( ln_eps: float = 1e-5, smooth: bool = True, precision: str = DEFAULT_PRECISION, - seed: Optional[Union[int, List[int]]] = None, + seed: Optional[Union[int, list[int]]] = None, ): """Construct a neighbor-wise attention layer.""" super().__init__() @@ -1215,7 +1303,7 @@ def __init__( bias: bool = True, smooth: bool = True, precision: str = DEFAULT_PRECISION, - seed: Optional[Union[int, List[int]]] = None, + seed: Optional[Union[int, list[int]]] = None, ): """Construct a multi-head neighbor-wise attention net.""" super().__init__() @@ -1256,18 +1344,23 @@ def __init__( ) def call(self, query, nei_mask, input_r=None, sw=None, attnw_shift=20.0): + xp = array_api_compat.array_namespace(query, nei_mask) # Linear projection - q, k, v = np.split(self.in_proj(query), 3, axis=-1) + # q, k, v = xp.split(self.in_proj(query), 3, axis=-1) + _query = self.in_proj(query) + q = _query[..., 0 : self.head_dim] + k = _query[..., self.head_dim : self.head_dim * 2] + v = _query[..., self.head_dim * 2 : self.head_dim * 3] # Reshape and normalize # (nf x nloc) x num_heads x nnei x head_dim - q = q.reshape(-1, self.nnei, self.num_heads, self.head_dim).transpose( - 0, 2, 1, 3 + q = xp.permute_dims( + xp.reshape(q, (-1, self.nnei, self.num_heads, self.head_dim)), (0, 2, 1, 3) ) - k = k.reshape(-1, self.nnei, self.num_heads, self.head_dim).transpose( - 0, 2, 1, 3 + k = xp.permute_dims( + xp.reshape(k, (-1, self.nnei, self.num_heads, self.head_dim)), (0, 2, 1, 3) ) - v = v.reshape(-1, self.nnei, self.num_heads, self.head_dim).transpose( - 0, 2, 1, 3 + v = xp.permute_dims( + xp.reshape(v, (-1, self.nnei, self.num_heads, self.head_dim)), (0, 2, 1, 3) ) if self.normalize: q = np_normalize(q, axis=-1) @@ -1276,29 +1369,38 @@ def call(self, query, nei_mask, 
input_r=None, sw=None, attnw_shift=20.0): q = q * self.scaling # Attention weights # (nf x nloc) x num_heads x nnei x nnei - attn_weights = q @ k.transpose(0, 1, 3, 2) - nei_mask = nei_mask.reshape(-1, self.nnei) + attn_weights = q @ xp.permute_dims(k, (0, 1, 3, 2)) + nei_mask = xp.reshape(nei_mask, (-1, self.nnei)) if self.smooth: - sw = sw.reshape(-1, 1, self.nnei) + sw = xp.reshape(sw, (-1, 1, self.nnei)) attn_weights = (attn_weights + attnw_shift) * sw[:, :, :, None] * sw[ :, :, None, : ] - attnw_shift else: - attn_weights = np.where(nei_mask[:, None, None, :], attn_weights, -np.inf) + attn_weights = xp.where( + nei_mask[:, None, None, :], + attn_weights, + xp.full_like(attn_weights, -xp.inf), + ) attn_weights = np_softmax(attn_weights, axis=-1) - attn_weights = np.where(nei_mask[:, None, :, None], attn_weights, 0.0) + attn_weights = xp.where( + nei_mask[:, None, :, None], attn_weights, xp.zeros_like(attn_weights) + ) if self.smooth: attn_weights = attn_weights * sw[:, :, :, None] * sw[:, :, None, :] if self.dotr: - angular_weight = (input_r @ input_r.transpose(0, 2, 1)).reshape( - -1, 1, self.nnei, self.nnei + angular_weight = xp.reshape( + input_r @ xp.permute_dims(input_r, (0, 2, 1)), + (-1, 1, self.nnei, self.nnei), ) attn_weights = attn_weights * angular_weight # Output projection # (nf x nloc) x num_heads x nnei x head_dim o = attn_weights @ v # (nf x nloc) x nnei x (num_heads x head_dim) - o = o.transpose(0, 2, 1, 3).reshape(-1, self.nnei, self.hidden_dim) + o = xp.reshape( + xp.permute_dims(o, (0, 2, 1, 3)), (-1, self.nnei, self.hidden_dim) + ) output = self.out_proj(o) return output, attn_weights diff --git a/deepmd/dpmodel/descriptor/dpa2.py b/deepmd/dpmodel/descriptor/dpa2.py index 43c57f443f..285dc724a7 100644 --- a/deepmd/dpmodel/descriptor/dpa2.py +++ b/deepmd/dpmodel/descriptor/dpa2.py @@ -1,8 +1,6 @@ # SPDX-License-Identifier: LGPL-3.0-or-later from typing import ( - List, Optional, - Tuple, Union, ) @@ -70,7 +68,7 @@ def __init__( rcut: float, rcut_smth: float, nsel: int, - neuron: List[int] = [25, 50, 100], + neuron: list[int] = [25, 50, 100], axis_neuron: int = 16, tebd_dim: int = 8, tebd_input_mode: str = "concat", @@ -79,7 +77,7 @@ def __init__( resnet_dt: bool = False, type_one_side: bool = False, use_three_body: bool = False, - three_body_neuron: List[int] = [2, 4, 8], + three_body_neuron: list[int] = [2, 4, 8], three_body_sel: int = 40, three_body_rcut: float = 4.0, three_body_rcut_smth: float = 0.5, @@ -371,14 +369,14 @@ def __init__( concat_output_tebd: bool = True, precision: str = "float64", smooth: bool = True, - exclude_types: List[Tuple[int, int]] = [], + exclude_types: list[tuple[int, int]] = [], env_protection: float = 0.0, trainable: bool = True, - seed: Optional[Union[int, List[int]]] = None, + seed: Optional[Union[int, list[int]]] = None, add_tebd_to_repinit_out: bool = False, use_econf_tebd: bool = False, use_tebd_bias: bool = False, - type_map: Optional[List[str]] = None, + type_map: Optional[list[str]] = None, ): r"""The DPA-2 descriptor. see https://arxiv.org/abs/2312.15492. @@ -394,7 +392,7 @@ def __init__( The precision of the embedding net parameters. smooth : bool, optional Whether to use smoothness in processes such as attention weights calculation. - exclude_types : List[List[int]], optional + exclude_types : list[list[int]], optional The excluded pairs of types which have no interaction with each other. For example, `[[0, 1]]` means no interaction between type 0 and type 1. 
env_protection : float, optional @@ -410,7 +408,7 @@ def __init__( Whether to use electronic configuration type embedding. use_tebd_bias : bool, Optional Whether to use bias in the type embedding layer. - type_map : List[str], Optional + type_map : list[str], Optional A list of strings. Give the name to each type of atoms. Returns @@ -602,7 +600,7 @@ def get_nsel(self) -> int: """Returns the number of selected atoms in the cut-off radius.""" return sum(self.sel) - def get_sel(self) -> List[int]: + def get_sel(self) -> list[int]: """Returns the number of selected atoms for each type.""" return self.sel @@ -610,7 +608,7 @@ def get_ntypes(self) -> int: """Returns the number of element types.""" return self.ntypes - def get_type_map(self) -> List[str]: + def get_type_map(self) -> list[str]: """Get the name to each type of atoms.""" return self.type_map @@ -660,7 +658,7 @@ def share_params(self, base_class, shared_level, resume=False): raise NotImplementedError def change_type_map( - self, type_map: List[str], model_with_new_type_stat=None + self, type_map: list[str], model_with_new_type_stat=None ) -> None: """Change the type related params to new ones, according to `type_map` and the original one in the model. If there are new types in `type_map`, statistics will be updated accordingly to `model_with_new_type_stat` for these new types. @@ -723,14 +721,14 @@ def dim_emb(self): """Returns the embedding dimension g2.""" return self.get_dim_emb() - def compute_input_stats(self, merged: List[dict], path: Optional[DPPath] = None): + def compute_input_stats(self, merged: list[dict], path: Optional[DPPath] = None): """Update mean and stddev for descriptor elements.""" raise NotImplementedError def set_stat_mean_and_stddev( self, - mean: List[np.ndarray], - stddev: List[np.ndarray], + mean: list[np.ndarray], + stddev: list[np.ndarray], ) -> None: """Update mean and stddev for descriptor.""" descrpt_list = [self.repinit, self.repformers] @@ -740,7 +738,7 @@ def set_stat_mean_and_stddev( descrpt.mean = mean[ii] descrpt.stddev = stddev[ii] - def get_stat_mean_and_stddev(self) -> Tuple[List[np.ndarray], List[np.ndarray]]: + def get_stat_mean_and_stddev(self) -> tuple[list[np.ndarray], list[np.ndarray]]: """Get mean and stddev for descriptor.""" mean_list = [self.repinit.mean, self.repformers.mean] stddev_list = [ @@ -1015,9 +1013,9 @@ def deserialize(cls, data: dict) -> "DescrptDPA2": def update_sel( cls, train_data: DeepmdDataSystem, - type_map: Optional[List[str]], + type_map: Optional[list[str]], local_jdata: dict, - ) -> Tuple[dict, Optional[float]]: + ) -> tuple[dict, Optional[float]]: """Update the selection and perform neighbor statistics. Parameters diff --git a/deepmd/dpmodel/descriptor/hybrid.py b/deepmd/dpmodel/descriptor/hybrid.py index 4cd4e230ae..3aa8882db1 100644 --- a/deepmd/dpmodel/descriptor/hybrid.py +++ b/deepmd/dpmodel/descriptor/hybrid.py @@ -2,10 +2,7 @@ import math from typing import ( Any, - Dict, - List, Optional, - Tuple, Union, ) @@ -37,14 +34,14 @@ class DescrptHybrid(BaseDescriptor, NativeOP): Parameters ---------- - list : list : List[Union[BaseDescriptor, Dict[str, Any]]] + list : list : list[Union[BaseDescriptor, dict[str, Any]]] Build a descriptor from the concatenation of the list of descriptors. The descriptor can be either an object or a dictionary. 
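[editor's note] These hunks, like the analogous ones throughout the rest of the diff, swap `typing.List`/`Tuple`/`Dict` for the builtin generics of PEP 585, which are usable at runtime from Python 3.9. A small before/after sketch:

```python
# PEP 585 (Python >= 3.9): subscript the builtins directly instead of the
# deprecated typing.List / typing.Tuple / typing.Dict aliases.
from typing import Optional, Union


# before: def get_sel(self) -> List[int]
def get_sel(sel: list[int]) -> list[int]:
    return sel


# before: -> Tuple[List[np.ndarray], List[np.ndarray]]
def get_stats() -> tuple[list[float], list[float]]:
    return [0.0], [1.0]


# the seed annotation this diff rewrites in many signatures
Seed = Optional[Union[int, list[int]]]
```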
""" def __init__( self, - list: List[Union[BaseDescriptor, Dict[str, Any]]], + list: list[Union[BaseDescriptor, dict[str, Any]]], ) -> None: super().__init__() # warning: list is conflict with built-in list @@ -69,7 +66,7 @@ def __init__( ), f"number of atom types in {ii}th descrptor {self.descrpt_list[0].__class__.__name__} does not match others" # if hybrid sel is larger than sub sel, the nlist needs to be cut for each type hybrid_sel = self.get_sel() - self.nlist_cut_idx: List[np.ndarray] = [] + self.nlist_cut_idx: list[np.ndarray] = [] if self.mixed_types() and not all( descrpt.mixed_types() for descrpt in self.descrpt_list ): @@ -107,7 +104,7 @@ def get_rcut_smth(self) -> float: # Note: Using the minimum rcut_smth might not be appropriate in all scenarios. Consider using a different approach or provide detailed documentation on why the minimum value is chosen. return np.min([descrpt.get_rcut_smth() for descrpt in self.descrpt_list]).item() - def get_sel(self) -> List[int]: + def get_sel(self) -> list[int]: """Returns the number of selected atoms for each type.""" if self.mixed_types(): return [ @@ -124,7 +121,7 @@ def get_ntypes(self) -> int: """Returns the number of element types.""" return self.descrpt_list[0].get_ntypes() - def get_type_map(self) -> List[str]: + def get_type_map(self) -> list[str]: """Get the name to each type of atoms.""" return self.descrpt_list[0].get_type_map() @@ -169,7 +166,7 @@ def share_params(self, base_class, shared_level, resume=False): raise NotImplementedError def change_type_map( - self, type_map: List[str], model_with_new_type_stat=None + self, type_map: list[str], model_with_new_type_stat=None ) -> None: """Change the type related params to new ones, according to `type_map` and the original one in the model. If there are new types in `type_map`, statistics will be updated accordingly to `model_with_new_type_stat` for these new types. @@ -182,15 +179,15 @@ def change_type_map( else None, ) - def compute_input_stats(self, merged: List[dict], path: Optional[DPPath] = None): + def compute_input_stats(self, merged: list[dict], path: Optional[DPPath] = None): """Update mean and stddev for descriptor elements.""" for descrpt in self.descrpt_list: descrpt.compute_input_stats(merged, path) def set_stat_mean_and_stddev( self, - mean: List[Union[np.ndarray, List[np.ndarray]]], - stddev: List[Union[np.ndarray, List[np.ndarray]]], + mean: list[Union[np.ndarray, list[np.ndarray]]], + stddev: list[Union[np.ndarray, list[np.ndarray]]], ) -> None: """Update mean and stddev for descriptor.""" for ii, descrpt in enumerate(self.descrpt_list): @@ -198,9 +195,9 @@ def set_stat_mean_and_stddev( def get_stat_mean_and_stddev( self, - ) -> Tuple[ - List[Union[np.ndarray, List[np.ndarray]]], - List[Union[np.ndarray, List[np.ndarray]]], + ) -> tuple[ + list[Union[np.ndarray, list[np.ndarray]]], + list[Union[np.ndarray, list[np.ndarray]]], ]: """Get mean and stddev for descriptor.""" mean_list = [] @@ -279,9 +276,9 @@ def call( def update_sel( cls, train_data: DeepmdDataSystem, - type_map: Optional[List[str]], + type_map: Optional[list[str]], local_jdata: dict, - ) -> Tuple[dict, Optional[float]]: + ) -> tuple[dict, Optional[float]]: """Update the selection and perform neighbor statistics. 
Parameters diff --git a/deepmd/dpmodel/descriptor/make_base_descriptor.py b/deepmd/dpmodel/descriptor/make_base_descriptor.py index 6ce54c6f12..a9b434d5f5 100644 --- a/deepmd/dpmodel/descriptor/make_base_descriptor.py +++ b/deepmd/dpmodel/descriptor/make_base_descriptor.py @@ -5,9 +5,7 @@ ) from typing import ( Callable, - List, Optional, - Tuple, Union, ) @@ -61,7 +59,7 @@ def get_rcut_smth(self) -> float: pass @abstractmethod - def get_sel(self) -> List[int]: + def get_sel(self) -> list[int]: """Returns the number of selected neighboring atoms for each type.""" pass @@ -79,7 +77,7 @@ def get_ntypes(self) -> int: pass @abstractmethod - def get_type_map(self) -> List[str]: + def get_type_map(self) -> list[str]: """Get the name to each type of atoms.""" pass @@ -124,7 +122,7 @@ def share_params(self, base_class, shared_level, resume=False): @abstractmethod def change_type_map( - self, type_map: List[str], model_with_new_type_stat=None + self, type_map: list[str], model_with_new_type_stat=None ) -> None: """Change the type related params to new ones, according to `type_map` and the original one in the model. If there are new types in `type_map`, statistics will be updated accordingly to `model_with_new_type_stat` for these new types. @@ -143,7 +141,7 @@ def get_stat_mean_and_stddev(self): def compute_input_stats( self, - merged: Union[Callable[[], List[dict]], List[dict]], + merged: Union[Callable[[], list[dict]], list[dict]], path: Optional[DPPath] = None, ): """Update mean and stddev for descriptor elements.""" @@ -188,9 +186,9 @@ def deserialize(cls, data: dict) -> "BD": def update_sel( cls, train_data: DeepmdDataSystem, - type_map: Optional[List[str]], + type_map: Optional[list[str]], local_jdata: dict, - ) -> Tuple[dict, Optional[float]]: + ) -> tuple[dict, Optional[float]]: """Update the selection and perform neighbor statistics. Parameters diff --git a/deepmd/dpmodel/descriptor/repformers.py b/deepmd/dpmodel/descriptor/repformers.py index 7254f0bc3d..ec8be21a53 100644 --- a/deepmd/dpmodel/descriptor/repformers.py +++ b/deepmd/dpmodel/descriptor/repformers.py @@ -1,9 +1,7 @@ # SPDX-License-Identifier: LGPL-3.0-or-later from typing import ( Callable, - List, Optional, - Tuple, Union, ) @@ -110,7 +108,7 @@ class DescrptBlockRepformers(NativeOP, DescriptorBlock): The precision of the embedding net parameters. smooth : bool, optional Whether to use smoothness in processes such as attention weights calculation. - exclude_types : List[List[int]], optional + exclude_types : list[list[int]], optional The excluded pairs of types which have no interaction with each other. For example, `[[0, 1]]` means no interaction between type 0 and type 1. 
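[editor's note] `make_base_descriptor.py`, touched above, pins the descriptor interface down with abstract methods that every backend implements. A stripped-down sketch of that pattern; the class and values below are stand-ins:

```python
# An ABC fixes the descriptor contract; concrete backends implement it.
from abc import ABC, abstractmethod


class BaseDescriptorSketch(ABC):
    @abstractmethod
    def get_sel(self) -> list[int]:
        """Number of selected neighbors per type."""

    @abstractmethod
    def get_type_map(self) -> list[str]:
        """Element name of each atom type."""


class WaterDescriptor(BaseDescriptorSketch):
    def get_sel(self) -> list[int]:
        return [46, 92]

    def get_type_map(self) -> list[str]:
        return ["O", "H"]


print(WaterDescriptor().get_sel())  # [46, 92]
```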
env_protection : float, optional @@ -159,7 +157,7 @@ def __init__( update_residual_init: str = "norm", set_davg_zero: bool = True, smooth: bool = True, - exclude_types: List[Tuple[int, int]] = [], + exclude_types: list[tuple[int, int]] = [], env_protection: float = 0.0, precision: str = "float64", trainable_ln: bool = True, @@ -167,7 +165,7 @@ def __init__( g1_out_conv: bool = True, g1_out_mlp: bool = True, ln_eps: Optional[float] = 1e-5, - seed: Optional[Union[int, List[int]]] = None, + seed: Optional[Union[int, list[int]]] = None, ): super().__init__() self.rcut = rcut @@ -272,7 +270,7 @@ def get_nsel(self) -> int: """Returns the number of selected atoms in the cut-off radius.""" return sum(self.sel) - def get_sel(self) -> List[int]: + def get_sel(self) -> list[int]: """Returns the number of selected atoms for each type.""" return self.sel @@ -337,7 +335,7 @@ def dim_emb(self): def compute_input_stats( self, - merged: Union[Callable[[], List[dict]], List[dict]], + merged: Union[Callable[[], list[dict]], list[dict]], path: Optional[DPPath] = None, ): """Compute the input statistics (e.g. mean and stddev) for the descriptors from packed data.""" @@ -349,7 +347,7 @@ def get_stats(self): def reinit_exclude( self, - exclude_types: List[Tuple[int, int]] = [], + exclude_types: list[tuple[int, int]] = [], ): self.exclude_types = exclude_types self.emask = PairExcludeMask(self.ntypes, exclude_types=exclude_types) @@ -436,7 +434,7 @@ def get_residual( _mode: str = "norm", trainable: bool = True, precision: str = "float64", - seed: Optional[Union[int, List[int]]] = None, + seed: Optional[Union[int, list[int]]] = None, ) -> np.ndarray: """ Get residual tensor for one update vector. @@ -694,7 +692,7 @@ def __init__( smooth: bool = True, attnw_shift: float = 20.0, precision: str = "float64", - seed: Optional[Union[int, List[int]]] = None, + seed: Optional[Union[int, list[int]]] = None, ): """Return neighbor-wise multi-head self-attention maps, with gate mechanism.""" super().__init__() @@ -812,7 +810,7 @@ def __init__( input_dim: int, head_num: int, precision: str = "float64", - seed: Optional[Union[int, List[int]]] = None, + seed: Optional[Union[int, list[int]]] = None, ): super().__init__() self.input_dim = input_dim @@ -897,7 +895,7 @@ def __init__( input_dim: int, head_num: int, precision: str = "float64", - seed: Optional[Union[int, List[int]]] = None, + seed: Optional[Union[int, list[int]]] = None, ): super().__init__() self.input_dim = input_dim @@ -970,7 +968,7 @@ def __init__( smooth: bool = True, attnw_shift: float = 20.0, precision: str = "float64", - seed: Optional[Union[int, List[int]]] = None, + seed: Optional[Union[int, list[int]]] = None, ): super().__init__() self.input_dim = input_dim @@ -1132,7 +1130,7 @@ def __init__( g1_out_conv: bool = True, g1_out_mlp: bool = True, ln_eps: Optional[float] = 1e-5, - seed: Optional[Union[int, List[int]]] = None, + seed: Optional[Union[int, list[int]]] = None, ): super().__init__() self.epsilon = 1e-4 # protection of 1./nnei @@ -1508,10 +1506,10 @@ def call( assert (nf, nloc) == g1.shape[:2] assert (nf, nloc, nnei) == h2.shape[:3] - g2_update: List[np.ndarray] = [g2] - h2_update: List[np.ndarray] = [h2] - g1_update: List[np.ndarray] = [g1] - g1_mlp: List[np.ndarray] = [g1] if not self.g1_out_mlp else [] + g2_update: list[np.ndarray] = [g2] + h2_update: list[np.ndarray] = [h2] + g1_update: list[np.ndarray] = [g1] + g1_mlp: list[np.ndarray] = [g1] if not self.g1_out_mlp else [] if self.g1_out_mlp: assert self.g1_self_mlp is not None g1_self_mlp = 
self.act(self.g1_self_mlp(g1)) @@ -1613,7 +1611,7 @@ def call( def list_update_res_avg( self, - update_list: List[np.ndarray], + update_list: list[np.ndarray], ) -> np.ndarray: nitem = len(update_list) uu = update_list[0] @@ -1621,7 +1619,7 @@ def list_update_res_avg( uu = uu + update_list[ii] return uu / (float(nitem) ** 0.5) - def list_update_res_incr(self, update_list: List[np.ndarray]) -> np.ndarray: + def list_update_res_incr(self, update_list: list[np.ndarray]) -> np.ndarray: nitem = len(update_list) uu = update_list[0] scale = 1.0 / (float(nitem - 1) ** 0.5) if nitem > 1 else 0.0 @@ -1630,7 +1628,7 @@ def list_update_res_incr(self, update_list: List[np.ndarray]) -> np.ndarray: return uu def list_update_res_residual( - self, update_list: List[np.ndarray], update_name: str = "g1" + self, update_list: list[np.ndarray], update_name: str = "g1" ) -> np.ndarray: nitem = len(update_list) uu = update_list[0] @@ -1648,7 +1646,7 @@ def list_update_res_residual( return uu def list_update( - self, update_list: List[np.ndarray], update_name: str = "g1" + self, update_list: list[np.ndarray], update_name: str = "g1" ) -> np.ndarray: if self.update_style == "res_avg": return self.list_update_res_avg(update_list) diff --git a/deepmd/dpmodel/descriptor/se_atten_v2.py b/deepmd/dpmodel/descriptor/se_atten_v2.py index d29580062c..e0ac222524 100644 --- a/deepmd/dpmodel/descriptor/se_atten_v2.py +++ b/deepmd/dpmodel/descriptor/se_atten_v2.py @@ -1,9 +1,7 @@ # SPDX-License-Identifier: LGPL-3.0-or-later from typing import ( Any, - List, Optional, - Tuple, Union, ) @@ -38,9 +36,9 @@ def __init__( self, rcut: float, rcut_smth: float, - sel: Union[List[int], int], + sel: Union[list[int], int], ntypes: int, - neuron: List[int] = [25, 50, 100], + neuron: list[int] = [25, 50, 100], axis_neuron: int = 8, tebd_dim: int = 8, resnet_dt: bool = False, @@ -50,7 +48,7 @@ def __init__( attn_layer: int = 2, attn_dotr: bool = True, attn_mask: bool = False, - exclude_types: List[Tuple[int, int]] = [], + exclude_types: list[tuple[int, int]] = [], env_protection: float = 0.0, set_davg_zero: bool = False, activation_function: str = "tanh", @@ -65,9 +63,9 @@ def __init__( stripped_type_embedding: Optional[bool] = None, use_econf_tebd: bool = False, use_tebd_bias: bool = False, - type_map: Optional[List[str]] = None, + type_map: Optional[list[str]] = None, # consistent with argcheck, not used though - seed: Optional[Union[int, List[int]]] = None, + seed: Optional[Union[int, list[int]]] = None, ) -> None: DescrptDPA1.__init__( self, diff --git a/deepmd/dpmodel/descriptor/se_e2_a.py b/deepmd/dpmodel/descriptor/se_e2_a.py index 11856521c8..29577ef79e 100644 --- a/deepmd/dpmodel/descriptor/se_e2_a.py +++ b/deepmd/dpmodel/descriptor/se_e2_a.py @@ -3,9 +3,7 @@ import itertools from typing import ( Any, - List, Optional, - Tuple, Union, ) @@ -108,7 +106,7 @@ class DescrptSeA(NativeOP, BaseDescriptor): If the weights of embedding net are trainable. type_one_side Try to build N_types embedding nets. Otherwise, building N_types^2 embedding nets - exclude_types : List[List[int]] + exclude_types : list[list[int]] The excluded pairs of types which have no interaction with each other. For example, `[[0, 1]]` means no interaction between type 0 and type 1. env_protection: float @@ -121,7 +119,7 @@ class DescrptSeA(NativeOP, BaseDescriptor): The precision of the embedding net parameters. Supported options are |PRECISION| spin The deepspin object. - type_map: List[str], Optional + type_map: list[str], Optional A list of strings. 
Give the name to each type of atoms. ntypes : int Number of element types. @@ -147,22 +145,22 @@ def __init__( self, rcut: float, rcut_smth: float, - sel: List[int], - neuron: List[int] = [24, 48, 96], + sel: list[int], + neuron: list[int] = [24, 48, 96], axis_neuron: int = 8, resnet_dt: bool = False, trainable: bool = True, type_one_side: bool = True, - exclude_types: List[List[int]] = [], + exclude_types: list[list[int]] = [], env_protection: float = 0.0, set_davg_zero: bool = False, activation_function: str = "tanh", precision: str = DEFAULT_PRECISION, spin: Optional[Any] = None, - type_map: Optional[List[str]] = None, + type_map: Optional[list[str]] = None, ntypes: Optional[int] = None, # to be compat with input # consistent with argcheck, not used though - seed: Optional[Union[int, List[int]]] = None, + seed: Optional[Union[int, list[int]]] = None, ) -> None: del ntypes ## seed, uniform_seed, not included. @@ -282,7 +280,7 @@ def share_params(self, base_class, shared_level, resume=False): raise NotImplementedError def change_type_map( - self, type_map: List[str], model_with_new_type_stat=None + self, type_map: list[str], model_with_new_type_stat=None ) -> None: """Change the type related params to new ones, according to `type_map` and the original one in the model. If there are new types in `type_map`, statistics will be updated accordingly to `model_with_new_type_stat` for these new types. @@ -297,11 +295,11 @@ def get_ntypes(self) -> int: """Returns the number of element types.""" return self.ntypes - def get_type_map(self) -> List[str]: + def get_type_map(self) -> list[str]: """Get the name to each type of atoms.""" return self.type_map - def compute_input_stats(self, merged: List[dict], path: Optional[DPPath] = None): + def compute_input_stats(self, merged: list[dict], path: Optional[DPPath] = None): """Update mean and stddev for descriptor elements.""" raise NotImplementedError @@ -314,7 +312,7 @@ def set_stat_mean_and_stddev( self.davg = mean self.dstd = stddev - def get_stat_mean_and_stddev(self) -> Tuple[np.ndarray, np.ndarray]: + def get_stat_mean_and_stddev(self) -> tuple[np.ndarray, np.ndarray]: """Get mean and stddev for descriptor.""" return self.davg, self.dstd @@ -331,7 +329,7 @@ def cal_g( def reinit_exclude( self, - exclude_types: List[Tuple[int, int]] = [], + exclude_types: list[tuple[int, int]] = [], ): self.exclude_types = exclude_types self.emask = PairExcludeMask(self.ntypes, exclude_types=exclude_types) @@ -473,9 +471,9 @@ def deserialize(cls, data: dict) -> "DescrptSeA": def update_sel( cls, train_data: DeepmdDataSystem, - type_map: Optional[List[str]], + type_map: Optional[list[str]], local_jdata: dict, - ) -> Tuple[dict, Optional[float]]: + ) -> tuple[dict, Optional[float]]: """Update the selection and perform neighbor statistics. Parameters diff --git a/deepmd/dpmodel/descriptor/se_r.py b/deepmd/dpmodel/descriptor/se_r.py index 2d9f6f5a52..c9d27175d6 100644 --- a/deepmd/dpmodel/descriptor/se_r.py +++ b/deepmd/dpmodel/descriptor/se_r.py @@ -2,9 +2,7 @@ import copy from typing import ( Any, - List, Optional, - Tuple, Union, ) @@ -68,7 +66,7 @@ class DescrptSeR(NativeOP, BaseDescriptor): If the weights of embedding net are trainable. type_one_side Try to build N_types embedding nets. Otherwise, building N_types^2 embedding nets - exclude_types : List[List[int]] + exclude_types : list[list[int]] The excluded pairs of types which have no interaction with each other. For example, `[[0, 1]]` means no interaction between type 0 and type 1. 
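[editor's note] `DescrptSeA.deserialize` above uses the same serialization envelope seen in the DPA-1 hunk earlier: plain metadata keys plus an `"@variables"` dict of arrays, each popped back off during deserialization. A self-contained toy round-trip under that convention; `Toy` and the inline version check are stand-ins:

```python
import numpy as np


class Toy:
    def __init__(self, rcut: float):
        self.rcut = rcut
        self.davg = np.zeros(4)

    def serialize(self) -> dict:
        return {
            "@class": "Descriptor",
            "@version": 1,
            "rcut": self.rcut,
            "@variables": {"davg": self.davg},
        }

    @classmethod
    def deserialize(cls, data: dict) -> "Toy":
        data = data.copy()
        # stand-in for check_version_compatibility(data.pop("@version"), 1, 1)
        assert data.pop("@version") == 1
        data.pop("@class")
        variables = data.pop("@variables")
        obj = cls(**data)  # remaining keys are constructor arguments
        obj.davg = variables["davg"]
        return obj


print(Toy.deserialize(Toy(6.0).serialize()).rcut)  # 6.0
```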
set_davg_zero @@ -79,7 +77,7 @@ class DescrptSeR(NativeOP, BaseDescriptor): The precision of the embedding net parameters. Supported options are |PRECISION| spin The deepspin object. - type_map: List[str], Optional + type_map: list[str], Optional A list of strings. Give the name to each type of atoms. ntypes : int Number of element types. @@ -105,21 +103,21 @@ def __init__( self, rcut: float, rcut_smth: float, - sel: List[int], - neuron: List[int] = [24, 48, 96], + sel: list[int], + neuron: list[int] = [24, 48, 96], resnet_dt: bool = False, trainable: bool = True, type_one_side: bool = True, - exclude_types: List[List[int]] = [], + exclude_types: list[list[int]] = [], env_protection: float = 0.0, set_davg_zero: bool = False, activation_function: str = "tanh", precision: str = DEFAULT_PRECISION, spin: Optional[Any] = None, - type_map: Optional[List[str]] = None, + type_map: Optional[list[str]] = None, ntypes: Optional[int] = None, # to be compat with input # consistent with argcheck, not used though - seed: Optional[Union[int, List[int]]] = None, + seed: Optional[Union[int, list[int]]] = None, ) -> None: del ntypes ## seed, uniform_seed, not included. @@ -240,7 +238,7 @@ def share_params(self, base_class, shared_level, resume=False): raise NotImplementedError def change_type_map( - self, type_map: List[str], model_with_new_type_stat=None + self, type_map: list[str], model_with_new_type_stat=None ) -> None: """Change the type related params to new ones, according to `type_map` and the original one in the model. If there are new types in `type_map`, statistics will be updated accordingly to `model_with_new_type_stat` for these new types. @@ -255,11 +253,11 @@ def get_ntypes(self) -> int: """Returns the number of element types.""" return self.ntypes - def get_type_map(self) -> List[str]: + def get_type_map(self) -> list[str]: """Get the name to each type of atoms.""" return self.type_map - def compute_input_stats(self, merged: List[dict], path: Optional[DPPath] = None): + def compute_input_stats(self, merged: list[dict], path: Optional[DPPath] = None): """Update mean and stddev for descriptor elements.""" raise NotImplementedError @@ -272,7 +270,7 @@ def set_stat_mean_and_stddev( self.davg = mean self.dstd = stddev - def get_stat_mean_and_stddev(self) -> Tuple[np.ndarray, np.ndarray]: + def get_stat_mean_and_stddev(self) -> tuple[np.ndarray, np.ndarray]: """Get mean and stddev for descriptor.""" return self.davg, self.dstd @@ -398,9 +396,9 @@ def deserialize(cls, data: dict) -> "DescrptSeR": def update_sel( cls, train_data: DeepmdDataSystem, - type_map: Optional[List[str]], + type_map: Optional[list[str]], local_jdata: dict, - ) -> Tuple[dict, Optional[float]]: + ) -> tuple[dict, Optional[float]]: """Update the selection and perform neighbor statistics. Parameters diff --git a/deepmd/dpmodel/descriptor/se_t.py b/deepmd/dpmodel/descriptor/se_t.py index 364600aa8b..f2ea751c50 100644 --- a/deepmd/dpmodel/descriptor/se_t.py +++ b/deepmd/dpmodel/descriptor/se_t.py @@ -2,9 +2,7 @@ import copy import itertools from typing import ( - List, Optional, - Tuple, Union, ) @@ -73,7 +71,7 @@ class DescrptSeT(NativeOP, BaseDescriptor): The activation function in the embedding net. Supported options are |ACTIVATION_FN| env_protection : float Protection parameter to prevent division by zero errors during environment matrix calculations. - exclude_types : List[List[int]] + exclude_types : list[list[int]] The excluded pairs of types which have no interaction with each other. 
For example, `[[0, 1]]` means no interaction between type 0 and type 1. precision : str @@ -82,7 +80,7 @@ class DescrptSeT(NativeOP, BaseDescriptor): If the weights of embedding net are trainable. seed : int, Optional Random seed for initializing the network parameters. - type_map: List[str], Optional + type_map: list[str], Optional A list of strings. Give the name to each type of atoms. ntypes : int Number of element types. @@ -93,17 +91,17 @@ def __init__( self, rcut: float, rcut_smth: float, - sel: List[int], - neuron: List[int] = [24, 48, 96], + sel: list[int], + neuron: list[int] = [24, 48, 96], resnet_dt: bool = False, set_davg_zero: bool = False, activation_function: str = "tanh", env_protection: float = 0.0, - exclude_types: List[Tuple[int, int]] = [], + exclude_types: list[tuple[int, int]] = [], precision: str = DEFAULT_PRECISION, trainable: bool = True, - seed: Optional[Union[int, List[int]]] = None, - type_map: Optional[List[str]] = None, + seed: Optional[Union[int, list[int]]] = None, + type_map: Optional[list[str]] = None, ntypes: Optional[int] = None, # to be compat with input ) -> None: del ntypes @@ -174,7 +172,7 @@ def dim_out(self): return self.get_dim_out() def change_type_map( - self, type_map: List[str], model_with_new_type_stat=None + self, type_map: list[str], model_with_new_type_stat=None ) -> None: """Change the type related params to new ones, according to `type_map` and the original one in the model. If there are new types in `type_map`, statistics will be updated accordingly to `model_with_new_type_stat` for these new types. @@ -235,11 +233,11 @@ def get_ntypes(self) -> int: """Returns the number of element types.""" return self.ntypes - def get_type_map(self) -> List[str]: + def get_type_map(self) -> list[str]: """Get the name to each type of atoms.""" return self.type_map - def compute_input_stats(self, merged: List[dict], path: Optional[DPPath] = None): + def compute_input_stats(self, merged: list[dict], path: Optional[DPPath] = None): """Update mean and stddev for descriptor elements.""" raise NotImplementedError @@ -252,13 +250,13 @@ def set_stat_mean_and_stddev( self.davg = mean self.dstd = stddev - def get_stat_mean_and_stddev(self) -> Tuple[np.ndarray, np.ndarray]: + def get_stat_mean_and_stddev(self) -> tuple[np.ndarray, np.ndarray]: """Get mean and stddev for descriptor.""" return self.davg, self.dstd def reinit_exclude( self, - exclude_types: List[Tuple[int, int]] = [], + exclude_types: list[tuple[int, int]] = [], ): self.exclude_types = exclude_types self.emask = PairExcludeMask(self.ntypes, exclude_types=exclude_types) @@ -399,9 +397,9 @@ def deserialize(cls, data: dict) -> "DescrptSeT": def update_sel( cls, train_data: DeepmdDataSystem, - type_map: Optional[List[str]], + type_map: Optional[list[str]], local_jdata: dict, - ) -> Tuple[dict, Optional[float]]: + ) -> tuple[dict, Optional[float]]: """Update the selection and perform neighbor statistics. 
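[editor's note] The `update_sel` classmethod whose signature is modernized above (and in the other descriptors) shares one contract: consume the training data and the local json dict, fill in an automatic `sel` from neighbor statistics, and return the updated dict plus an optional minimal neighbor distance. A hedged sketch of that shape; the 10% headroom is an assumption for illustration, not the library's rule:

```python
from typing import Optional


def update_sel_sketch(
    local_jdata: dict, observed_max_nnei: int
) -> tuple[dict, Optional[float]]:
    out = dict(local_jdata)
    if out.get("sel") in (None, "auto"):
        # replace the placeholder with a statistics-derived value
        out["sel"] = int(observed_max_nnei * 1.1)
    return out, None


print(update_sel_sketch({"sel": "auto"}, 100))  # ({'sel': 110}, None)
```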
Parameters diff --git a/deepmd/dpmodel/descriptor/se_t_tebd.py b/deepmd/dpmodel/descriptor/se_t_tebd.py index b6e362d2d7..147a335926 100644 --- a/deepmd/dpmodel/descriptor/se_t_tebd.py +++ b/deepmd/dpmodel/descriptor/se_t_tebd.py @@ -1,9 +1,7 @@ # SPDX-License-Identifier: LGPL-3.0-or-later from typing import ( Callable, - List, Optional, - Tuple, Union, ) @@ -64,7 +62,7 @@ class DescrptSeTTebd(NativeOP, BaseDescriptor): The cut-off radius rcut_smth From where the environment matrix should be smoothed - sel : Union[List[int], int] + sel : Union[list[int], int] list[int]: sel[i] specifies the maxmum number of type i atoms in the cut-off radius int: the total maxmum number of atoms in the cut-off radius ntypes : int @@ -86,7 +84,7 @@ class DescrptSeTTebd(NativeOP, BaseDescriptor): The activation function in the embedding net. Supported options are |ACTIVATION_FN| env_protection: float Protection parameter to prevent division by zero errors during environment matrix calculations. - exclude_types : List[Tuple[int, int]] + exclude_types : list[tuple[int, int]] The excluded pairs of types which have no interaction with each other. For example, `[[0, 1]]` means no interaction between type 0 and type 1. precision @@ -95,7 +93,7 @@ class DescrptSeTTebd(NativeOP, BaseDescriptor): If the weights of embedding net are trainable. seed Random seed for initializing the network parameters. - type_map: List[str], Optional + type_map: list[str], Optional A list of strings. Give the name to each type of atoms. concat_output_tebd: bool Whether to concat type embedding at the output of the descriptor. @@ -112,7 +110,7 @@ def __init__( self, rcut: float, rcut_smth: float, - sel: Union[List[int], int], + sel: Union[list[int], int], ntypes: int, neuron: list = [2, 4, 8], tebd_dim: int = 8, @@ -121,11 +119,11 @@ def __init__( set_davg_zero: bool = True, activation_function: str = "tanh", env_protection: float = 0.0, - exclude_types: List[Tuple[int, int]] = [], + exclude_types: list[tuple[int, int]] = [], precision: str = "float64", trainable: bool = True, - seed: Optional[Union[int, List[int]]] = None, - type_map: Optional[List[str]] = None, + seed: Optional[Union[int, list[int]]] = None, + type_map: Optional[list[str]] = None, concat_output_tebd: bool = True, use_econf_tebd: bool = False, use_tebd_bias=False, @@ -178,7 +176,7 @@ def get_nsel(self) -> int: """Returns the number of selected atoms in the cut-off radius.""" return self.se_ttebd.get_nsel() - def get_sel(self) -> List[int]: + def get_sel(self) -> list[int]: """Returns the number of selected atoms for each type.""" return self.se_ttebd.get_sel() @@ -186,7 +184,7 @@ def get_ntypes(self) -> int: """Returns the number of element types.""" return self.se_ttebd.get_ntypes() - def get_type_map(self) -> List[str]: + def get_type_map(self) -> list[str]: """Get the name to each type of atoms.""" return self.type_map @@ -240,7 +238,7 @@ def dim_out(self): def dim_emb(self): return self.get_dim_emb() - def compute_input_stats(self, merged: List[dict], path: Optional[DPPath] = None): + def compute_input_stats(self, merged: list[dict], path: Optional[DPPath] = None): """Update mean and stddev for descriptor elements.""" raise NotImplementedError @@ -253,12 +251,12 @@ def set_stat_mean_and_stddev( self.se_ttebd.mean = mean self.se_ttebd.stddev = stddev - def get_stat_mean_and_stddev(self) -> Tuple[np.ndarray, np.ndarray]: + def get_stat_mean_and_stddev(self) -> tuple[np.ndarray, np.ndarray]: """Get mean and stddev for descriptor.""" return self.se_ttebd.mean, 
self.se_ttebd.stddev def change_type_map( - self, type_map: List[str], model_with_new_type_stat=None + self, type_map: list[str], model_with_new_type_stat=None ) -> None: """Change the type related params to new ones, according to `type_map` and the original one in the model. If there are new types in `type_map`, statistics will be updated accordingly to `model_with_new_type_stat` for these new types. @@ -412,9 +410,9 @@ def deserialize(cls, data: dict) -> "DescrptSeTTebd": def update_sel( cls, train_data: DeepmdDataSystem, - type_map: Optional[List[str]], + type_map: Optional[list[str]], local_jdata: dict, - ) -> Tuple[dict, Optional[float]]: + ) -> tuple[dict, Optional[float]]: """Update the selection and perform neighbor statistics. Parameters @@ -447,7 +445,7 @@ def __init__( self, rcut: float, rcut_smth: float, - sel: Union[List[int], int], + sel: Union[list[int], int], ntypes: int, neuron: list = [25, 50, 100], tebd_dim: int = 8, @@ -456,10 +454,10 @@ def __init__( activation_function="tanh", precision: str = "float64", resnet_dt: bool = False, - exclude_types: List[Tuple[int, int]] = [], + exclude_types: list[tuple[int, int]] = [], env_protection: float = 0.0, smooth: bool = True, - seed: Optional[Union[int, List[int]]] = None, + seed: Optional[Union[int, list[int]]] = None, ) -> None: self.rcut = rcut self.rcut_smth = rcut_smth @@ -541,7 +539,7 @@ def get_nsel(self) -> int: """Returns the number of selected atoms in the cut-off radius.""" return sum(self.sel) - def get_sel(self) -> List[int]: + def get_sel(self) -> list[int]: """Returns the number of selected atoms for each type.""" return self.sel @@ -610,7 +608,7 @@ def dim_emb(self): def compute_input_stats( self, - merged: Union[Callable[[], List[dict]], List[dict]], + merged: Union[Callable[[], list[dict]], list[dict]], path: Optional[DPPath] = None, ): """Compute the input statistics (e.g. mean and stddev) for the descriptors from packed data.""" @@ -622,7 +620,7 @@ def get_stats(self): def reinit_exclude( self, - exclude_types: List[Tuple[int, int]] = [], + exclude_types: list[tuple[int, int]] = [], ): self.exclude_types = exclude_types self.emask = PairExcludeMask(self.ntypes, exclude_types=exclude_types) diff --git a/deepmd/dpmodel/fitting/dipole_fitting.py b/deepmd/dpmodel/fitting/dipole_fitting.py index 20e732823b..f67bbc93a4 100644 --- a/deepmd/dpmodel/fitting/dipole_fitting.py +++ b/deepmd/dpmodel/fitting/dipole_fitting.py @@ -2,8 +2,6 @@ import copy from typing import ( Any, - Dict, - List, Optional, Union, ) @@ -81,7 +79,7 @@ class DipoleFitting(GeneralFitting): c_differentiable If the variable is differentiated with respect to the cell tensor (pbc case). Only reducible variable are differentiable. - type_map: List[str], Optional + type_map: list[str], Optional A list of strings. Give the name to each type of atoms. 
""" @@ -90,25 +88,25 @@ def __init__( ntypes: int, dim_descrpt: int, embedding_width: int, - neuron: List[int] = [120, 120, 120], + neuron: list[int] = [120, 120, 120], resnet_dt: bool = True, numb_fparam: int = 0, numb_aparam: int = 0, rcond: Optional[float] = None, tot_ener_zero: bool = False, - trainable: Optional[List[bool]] = None, + trainable: Optional[list[bool]] = None, activation_function: str = "tanh", precision: str = DEFAULT_PRECISION, - layer_name: Optional[List[Optional[str]]] = None, + layer_name: Optional[list[Optional[str]]] = None, use_aparam_as_mask: bool = False, spin: Any = None, mixed_types: bool = False, - exclude_types: List[int] = [], + exclude_types: list[int] = [], r_differentiable: bool = True, c_differentiable: bool = True, - type_map: Optional[List[str]] = None, + type_map: Optional[list[str]] = None, old_impl=False, - seed: Optional[Union[int, List[int]]] = None, + seed: Optional[Union[int, list[int]]] = None, ): if tot_ener_zero: raise NotImplementedError("tot_ener_zero is not implemented") @@ -188,7 +186,7 @@ def call( h2: Optional[np.ndarray] = None, fparam: Optional[np.ndarray] = None, aparam: Optional[np.ndarray] = None, - ) -> Dict[str, np.ndarray]: + ) -> dict[str, np.ndarray]: """Calculate the fitting. Parameters diff --git a/deepmd/dpmodel/fitting/dos_fitting.py b/deepmd/dpmodel/fitting/dos_fitting.py index 0d4cee68e2..e9cd4a17ae 100644 --- a/deepmd/dpmodel/fitting/dos_fitting.py +++ b/deepmd/dpmodel/fitting/dos_fitting.py @@ -2,7 +2,6 @@ import copy from typing import ( TYPE_CHECKING, - List, Optional, Union, ) @@ -33,19 +32,19 @@ def __init__( ntypes: int, dim_descrpt: int, numb_dos: int = 300, - neuron: List[int] = [120, 120, 120], + neuron: list[int] = [120, 120, 120], resnet_dt: bool = True, numb_fparam: int = 0, numb_aparam: int = 0, bias_dos: Optional[np.ndarray] = None, rcond: Optional[float] = None, - trainable: Union[bool, List[bool]] = True, + trainable: Union[bool, list[bool]] = True, activation_function: str = "tanh", precision: str = DEFAULT_PRECISION, mixed_types: bool = False, - exclude_types: List[int] = [], - type_map: Optional[List[str]] = None, - seed: Optional[Union[int, List[int]]] = None, + exclude_types: list[int] = [], + type_map: Optional[list[str]] = None, + seed: Optional[Union[int, list[int]]] = None, ): if bias_dos is not None: self.bias_dos = bias_dos diff --git a/deepmd/dpmodel/fitting/ener_fitting.py b/deepmd/dpmodel/fitting/ener_fitting.py index 60f23f9628..9a1eae0156 100644 --- a/deepmd/dpmodel/fitting/ener_fitting.py +++ b/deepmd/dpmodel/fitting/ener_fitting.py @@ -3,7 +3,6 @@ from typing import ( TYPE_CHECKING, Any, - List, Optional, Union, ) @@ -30,23 +29,23 @@ def __init__( self, ntypes: int, dim_descrpt: int, - neuron: List[int] = [120, 120, 120], + neuron: list[int] = [120, 120, 120], resnet_dt: bool = True, numb_fparam: int = 0, numb_aparam: int = 0, rcond: Optional[float] = None, tot_ener_zero: bool = False, - trainable: Optional[List[bool]] = None, - atom_ener: Optional[List[float]] = None, + trainable: Optional[list[bool]] = None, + atom_ener: Optional[list[float]] = None, activation_function: str = "tanh", precision: str = DEFAULT_PRECISION, - layer_name: Optional[List[Optional[str]]] = None, + layer_name: Optional[list[Optional[str]]] = None, use_aparam_as_mask: bool = False, spin: Any = None, mixed_types: bool = False, - exclude_types: List[int] = [], - type_map: Optional[List[str]] = None, - seed: Optional[Union[int, List[int]]] = None, + exclude_types: list[int] = [], + type_map: Optional[list[str]] = 
None, + seed: Optional[Union[int, list[int]]] = None, ): super().__init__( var_name="energy", diff --git a/deepmd/dpmodel/fitting/general_fitting.py b/deepmd/dpmodel/fitting/general_fitting.py index a20405018e..fd80ccb4aa 100644 --- a/deepmd/dpmodel/fitting/general_fitting.py +++ b/deepmd/dpmodel/fitting/general_fitting.py @@ -5,18 +5,20 @@ ) from typing import ( Any, - Dict, - List, Optional, Union, ) +import array_api_compat import numpy as np from deepmd.dpmodel import ( DEFAULT_PRECISION, NativeOP, ) +from deepmd.dpmodel.common import ( + to_numpy_array, +) from deepmd.dpmodel.utils import ( AtomExcludeMask, FittingNet, @@ -78,15 +80,15 @@ class GeneralFitting(NativeOP, BaseFitting): mixed_types If true, use a uniform fitting net for all atom types, otherwise use different fitting nets for different atom types. - exclude_types: List[int] + exclude_types: list[int] Atomic contributions of the excluded atom types are set zero. - remove_vaccum_contribution: List[bool], optional + remove_vaccum_contribution: list[bool], optional Remove vaccum contribution before the bias is added. The list assigned each type. For `mixed_types` provide `[True]`, otherwise it should be a list of the same length as `ntypes` signaling if or not removing the vaccum contribution for the atom types in the list. - type_map: List[str], Optional + type_map: list[str], Optional A list of strings. Give the name to each type of atoms. - seed: Optional[Union[int, List[int]]] + seed: Optional[Union[int, list[int]]] Random seed for initializing the network parameters. """ @@ -95,24 +97,24 @@ def __init__( var_name: str, ntypes: int, dim_descrpt: int, - neuron: List[int] = [120, 120, 120], + neuron: list[int] = [120, 120, 120], resnet_dt: bool = True, numb_fparam: int = 0, numb_aparam: int = 0, bias_atom_e: Optional[np.ndarray] = None, rcond: Optional[float] = None, tot_ener_zero: bool = False, - trainable: Optional[List[bool]] = None, + trainable: Optional[list[bool]] = None, activation_function: str = "tanh", precision: str = DEFAULT_PRECISION, - layer_name: Optional[List[Optional[str]]] = None, + layer_name: Optional[list[Optional[str]]] = None, use_aparam_as_mask: bool = False, spin: Any = None, mixed_types: bool = True, - exclude_types: List[int] = [], - remove_vaccum_contribution: Optional[List[bool]] = None, - type_map: Optional[List[str]] = None, - seed: Optional[Union[int, List[int]]] = None, + exclude_types: list[int] = [], + remove_vaccum_contribution: Optional[list[bool]] = None, + type_map: Optional[list[str]] = None, + seed: Optional[Union[int, list[int]]] = None, ): self.var_name = var_name self.ntypes = ntypes @@ -192,7 +194,7 @@ def get_dim_aparam(self) -> int: """Get the number (dimension) of atomic parameters of this atomic model.""" return self.numb_aparam - def get_sel_type(self) -> List[int]: + def get_sel_type(self) -> list[int]: """Get the selected atom types of this model. Only atoms with selected atom types have atomic contribution @@ -201,12 +203,12 @@ def get_sel_type(self) -> List[int]: """ return [ii for ii in range(self.ntypes) if ii not in self.exclude_types] - def get_type_map(self) -> List[str]: + def get_type_map(self) -> list[str]: """Get the name to each type of atoms.""" return self.type_map def change_type_map( - self, type_map: List[str], model_with_new_type_stat=None + self, type_map: list[str], model_with_new_type_stat=None ) -> None: """Change the type related params to new ones, according to `type_map` and the original one in the model. 
If there are new types in `type_map`, statistics will be updated accordingly to `model_with_new_type_stat` for these new types. @@ -261,7 +263,7 @@ def __getitem__(self, key): def reinit_exclude( self, - exclude_types: List[int] = [], + exclude_types: list[int] = [], ): self.exclude_types = exclude_types self.emask = AtomExcludeMask(self.ntypes, self.exclude_types) @@ -285,11 +287,11 @@ def serialize(self) -> dict: "exclude_types": self.exclude_types, "nets": self.nets.serialize(), "@variables": { - "bias_atom_e": self.bias_atom_e, - "fparam_avg": self.fparam_avg, - "fparam_inv_std": self.fparam_inv_std, - "aparam_avg": self.aparam_avg, - "aparam_inv_std": self.aparam_inv_std, + "bias_atom_e": to_numpy_array(self.bias_atom_e), + "fparam_avg": to_numpy_array(self.fparam_avg), + "fparam_inv_std": to_numpy_array(self.fparam_inv_std), + "aparam_avg": to_numpy_array(self.aparam_avg), + "aparam_inv_std": to_numpy_array(self.aparam_inv_std), }, "type_map": self.type_map, # not supported @@ -322,7 +324,7 @@ def _call_common( h2: Optional[np.ndarray] = None, fparam: Optional[np.ndarray] = None, aparam: Optional[np.ndarray] = None, - ) -> Dict[str, np.ndarray]: + ) -> dict[str, np.ndarray]: """Calculate the fitting. Parameters @@ -346,6 +348,7 @@ def _call_common( The atomic parameter. shape: nf x nloc x nap. nap being `numb_aparam` """ + xp = array_api_compat.array_namespace(descriptor, atype) nf, nloc, nd = descriptor.shape net_dim_out = self._net_out_dim() # check input dim @@ -361,7 +364,7 @@ def _call_common( # we consider it as always zero for convenience. # Needs a compute_input_stats for vaccum passed from the # descriptor. - xx_zeros = np.zeros_like(xx) + xx_zeros = xp.zeros_like(xx) else: xx_zeros = None # check fparam dim, concate to input descriptor @@ -373,13 +376,15 @@ def _call_common( "which is not consistent with {self.numb_fparam}.", ) fparam = (fparam - self.fparam_avg) * self.fparam_inv_std - fparam = np.tile(fparam.reshape([nf, 1, self.numb_fparam]), [1, nloc, 1]) - xx = np.concatenate( + fparam = xp.tile( + xp.reshape(fparam, [nf, 1, self.numb_fparam]), (1, nloc, 1) + ) + xx = xp.concat( [xx, fparam], axis=-1, ) if xx_zeros is not None: - xx_zeros = np.concatenate( + xx_zeros = xp.concat( [xx_zeros, fparam], axis=-1, ) @@ -391,24 +396,24 @@ def _call_common( "get an input aparam of dim {aparam.shape[-1]}, ", "which is not consistent with {self.numb_aparam}.", ) - aparam = aparam.reshape([nf, nloc, self.numb_aparam]) + aparam = xp.reshape(aparam, [nf, nloc, self.numb_aparam]) aparam = (aparam - self.aparam_avg) * self.aparam_inv_std - xx = np.concatenate( + xx = xp.concat( [xx, aparam], axis=-1, ) if xx_zeros is not None: - xx_zeros = np.concatenate( + xx_zeros = xp.concat( [xx_zeros, aparam], axis=-1, ) # calcualte the prediction if not self.mixed_types: - outs = np.zeros([nf, nloc, net_dim_out]) # pylint: disable=no-explicit-dtype + outs = xp.zeros([nf, nloc, net_dim_out]) # pylint: disable=no-explicit-dtype for type_i in range(self.ntypes): - mask = np.tile( - (atype == type_i).reshape([nf, nloc, 1]), [1, 1, net_dim_out] + mask = xp.tile( + xp.reshape((atype == type_i), [nf, nloc, 1]), (1, 1, net_dim_out) ) atom_property = self.nets[(type_i,)](xx) if self.remove_vaccum_contribution is not None and not ( @@ -417,15 +422,18 @@ def _call_common( ): assert xx_zeros is not None atom_property -= self.nets[(type_i,)](xx_zeros) - atom_property = atom_property + self.bias_atom_e[type_i] - atom_property = atom_property * mask + atom_property = atom_property + self.bias_atom_e[type_i, 
...] + atom_property = atom_property * xp.astype(mask, atom_property.dtype) outs = outs + atom_property # Shape is [nframes, natoms[0], 1] else: - outs = self.nets[()](xx) + self.bias_atom_e[atype] + outs = self.nets[()](xx) + xp.reshape( + xp.take(self.bias_atom_e, xp.reshape(atype, [-1]), axis=0), + [nf, nloc, net_dim_out], + ) if xx_zeros is not None: outs -= self.nets[()](xx_zeros) # nf x nloc exclude_mask = self.emask.build_type_exclude_mask(atype) # nf x nloc x nod - outs = outs * exclude_mask[:, :, None] + outs = outs * xp.astype(exclude_mask[:, :, None], outs.dtype) return {self.var_name: outs} diff --git a/deepmd/dpmodel/fitting/invar_fitting.py b/deepmd/dpmodel/fitting/invar_fitting.py index 2e469eefe1..893853bb38 100644 --- a/deepmd/dpmodel/fitting/invar_fitting.py +++ b/deepmd/dpmodel/fitting/invar_fitting.py @@ -2,8 +2,6 @@ import copy from typing import ( Any, - Dict, - List, Optional, Union, ) @@ -105,9 +103,9 @@ class InvarFitting(GeneralFitting): And the aparam will not be used as the atomic parameters for embedding. mixed_types If false, different atomic types uses different fitting net, otherwise different atom types share the same fitting net. - exclude_types: List[int] + exclude_types: list[int] Atomic contributions of the excluded atom types are set zero. - type_map: List[str], Optional + type_map: list[str], Optional A list of strings. Give the name to each type of atoms. """ @@ -118,24 +116,24 @@ def __init__( ntypes: int, dim_descrpt: int, dim_out: int, - neuron: List[int] = [120, 120, 120], + neuron: list[int] = [120, 120, 120], resnet_dt: bool = True, numb_fparam: int = 0, numb_aparam: int = 0, bias_atom: Optional[np.ndarray] = None, rcond: Optional[float] = None, tot_ener_zero: bool = False, - trainable: Optional[List[bool]] = None, - atom_ener: Optional[List[float]] = None, + trainable: Optional[list[bool]] = None, + atom_ener: Optional[list[float]] = None, activation_function: str = "tanh", precision: str = DEFAULT_PRECISION, - layer_name: Optional[List[Optional[str]]] = None, + layer_name: Optional[list[Optional[str]]] = None, use_aparam_as_mask: bool = False, spin: Any = None, mixed_types: bool = True, - exclude_types: List[int] = [], - type_map: Optional[List[str]] = None, - seed: Optional[Union[int, List[int]]] = None, + exclude_types: list[int] = [], + type_map: Optional[list[str]] = None, + seed: Optional[Union[int, list[int]]] = None, ): if tot_ener_zero: raise NotImplementedError("tot_ener_zero is not implemented") @@ -219,7 +217,7 @@ def call( h2: Optional[np.ndarray] = None, fparam: Optional[np.ndarray] = None, aparam: Optional[np.ndarray] = None, - ) -> Dict[str, np.ndarray]: + ) -> dict[str, np.ndarray]: """Calculate the fitting. 
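[editor's note] The `_call_common` rewrite above keeps the non-mixed-types logic: one fitting net per atom type, the per-type bias added before the type mask is applied, and the masked contributions summed. A minimal NumPy sketch of that branch; the toy nets are stand-ins:

```python
import numpy as np


def per_type_fit(xx, atype, nets, bias):
    # xx: (nf, nloc, nd) descriptors; atype: (nf, nloc) integer types
    nf, nloc, _ = xx.shape
    outs = np.zeros((nf, nloc, 1))
    for t, net in enumerate(nets):
        # cast the boolean mask to the value dtype, as the diff does with
        # xp.astype(mask, atom_property.dtype)
        mask = (atype == t)[..., None].astype(xx.dtype)
        outs = outs + (net(xx) + bias[t]) * mask
    return outs


nets = [
    lambda x: x.sum(-1, keepdims=True),   # net for type 0
    lambda x: x.mean(-1, keepdims=True),  # net for type 1
]
out = per_type_fit(np.ones((1, 3, 4)), np.array([[0, 1, 0]]), nets, np.zeros(2))
print(out.ravel())  # [4. 1. 4.]
```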
Parameters diff --git a/deepmd/dpmodel/fitting/make_base_fitting.py b/deepmd/dpmodel/fitting/make_base_fitting.py index 417ccc892a..a67273356d 100644 --- a/deepmd/dpmodel/fitting/make_base_fitting.py +++ b/deepmd/dpmodel/fitting/make_base_fitting.py @@ -4,8 +4,6 @@ abstractmethod, ) from typing import ( - Dict, - List, Optional, ) @@ -60,7 +58,7 @@ def fwd( h2: Optional[t_tensor] = None, fparam: Optional[t_tensor] = None, aparam: Optional[t_tensor] = None, - ) -> Dict[str, t_tensor]: + ) -> dict[str, t_tensor]: """Calculate fitting.""" pass @@ -69,13 +67,13 @@ def compute_output_stats(self, merged): raise NotImplementedError @abstractmethod - def get_type_map(self) -> List[str]: + def get_type_map(self) -> list[str]: """Get the name to each type of atoms.""" pass @abstractmethod def change_type_map( - self, type_map: List[str], model_with_new_type_stat=None + self, type_map: list[str], model_with_new_type_stat=None ) -> None: """Change the type related params to new ones, according to `type_map` and the original one in the model. If there are new types in `type_map`, statistics will be updated accordingly to `model_with_new_type_stat` for these new types. diff --git a/deepmd/dpmodel/fitting/polarizability_fitting.py b/deepmd/dpmodel/fitting/polarizability_fitting.py index d3036fe8b8..2ff5052a83 100644 --- a/deepmd/dpmodel/fitting/polarizability_fitting.py +++ b/deepmd/dpmodel/fitting/polarizability_fitting.py @@ -2,8 +2,6 @@ import copy from typing import ( Any, - Dict, - List, Optional, Union, ) @@ -82,11 +80,11 @@ class PolarFitting(GeneralFitting): fit_diag : bool Fit the diagonal part of the rotational invariant polarizability matrix, which will be converted to normal polarizability matrix by contracting with the rotation matrix. - scale : List[float] + scale : list[float] The output of the fitting net (polarizability matrix) for type i atom will be scaled by scale[i] shift_diag : bool Whether to shift the diagonal part of the polarizability matrix. The shift operation is carried out after scale. - type_map: List[str], Optional + type_map: list[str], Optional A list of strings. Give the name to each type of atoms. 
""" @@ -95,26 +93,26 @@ def __init__( ntypes: int, dim_descrpt: int, embedding_width: int, - neuron: List[int] = [120, 120, 120], + neuron: list[int] = [120, 120, 120], resnet_dt: bool = True, numb_fparam: int = 0, numb_aparam: int = 0, rcond: Optional[float] = None, tot_ener_zero: bool = False, - trainable: Optional[List[bool]] = None, + trainable: Optional[list[bool]] = None, activation_function: str = "tanh", precision: str = DEFAULT_PRECISION, - layer_name: Optional[List[Optional[str]]] = None, + layer_name: Optional[list[Optional[str]]] = None, use_aparam_as_mask: bool = False, spin: Any = None, mixed_types: bool = False, - exclude_types: List[int] = [], + exclude_types: list[int] = [], old_impl: bool = False, fit_diag: bool = True, - scale: Optional[List[float]] = None, + scale: Optional[list[float]] = None, shift_diag: bool = True, - type_map: Optional[List[str]] = None, - seed: Optional[Union[int, List[int]]] = None, + type_map: Optional[list[str]] = None, + seed: Optional[Union[int, list[int]]] = None, ): if tot_ener_zero: raise NotImplementedError("tot_ener_zero is not implemented") @@ -223,7 +221,7 @@ def output_def(self): ) def change_type_map( - self, type_map: List[str], model_with_new_type_stat=None + self, type_map: list[str], model_with_new_type_stat=None ) -> None: """Change the type related params to new ones, according to `type_map` and the original one in the model. If there are new types in `type_map`, statistics will be updated accordingly to `model_with_new_type_stat` for these new types. @@ -257,7 +255,7 @@ def call( h2: Optional[np.ndarray] = None, fparam: Optional[np.ndarray] = None, aparam: Optional[np.ndarray] = None, - ) -> Dict[str, np.ndarray]: + ) -> dict[str, np.ndarray]: """Calculate the fitting. Parameters diff --git a/deepmd/dpmodel/fitting/property_fitting.py b/deepmd/dpmodel/fitting/property_fitting.py index 014dda4188..1a8fe44aae 100644 --- a/deepmd/dpmodel/fitting/property_fitting.py +++ b/deepmd/dpmodel/fitting/property_fitting.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: LGPL-3.0-or-later import copy from typing import ( - List, Optional, Union, ) @@ -60,9 +59,9 @@ class PropertyFittingNet(InvarFitting): The precision of the embedding net parameters. Supported options are |PRECISION| mixed_types If false, different atomic types uses different fitting net, otherwise different atom types share the same fitting net. - exclude_types: List[int] + exclude_types: list[int] Atomic contributions of the excluded atom types are set zero. - type_map: List[str], Optional + type_map: list[str], Optional A list of strings. Give the name to each type of atoms. 
""" @@ -71,10 +70,10 @@ def __init__( ntypes: int, dim_descrpt: int, task_dim: int = 1, - neuron: List[int] = [128, 128, 128], + neuron: list[int] = [128, 128, 128], bias_atom_p: Optional[np.ndarray] = None, rcond: Optional[float] = None, - trainable: Union[bool, List[bool]] = True, + trainable: Union[bool, list[bool]] = True, intensive: bool = False, bias_method: str = "normal", resnet_dt: bool = True, @@ -83,8 +82,8 @@ def __init__( activation_function: str = "tanh", precision: str = DEFAULT_PRECISION, mixed_types: bool = True, - exclude_types: List[int] = [], - type_map: Optional[List[str]] = None, + exclude_types: list[int] = [], + type_map: Optional[list[str]] = None, # not used seed: Optional[int] = None, ): diff --git a/deepmd/dpmodel/infer/deep_eval.py b/deepmd/dpmodel/infer/deep_eval.py index 02625f5331..695edb29d2 100644 --- a/deepmd/dpmodel/infer/deep_eval.py +++ b/deepmd/dpmodel/infer/deep_eval.py @@ -4,11 +4,7 @@ TYPE_CHECKING, Any, Callable, - Dict, - List, Optional, - Tuple, - Type, Union, ) @@ -109,7 +105,7 @@ def get_ntypes(self) -> int: """Get the number of atom types of this model.""" return len(self.type_map) - def get_type_map(self) -> List[str]: + def get_type_map(self) -> list[str]: """Get the type map (element name of the atom types) of this model.""" return self.type_map @@ -122,7 +118,7 @@ def get_dim_aparam(self) -> int: return self.dp.get_dim_aparam() @property - def model_type(self) -> Type["DeepEvalWrapper"]: + def model_type(self) -> type["DeepEvalWrapper"]: """The the evaluator of the model type.""" model_output_type = self.dp.model_output_type() if "energy" in model_output_type: @@ -138,7 +134,7 @@ def model_type(self) -> Type["DeepEvalWrapper"]: else: raise RuntimeError("Unknown model type") - def get_sel_type(self) -> List[int]: + def get_sel_type(self) -> list[int]: """Get the selected atom types of this model. Only atoms with selected atom types have atomic contribution @@ -168,7 +164,7 @@ def eval( fparam: Optional[np.ndarray] = None, aparam: Optional[np.ndarray] = None, **kwargs: Any, - ) -> Dict[str, np.ndarray]: + ) -> dict[str, np.ndarray]: """Evaluate the energy, force and virial by using this DP. Parameters @@ -226,7 +222,7 @@ def eval( ) ) - def _get_request_defs(self, atomic: bool) -> List[OutputVariableDef]: + def _get_request_defs(self, atomic: bool) -> list[OutputVariableDef]: """Get the requested output definitions. When atomic is True, all output_def are requested. @@ -290,7 +286,7 @@ def _get_natoms_and_nframes( coords: np.ndarray, atom_types: np.ndarray, mixed_type: bool = False, - ) -> Tuple[int, int]: + ) -> tuple[int, int]: if mixed_type: natoms = len(atom_types[0]) else: @@ -307,7 +303,7 @@ def _eval_model( coords: np.ndarray, cells: Optional[np.ndarray], atom_types: np.ndarray, - request_defs: List[OutputVariableDef], + request_defs: list[OutputVariableDef], ): model = self.dp diff --git a/deepmd/dpmodel/model/base_model.py b/deepmd/dpmodel/model/base_model.py index c6d482c72f..3f71003bad 100644 --- a/deepmd/dpmodel/model/base_model.py +++ b/deepmd/dpmodel/model/base_model.py @@ -7,10 +7,7 @@ ) from typing import ( Any, - List, Optional, - Tuple, - Type, ) from deepmd.utils.data_system import ( @@ -22,7 +19,7 @@ ) -def make_base_model() -> Type[object]: +def make_base_model() -> type[object]: class BaseBaseModel(ABC, PluginVariant, make_plugin_registry("model")): """Base class for final exported model that will be directly used for inference. 
@@ -67,7 +64,7 @@ def __call__(self, *args: Any, **kwds: Any) -> Any: pass @abstractmethod - def get_type_map(self) -> List[str]: + def get_type_map(self) -> list[str]: """Get the type map.""" @abstractmethod @@ -83,7 +80,7 @@ def get_dim_aparam(self): """Get the number (dimension) of atomic parameters of this atomic model.""" @abstractmethod - def get_sel_type(self) -> List[int]: + def get_sel_type(self) -> list[int]: """Get the selected atom types of this model. Only atoms with selected atom types have atomic contribution @@ -99,7 +96,7 @@ def is_aparam_nall(self) -> bool: """ @abstractmethod - def model_output_type(self) -> List[str]: + def model_output_type(self) -> list[str]: """Get the output type for the model.""" @abstractmethod @@ -166,9 +163,9 @@ def get_nsel(self) -> int: def update_sel( cls, train_data: DeepmdDataSystem, - type_map: Optional[List[str]], + type_map: Optional[list[str]], local_jdata: dict, - ) -> Tuple[dict, Optional[float]]: + ) -> tuple[dict, Optional[float]]: """Update the selection and perform neighbor statistics. Parameters diff --git a/deepmd/dpmodel/model/dp_model.py b/deepmd/dpmodel/model/dp_model.py index 1597ba0b14..eda0414398 100644 --- a/deepmd/dpmodel/model/dp_model.py +++ b/deepmd/dpmodel/model/dp_model.py @@ -2,9 +2,7 @@ from typing import ( - List, Optional, - Tuple, ) from deepmd.dpmodel.descriptor.base_descriptor import ( @@ -21,9 +19,9 @@ class DPModelCommon: def update_sel( cls, train_data: DeepmdDataSystem, - type_map: Optional[List[str]], + type_map: Optional[list[str]], local_jdata: dict, - ) -> Tuple[dict, Optional[float]]: + ) -> tuple[dict, Optional[float]]: """Update the selection and perform neighbor statistics. Parameters diff --git a/deepmd/dpmodel/model/make_model.py b/deepmd/dpmodel/model/make_model.py index ee4c1f035a..8cdb7e1f25 100644 --- a/deepmd/dpmodel/model/make_model.py +++ b/deepmd/dpmodel/model/make_model.py @@ -1,10 +1,6 @@ # SPDX-License-Identifier: LGPL-3.0-or-later from typing import ( - Dict, - List, Optional, - Tuple, - Type, ) import numpy as np @@ -42,7 +38,7 @@ ) -def make_model(T_AtomicModel: Type[BaseAtomicModel]): +def make_model(T_AtomicModel: type[BaseAtomicModel]): """Make a model as a derived class of an atomic model. The model provide two interfaces. @@ -87,7 +83,7 @@ def model_output_def(self): """Get the output def for the model.""" return ModelOutputDef(self.atomic_output_def()) - def model_output_type(self) -> List[str]: + def model_output_type(self) -> list[str]: """Get the output type for the model.""" output_def = self.model_output_def() var_defs = output_def.var_defs @@ -106,7 +102,7 @@ def call( fparam: Optional[np.ndarray] = None, aparam: Optional[np.ndarray] = None, do_atomic_virial: bool = False, - ) -> Dict[str, np.ndarray]: + ) -> dict[str, np.ndarray]: """Return model prediction. Parameters @@ -128,7 +124,7 @@ def call( Returns ------- ret_dict - The result dict of type Dict[str,np.ndarray]. + The result dict of type dict[str,np.ndarray]. The keys are defined by the `ModelOutputDef`. 
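[editor's note] `input_type_cast` in `make_model.py` below casts the optional side inputs (box, fparam, aparam) to the coordinate dtype so the whole forward pass runs in one precision. A self-contained sketch of that cast, reduced to two optional inputs:

```python
from typing import Optional

import numpy as np


def input_type_cast(
    coord: np.ndarray,
    box: Optional[np.ndarray] = None,
    fparam: Optional[np.ndarray] = None,
):
    # cast each optional input to the coordinate precision, preserving None
    _lst = [v.astype(coord.dtype) if v is not None else None for v in (box, fparam)]
    return (coord, *_lst)


coord, box, fparam = input_type_cast(
    np.zeros(3, dtype=np.float32), box=np.zeros(9), fparam=None
)
print(box.dtype)  # float32
```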
""" @@ -249,7 +245,7 @@ def input_type_cast( box: Optional[np.ndarray] = None, fparam: Optional[np.ndarray] = None, aparam: Optional[np.ndarray] = None, - ) -> Tuple[ + ) -> tuple[ np.ndarray, Optional[np.ndarray], Optional[np.ndarray], @@ -263,7 +259,7 @@ def input_type_cast( ### ### type checking would not pass jit, convert to coord prec anyway ### - _lst: List[Optional[np.ndarray]] = [ + _lst: list[Optional[np.ndarray]] = [ vv.astype(coord.dtype) if vv is not None else None for vv in [box, fparam, aparam] ] @@ -285,9 +281,9 @@ def input_type_cast( def output_type_cast( self, - model_ret: Dict[str, np.ndarray], + model_ret: dict[str, np.ndarray], input_prec: str, - ) -> Dict[str, np.ndarray]: + ) -> dict[str, np.ndarray]: """Convert the model output to the input prec.""" do_cast = ( input_prec @@ -427,7 +423,7 @@ def do_grad_c( return self.atomic_model.do_grad_c(var_name) def change_type_map( - self, type_map: List[str], model_with_new_type_stat=None + self, type_map: list[str], model_with_new_type_stat=None ) -> None: """Change the type related params to new ones, according to `type_map` and the original one in the model. If there are new types in `type_map`, statistics will be updated accordingly to `model_with_new_type_stat` for these new types. @@ -449,7 +445,7 @@ def get_dim_aparam(self) -> int: """Get the number (dimension) of atomic parameters of this atomic model.""" return self.atomic_model.get_dim_aparam() - def get_sel_type(self) -> List[int]: + def get_sel_type(self) -> list[int]: """Get the selected atom types of this model. Only atoms with selected atom types have atomic contribution @@ -469,7 +465,7 @@ def get_rcut(self) -> float: """Get the cut-off radius.""" return self.atomic_model.get_rcut() - def get_type_map(self) -> List[str]: + def get_type_map(self) -> list[str]: """Get the type map.""" return self.atomic_model.get_type_map() @@ -481,7 +477,7 @@ def get_nnei(self) -> int: """Returns the total number of selected neighboring atoms in the cut-off radius.""" return self.atomic_model.get_nnei() - def get_sel(self) -> List[int]: + def get_sel(self) -> list[int]: """Returns the number of selected atoms for each type.""" return self.atomic_model.get_sel() diff --git a/deepmd/dpmodel/model/spin_model.py b/deepmd/dpmodel/model/spin_model.py index d9c96a979e..b0801fe59e 100644 --- a/deepmd/dpmodel/model/spin_model.py +++ b/deepmd/dpmodel/model/spin_model.py @@ -1,7 +1,5 @@ # SPDX-License-Identifier: LGPL-3.0-or-later from typing import ( - Dict, - List, Optional, ) @@ -222,7 +220,7 @@ def expand_aparam(aparam, nloc: int): ) return aparam - def get_type_map(self) -> List[str]: + def get_type_map(self) -> list[str]: """Get the type map.""" tmap = self.backbone_model.get_type_map() ntypes = len(tmap) // 2 # ignore the virtual type @@ -244,7 +242,7 @@ def get_dim_aparam(self): """Get the number (dimension) of atomic parameters of this atomic model.""" return self.backbone_model.get_dim_aparam() - def get_sel_type(self) -> List[int]: + def get_sel_type(self) -> list[int]: """Get the selected atom types of this model. Only atoms with selected atom types have atomic contribution to the result of the model. 
@@ -258,7 +256,7 @@ def is_aparam_nall(self) -> bool: """ return self.backbone_model.is_aparam_nall() - def model_output_type(self) -> List[str]: + def model_output_type(self) -> list[str]: """Get the output type for the model.""" return self.backbone_model.model_output_type() @@ -333,7 +331,7 @@ def call( fparam: Optional[np.ndarray] = None, aparam: Optional[np.ndarray] = None, do_atomic_virial: bool = False, - ) -> Dict[str, np.ndarray]: + ) -> dict[str, np.ndarray]: """Return model prediction. Parameters @@ -358,7 +356,7 @@ def call( Returns ------- ret_dict - The result dict of type Dict[str,np.ndarray]. + The result dict of type dict[str,np.ndarray]. The keys are defined by the `ModelOutputDef`. """ diff --git a/deepmd/dpmodel/model/transform_output.py b/deepmd/dpmodel/model/transform_output.py index 67fb016389..43c275b1be 100644 --- a/deepmd/dpmodel/model/transform_output.py +++ b/deepmd/dpmodel/model/transform_output.py @@ -1,7 +1,4 @@ # SPDX-License-Identifier: LGPL-3.0-or-later -from typing import ( - Dict, -) import numpy as np @@ -17,11 +14,11 @@ def fit_output_to_model_output( - fit_ret: Dict[str, np.ndarray], + fit_ret: dict[str, np.ndarray], fit_output_def: FittingOutputDef, coord_ext: np.ndarray, do_atomic_virial: bool = False, -) -> Dict[str, np.ndarray]: +) -> dict[str, np.ndarray]: """Transform the output of the fitting network to the model output. @@ -49,11 +46,11 @@ def fit_output_to_model_output( def communicate_extended_output( - model_ret: Dict[str, np.ndarray], + model_ret: dict[str, np.ndarray], model_output_def: ModelOutputDef, mapping: np.ndarray, # nf x nloc do_atomic_virial: bool = False, -) -> Dict[str, np.ndarray]: +) -> dict[str, np.ndarray]: """Transform the output of the model network defined on local and ghost (extended) atoms to local atoms. 
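For orientation before the `output_def.py` hunks below: the reduced and derivative outputs are keyed by a fixed suffix convention, encoded in small helpers that the diff only re-annotates with builtin generics. Reproduced here with a usage check:

```python
def get_reduce_name(name: str) -> str:
    return name + "_redu"

def get_deriv_name(name: str) -> tuple[str, str]:
    return name + "_derv_r", name + "_derv_c"

def get_deriv_name_mag(name: str) -> tuple[str, str]:
    return name + "_derv_r_mag", name + "_derv_c_mag"

# e.g. the reduced energy and its derivatives w.r.t. coordinates and cell:
assert get_reduce_name("energy") == "energy_redu"
assert get_deriv_name("energy") == ("energy_derv_r", "energy_derv_c")
```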
diff --git a/deepmd/dpmodel/output_def.py b/deepmd/dpmodel/output_def.py index d55ea3988d..2ceb4f412a 100644 --- a/deepmd/dpmodel/output_def.py +++ b/deepmd/dpmodel/output_def.py @@ -3,16 +3,11 @@ from enum import ( IntEnum, ) -from typing import ( - Dict, - List, - Tuple, -) def check_shape( - shape: List[int], - def_shape: List[int], + shape: list[int], + def_shape: list[int], ): """Check if the shape satisfies the defined shape.""" assert len(shape) == len(def_shape) @@ -193,7 +188,7 @@ class OutputVariableDef: def __init__( self, name: str, - shape: List[int], + shape: list[int], reducible: bool = False, r_differentiable: bool = False, c_differentiable: bool = False, @@ -256,7 +251,7 @@ class FittingOutputDef: def __init__( self, - var_defs: List[OutputVariableDef], + var_defs: list[OutputVariableDef], ): self.var_defs = {vv.name: vv for vv in var_defs} @@ -266,7 +261,7 @@ def __getitem__( ) -> OutputVariableDef: return self.var_defs[key] - def get_data(self) -> Dict[str, OutputVariableDef]: + def get_data(self) -> dict[str, OutputVariableDef]: return self.var_defs def keys(self): @@ -298,7 +293,7 @@ def __init__( self.def_hess_r, _ = do_derivative(self.def_derv_r) self.def_derv_c_redu = do_reduce(self.def_derv_c) self.def_mask = do_mask(self.def_outp.get_data()) - self.var_defs: Dict[str, OutputVariableDef] = {} + self.var_defs: dict[str, OutputVariableDef] = {} for ii in [ self.def_outp.get_data(), self.def_redu, @@ -318,7 +313,7 @@ def __getitem__( def get_data( self, - ) -> Dict[str, OutputVariableDef]: + ) -> dict[str, OutputVariableDef]: return self.var_defs def keys(self): @@ -347,11 +342,11 @@ def get_reduce_name(name: str) -> str: return name + "_redu" -def get_deriv_name(name: str) -> Tuple[str, str]: +def get_deriv_name(name: str) -> tuple[str, str]: return name + "_derv_r", name + "_derv_c" -def get_deriv_name_mag(name: str) -> Tuple[str, str]: +def get_deriv_name_mag(name: str) -> tuple[str, str]: return name + "_derv_r_mag", name + "_derv_c_mag" @@ -424,9 +419,9 @@ def check_deriv(var_def: OutputVariableDef) -> bool: def do_reduce( - def_outp_data: Dict[str, OutputVariableDef], -) -> Dict[str, OutputVariableDef]: - def_redu: Dict[str, OutputVariableDef] = {} + def_outp_data: dict[str, OutputVariableDef], +) -> dict[str, OutputVariableDef]: + def_redu: dict[str, OutputVariableDef] = {} for kk, vv in def_outp_data.items(): if vv.reducible: rk = get_reduce_name(kk) @@ -443,9 +438,9 @@ def do_reduce( def do_mask( - def_outp_data: Dict[str, OutputVariableDef], -) -> Dict[str, OutputVariableDef]: - def_mask: Dict[str, OutputVariableDef] = {} + def_outp_data: dict[str, OutputVariableDef], +) -> dict[str, OutputVariableDef]: + def_mask: dict[str, OutputVariableDef] = {} # for deep eval when has atomic mask def_mask["mask"] = OutputVariableDef( name="mask", @@ -468,10 +463,10 @@ def do_mask( def do_derivative( - def_outp_data: Dict[str, OutputVariableDef], -) -> Tuple[Dict[str, OutputVariableDef], Dict[str, OutputVariableDef]]: - def_derv_r: Dict[str, OutputVariableDef] = {} - def_derv_c: Dict[str, OutputVariableDef] = {} + def_outp_data: dict[str, OutputVariableDef], +) -> tuple[dict[str, OutputVariableDef], dict[str, OutputVariableDef]]: + def_derv_r: dict[str, OutputVariableDef] = {} + def_derv_c: dict[str, OutputVariableDef] = {} for kk, vv in def_outp_data.items(): rkr, rkc = get_deriv_name(kk) rkrm, rkcm = get_deriv_name_mag(kk) diff --git a/deepmd/dpmodel/utils/env_mat.py b/deepmd/dpmodel/utils/env_mat.py index 41f2591279..f4bc333a03 100644 --- 
a/deepmd/dpmodel/utils/env_mat.py +++ b/deepmd/dpmodel/utils/env_mat.py @@ -12,6 +12,7 @@ ) from deepmd.dpmodel.array_api import ( support_array_api, + xp_take_along_axis, ) @@ -44,33 +45,34 @@ def _make_env_mat( protection: float = 0.0, ): """Make smooth environment matrix.""" + xp = array_api_compat.array_namespace(nlist) nf, nloc, nnei = nlist.shape # nf x nall x 3 - coord = coord.reshape(nf, -1, 3) + coord = xp.reshape(coord, (nf, -1, 3)) mask = nlist >= 0 - nlist = nlist * mask + nlist = nlist * xp.astype(mask, nlist.dtype) # nf x (nloc x nnei) x 3 - index = np.tile(nlist.reshape(nf, -1, 1), (1, 1, 3)) - coord_r = np.take_along_axis(coord, index, 1) + index = xp.tile(xp.reshape(nlist, (nf, -1, 1)), (1, 1, 3)) + coord_r = xp_take_along_axis(coord, index, 1) # nf x nloc x nnei x 3 - coord_r = coord_r.reshape(nf, nloc, nnei, 3) + coord_r = xp.reshape(coord_r, (nf, nloc, nnei, 3)) # nf x nloc x 1 x 3 - coord_l = coord[:, :nloc].reshape(nf, -1, 1, 3) + coord_l = xp.reshape(coord[:, :nloc, ...], (nf, -1, 1, 3)) # nf x nloc x nnei x 3 diff = coord_r - coord_l # nf x nloc x nnei - length = np.linalg.norm(diff, axis=-1, keepdims=True) + length = xp.linalg.vector_norm(diff, axis=-1, keepdims=True) # for index 0 nloc atom - length = length + ~np.expand_dims(mask, -1) + length = length + xp.astype(~xp.expand_dims(mask, axis=-1), length.dtype) t0 = 1 / (length + protection) t1 = diff / (length + protection) ** 2 weight = compute_smooth_weight(length, ruct_smth, rcut) - weight = weight * np.expand_dims(mask, -1) + weight = weight * xp.astype(xp.expand_dims(mask, axis=-1), weight.dtype) if radial_only: env_mat = t0 * weight else: - env_mat = np.concatenate([t0, t1], axis=-1) * weight - return env_mat, diff * np.expand_dims(mask, -1), weight + env_mat = xp.concat([t0, t1], axis=-1) * weight + return env_mat, diff * xp.astype(xp.expand_dims(mask, axis=-1), diff.dtype), weight class EnvMat(NativeOP): @@ -122,13 +124,14 @@ def call( switch The value of switch function. 
shape: nf x nloc x nnei """ + xp = array_api_compat.array_namespace(coord_ext, atype_ext, nlist) em, diff, sw = self._call(nlist, coord_ext, radial_only) nf, nloc, nnei = nlist.shape atype = atype_ext[:, :nloc] if davg is not None: - em -= davg[atype] + em -= xp.reshape(xp.take(davg, xp.reshape(atype, (-1,)), axis=0), em.shape) if dstd is not None: - em /= dstd[atype] + em /= xp.reshape(xp.take(dstd, xp.reshape(atype, (-1,)), axis=0), em.shape) return em, diff, sw def _call(self, nlist, coord_ext, radial_only): diff --git a/deepmd/dpmodel/utils/exclude_mask.py b/deepmd/dpmodel/utils/exclude_mask.py index ff668b8153..b09a9b3e47 100644 --- a/deepmd/dpmodel/utils/exclude_mask.py +++ b/deepmd/dpmodel/utils/exclude_mask.py @@ -1,11 +1,12 @@ # SPDX-License-Identifier: LGPL-3.0-or-later -from typing import ( - List, - Tuple, -) +import array_api_compat import numpy as np +from deepmd.dpmodel.array_api import ( + xp_take_along_axis, +) + class AtomExcludeMask: """Computes the type exclusion mask for atoms.""" @@ -13,16 +14,16 @@ class AtomExcludeMask: def __init__( self, ntypes: int, - exclude_types: List[int] = [], + exclude_types: list[int] = [], ): self.ntypes = ntypes self.exclude_types = exclude_types - self.type_mask = np.array( + type_mask = np.array( [1 if tt_i not in self.exclude_types else 0 for tt_i in range(ntypes)], dtype=np.int32, ) # (ntypes) - self.type_mask = self.type_mask.reshape([-1]) + self.type_mask = type_mask.reshape([-1]) def get_exclude_types(self): return self.exclude_types @@ -49,8 +50,11 @@ def build_type_exclude_mask( otherwise being 1. """ + xp = array_api_compat.array_namespace(atype) nf, natom = atype.shape - return self.type_mask[atype].reshape(nf, natom) + return xp.reshape( + xp.take(self.type_mask, xp.reshape(atype, [-1]), axis=0), (nf, natom) + ) class PairExcludeMask: @@ -59,7 +63,7 @@ class PairExcludeMask: def __init__( self, ntypes: int, - exclude_types: List[Tuple[int, int]] = [], + exclude_types: list[tuple[int, int]] = [], ): self.ntypes = ntypes self.exclude_types = set() @@ -68,7 +72,7 @@ def __init__( self.exclude_types.add((tt[0], tt[1])) self.exclude_types.add((tt[1], tt[0])) # ntypes + 1 for nlist masks - self.type_mask = np.array( + type_mask = np.array( [ [ 1 if (tt_i, tt_j) not in self.exclude_types else 0 @@ -79,7 +83,7 @@ def __init__( dtype=np.int32, ) # (ntypes+1 x ntypes+1) - self.type_mask = self.type_mask.reshape([-1]) + self.type_mask = type_mask.reshape([-1]) def get_exclude_types(self): return self.exclude_types @@ -106,23 +110,29 @@ def build_type_exclude_mask( otherwise being 1. """ + xp = array_api_compat.array_namespace(nlist, atype_ext) if len(self.exclude_types) == 0: # safely return 1 if nothing is excluded. - return np.ones_like(nlist, dtype=np.int32) + return xp.ones_like(nlist, dtype=xp.int32) nf, nloc, nnei = nlist.shape nall = atype_ext.shape[1] # add virtual atom of type ntypes. 
nf x nall+1 - ae = np.concatenate( - [atype_ext, self.ntypes * np.ones([nf, 1], dtype=atype_ext.dtype)], axis=-1 + ae = xp.concat( + [atype_ext, self.ntypes * xp.ones([nf, 1], dtype=atype_ext.dtype)], axis=-1 ) - type_i = atype_ext[:, :nloc].reshape(nf, nloc) * (self.ntypes + 1) + type_i = xp.reshape(atype_ext[:, :nloc], (nf, nloc)) * (self.ntypes + 1) # nf x nloc x nnei - index = np.where(nlist == -1, nall, nlist).reshape(nf, nloc * nnei) - type_j = np.take_along_axis(ae, index, axis=1).reshape(nf, nloc, nnei) + index = xp.reshape( + xp.where(nlist == -1, xp.full_like(nlist, nall), nlist), (nf, nloc * nnei) + ) + type_j = xp_take_along_axis(ae, index, axis=1) + type_j = xp.reshape(type_j, (nf, nloc, nnei)) type_ij = type_i[:, :, None] + type_j # nf x (nloc x nnei) - type_ij = type_ij.reshape(nf, nloc * nnei) - mask = self.type_mask[type_ij].reshape(nf, nloc, nnei) + type_ij = xp.reshape(type_ij, (nf, nloc * nnei)) + mask = xp.reshape( + xp.take(self.type_mask, xp.reshape(type_ij, (-1,))), (nf, nloc, nnei) + ) return mask def __contains__(self, item): diff --git a/deepmd/dpmodel/utils/neighbor_stat.py b/deepmd/dpmodel/utils/neighbor_stat.py index 96b39d20ad..744a4476cd 100644 --- a/deepmd/dpmodel/utils/neighbor_stat.py +++ b/deepmd/dpmodel/utils/neighbor_stat.py @@ -1,8 +1,9 @@ # SPDX-License-Identifier: LGPL-3.0-or-later -from typing import ( +from collections.abc import ( Iterator, +) +from typing import ( Optional, - Tuple, ) import numpy as np @@ -47,7 +48,7 @@ def call( coord: np.ndarray, atype: np.ndarray, cell: Optional[np.ndarray], - ) -> Tuple[float, np.ndarray]: + ) -> tuple[float, np.ndarray]: """Calculate the neareest neighbor distance between atoms, maximum nbor size of atoms and the output data range of the environment matrix. @@ -130,7 +131,7 @@ def __init__( def iterator( self, data: DeepmdDataSystem - ) -> Iterator[Tuple[np.ndarray, float, str]]: + ) -> Iterator[tuple[np.ndarray, float, str]]: """Abstract method for producing data. 
Yields diff --git a/deepmd/dpmodel/utils/network.py b/deepmd/dpmodel/utils/network.py index 22e85c9890..339035ff4e 100644 --- a/deepmd/dpmodel/utils/network.py +++ b/deepmd/dpmodel/utils/network.py @@ -9,8 +9,6 @@ from typing import ( Callable, ClassVar, - Dict, - List, Optional, Union, ) @@ -86,7 +84,7 @@ def __init__( activation_function: Optional[str] = None, resnet: bool = False, precision: str = DEFAULT_PRECISION, - seed: Optional[Union[int, List[int]]] = None, + seed: Optional[Union[int, list[int]]] = None, ) -> None: prec = PRECISION_DICT[precision.lower()] self.precision = precision @@ -148,15 +146,18 @@ def deserialize(cls, data: dict) -> "NativeLayer": num_out, **data, ) - obj.w, obj.b, obj.idt = ( + w, b, idt = ( variables["w"], variables.get("b", None), variables.get("idt", None), ) - if obj.b is not None: - obj.b = obj.b.ravel() - if obj.idt is not None: - obj.idt = obj.idt.ravel() + if b is not None: + b = b.ravel() + if idt is not None: + idt = idt.ravel() + obj.w = w + obj.b = b + obj.idt = idt obj.check_shape_consistency() return obj @@ -177,8 +178,11 @@ def check_type_consistency(self): def check_var(var): if var is not None: + # array api standard doesn't provide a API to get the dtype name + # this is really hacked + dtype_name = str(var.dtype).split(".")[-1] # assertion "float64" == "double" would fail - assert PRECISION_DICT[var.dtype.name] is PRECISION_DICT[precision] + assert PRECISION_DICT[dtype_name] is PRECISION_DICT[precision] check_var(self.w) check_var(self.b) @@ -251,7 +255,7 @@ def call(self, x: np.ndarray) -> np.ndarray: if self.resnet and self.w.shape[1] == self.w.shape[0]: y += x elif self.resnet and self.w.shape[1] == 2 * self.w.shape[0]: - y += xp.concatenate([x, x], axis=-1) + y += xp.concat([x, x], axis=-1) return y @@ -347,7 +351,7 @@ def __init__( uni_init: bool = True, trainable: bool = True, precision: str = DEFAULT_PRECISION, - seed: Optional[Union[int, List[int]]] = None, + seed: Optional[Union[int, list[int]]] = None, ) -> None: self.eps = eps self.uni_init = uni_init @@ -362,10 +366,11 @@ def __init__( precision=precision, seed=seed, ) - self.w = self.w.squeeze(0) # keep the weight shape to be [num_in] + xp = array_api_compat.array_namespace(self.w, self.b) + self.w = xp.squeeze(self.w, 0) # keep the weight shape to be [num_in] if self.uni_init: - self.w = np.ones_like(self.w) - self.b = np.zeros_like(self.b) + self.w = xp.ones_like(self.w) + self.b = xp.zeros_like(self.b) # only to keep consistent with other backends self.trainable = trainable @@ -378,8 +383,8 @@ def serialize(self) -> dict: The serialized layer. """ data = { - "w": self.w, - "b": self.b, + "w": to_numpy_array(self.w), + "b": to_numpy_array(self.b), } return { "@class": "LayerNorm", @@ -473,11 +478,12 @@ def call(self, x: np.ndarray) -> np.ndarray: @staticmethod def layer_norm_numpy(x, shape, weight=None, bias=None, eps=1e-5): + xp = array_api_compat.array_namespace(x) # mean and variance - mean = np.mean(x, axis=tuple(range(-len(shape), 0)), keepdims=True) - var = np.var(x, axis=tuple(range(-len(shape), 0)), keepdims=True) + mean = xp.mean(x, axis=tuple(range(-len(shape), 0)), keepdims=True) + var = xp.var(x, axis=tuple(range(-len(shape), 0)), keepdims=True) # normalize - x_normalized = (x - mean) / np.sqrt(var + eps) + x_normalized = (x - mean) / xp.sqrt(var + eps) # shift and scale if weight is not None and bias is not None: x_normalized = x_normalized * weight + bias @@ -494,7 +500,7 @@ class NN(ModuleBase): The layers of the network. 
""" - def __init__(self, layers: Optional[List[dict]] = None) -> None: + def __init__(self, layers: Optional[list[dict]] = None) -> None: super().__init__() if layers is None: layers = [] @@ -604,11 +610,11 @@ class EN(T_Network): def __init__( self, in_dim, - neuron: List[int] = [24, 48, 96], + neuron: list[int] = [24, 48, 96], activation_function: str = "tanh", resnet_dt: bool = False, precision: str = DEFAULT_PRECISION, - seed: Optional[Union[int, List[int]]] = None, + seed: Optional[Union[int, list[int]]] = None, bias: bool = True, ): layers = [] @@ -709,12 +715,12 @@ def __init__( self, in_dim, out_dim, - neuron: List[int] = [24, 48, 96], + neuron: list[int] = [24, 48, 96], activation_function: str = "tanh", resnet_dt: bool = False, precision: str = DEFAULT_PRECISION, bias_out: bool = True, - seed: Optional[Union[int, List[int]]] = None, + seed: Optional[Union[int, list[int]]] = None, ): super().__init__( in_dim, @@ -804,7 +810,7 @@ class NetworkCollection: """ # subclass may override this - NETWORK_TYPE_MAP: ClassVar[Dict[str, type]] = { + NETWORK_TYPE_MAP: ClassVar[dict[str, type]] = { "network": NativeNet, "embedding_network": EmbeddingNet, "fitting_network": FittingNet, @@ -815,7 +821,7 @@ def __init__( ndim: int, ntypes: int, network_type: str = "network", - networks: List[Union[NativeNet, dict]] = [], + networks: list[Union[NativeNet, dict]] = [], ): self.ndim = ndim self.ntypes = ntypes diff --git a/deepmd/dpmodel/utils/nlist.py b/deepmd/dpmodel/utils/nlist.py index c935377e6a..4806fa4cd8 100644 --- a/deepmd/dpmodel/utils/nlist.py +++ b/deepmd/dpmodel/utils/nlist.py @@ -1,13 +1,16 @@ # SPDX-License-Identifier: LGPL-3.0-or-later from typing import ( - Dict, - List, Optional, Union, ) +import array_api_compat import numpy as np +from deepmd.dpmodel.array_api import ( + xp_take_along_axis, +) + from .region import ( normalize_coord, to_face_distance, @@ -18,7 +21,7 @@ def extend_input_and_build_neighbor_list( coord, atype, rcut: float, - sel: List[int], + sel: list[int], mixed_types: bool = False, box: Optional[np.ndarray] = None, ): @@ -51,7 +54,7 @@ def build_neighbor_list( atype: np.ndarray, nloc: int, rcut: float, - sel: Union[int, List[int]], + sel: Union[int, list[int]], distinguish_types: bool = True, ) -> np.ndarray: """Build neightbor list for a single frame. keeps nsel neighbors. @@ -67,7 +70,7 @@ def build_neighbor_list( number of local atoms. rcut : float cut-off radius - sel : int or List[int] + sel : int or list[int] maximal number of neighbors (of each type). if distinguish_types==True, nsel should be list and the length of nsel should be equal to number of @@ -90,34 +93,36 @@ def build_neighbor_list( For virtual atoms all neighboring positions are filled with -1. """ + xp = array_api_compat.array_namespace(coord, atype) batch_size = coord.shape[0] - coord = coord.reshape(batch_size, -1) + coord = xp.reshape(coord, (batch_size, -1)) nall = coord.shape[1] // 3 # fill virtual atoms with large coords so they are not neighbors of any # real atom. 
if coord.size > 0: - xmax = np.max(coord) + 2.0 * rcut + xmax = xp.max(coord) + 2.0 * rcut else: xmax = 2.0 * rcut # nf x nall is_vir = atype < 0 - coord1 = np.where( - is_vir[:, :, None], xmax, coord.reshape(batch_size, nall, 3) - ).reshape(batch_size, nall * 3) + coord1 = xp.where( + is_vir[:, :, None], xmax, xp.reshape(coord, (batch_size, nall, 3)) + ) + coord1 = xp.reshape(coord1, (batch_size, nall * 3)) if isinstance(sel, int): sel = [sel] nsel = sum(sel) coord0 = coord1[:, : nloc * 3] diff = ( - coord1.reshape([batch_size, -1, 3])[:, None, :, :] - - coord0.reshape([batch_size, -1, 3])[:, :, None, :] + xp.reshape(coord1, [batch_size, -1, 3])[:, None, :, :] + - xp.reshape(coord0, [batch_size, -1, 3])[:, :, None, :] ) assert list(diff.shape) == [batch_size, nloc, nall, 3] - rr = np.linalg.norm(diff, axis=-1) + rr = xp.linalg.vector_norm(diff, axis=-1) # if central atom has two zero distances, sorting sometimes can not exclude itself - rr -= np.eye(nloc, nall, dtype=diff.dtype)[np.newaxis, :, :] - nlist = np.argsort(rr, axis=-1) - rr = np.sort(rr, axis=-1) + rr -= xp.eye(nloc, nall, dtype=diff.dtype)[xp.newaxis, :, :] + nlist = xp.argsort(rr, axis=-1) + rr = xp.sort(rr, axis=-1) rr = rr[:, :, 1:] nlist = nlist[:, :, 1:] nnei = rr.shape[2] @@ -125,16 +130,20 @@ def build_neighbor_list( rr = rr[:, :, :nsel] nlist = nlist[:, :, :nsel] else: - rr = np.concatenate( - [rr, np.ones([batch_size, nloc, nsel - nnei]) + rcut], # pylint: disable=no-explicit-dtype + rr = xp.concatenate( + [rr, xp.ones([batch_size, nloc, nsel - nnei]) + rcut], # pylint: disable=no-explicit-dtype axis=-1, ) - nlist = np.concatenate( - [nlist, np.ones([batch_size, nloc, nsel - nnei], dtype=nlist.dtype)], + nlist = xp.concatenate( + [nlist, xp.ones([batch_size, nloc, nsel - nnei], dtype=nlist.dtype)], axis=-1, ) assert list(nlist.shape) == [batch_size, nloc, nsel] - nlist = np.where(np.logical_or((rr > rcut), is_vir[:, :nloc, None]), -1, nlist) + nlist = xp.where( + xp.logical_or((rr > rcut), is_vir[:, :nloc, None]), + xp.full_like(nlist, -1), + nlist, + ) if distinguish_types: return nlist_distinguish_types(nlist, atype, sel) @@ -145,29 +154,30 @@ def build_neighbor_list( def nlist_distinguish_types( nlist: np.ndarray, atype: np.ndarray, - sel: List[int], + sel: list[int], ): """Given a nlist that does not distinguish atom types, return a nlist that distinguish atom types. 
""" + xp = array_api_compat.array_namespace(nlist, atype) nf, nloc, _ = nlist.shape ret_nlist = [] - tmp_atype = np.tile(atype[:, None], [1, nloc, 1]) + tmp_atype = xp.tile(atype[:, None], [1, nloc, 1]) mask = nlist == -1 tnlist_0 = nlist.copy() tnlist_0[mask] = 0 - tnlist = np.take_along_axis(tmp_atype, tnlist_0, axis=2).squeeze() - tnlist = np.where(mask, -1, tnlist) + tnlist = xp_take_along_axis(tmp_atype, tnlist_0, axis=2).squeeze() + tnlist = xp.where(mask, -1, tnlist) snsel = tnlist.shape[2] for ii, ss in enumerate(sel): - pick_mask = (tnlist == ii).astype(np.int32) - sorted_indices = np.argsort(-pick_mask, kind="stable", axis=-1) - pick_mask_sorted = -np.sort(-pick_mask, axis=-1) - inlist = np.take_along_axis(nlist, sorted_indices, axis=2) - inlist = np.where(~pick_mask_sorted.astype(bool), -1, inlist) - ret_nlist.append(np.split(inlist, [ss, snsel - ss], axis=-1)[0]) - ret = np.concatenate(ret_nlist, axis=-1) + pick_mask = (tnlist == ii).astype(xp.int32) + sorted_indices = xp.argsort(-pick_mask, kind="stable", axis=-1) + pick_mask_sorted = -xp.sort(-pick_mask, axis=-1) + inlist = xp_take_along_axis(nlist, sorted_indices, axis=2) + inlist = xp.where(~pick_mask_sorted.astype(bool), -1, inlist) + ret_nlist.append(xp.split(inlist, [ss, snsel - ss], axis=-1)[0]) + ret = xp.concat(ret_nlist, axis=-1) return ret @@ -179,9 +189,9 @@ def get_multiple_nlist_key(rcut: float, nsel: int) -> str: def build_multiple_neighbor_list( coord: np.ndarray, nlist: np.ndarray, - rcuts: List[float], - nsels: List[int], -) -> Dict[str, np.ndarray]: + rcuts: list[float], + nsels: list[int], +) -> dict[str, np.ndarray]: """Input one neighbor list, and produce multiple neighbor lists with different cutoff radius and numbers of selection out of it. The required rcuts and nsels should be smaller or equal to the input nlist. @@ -193,14 +203,14 @@ def build_multiple_neighbor_list( nlist : np.ndarray Neighbor list of shape [batch_size, nloc, nsel], the neighbors should be stored in an ascending order. - rcuts : List[float] + rcuts : list[float] list of cut-off radius in ascending order. - nsels : List[int] + nsels : list[int] maximal number of neighbors in ascending order. Returns ------- - nlist_dict : Dict[str, np.ndarray] + nlist_dict : dict[str, np.ndarray] A dict of nlists, key given by get_multiple_nlist_key(rc, nsel) value being the corresponding nlist. 
@@ -265,36 +275,46 @@ def extend_coord_with_ghosts( maping extended index to the local index """ + xp = array_api_compat.array_namespace(coord, atype) nf, nloc = atype.shape - aidx = np.tile(np.arange(nloc)[np.newaxis, :], (nf, 1)) # pylint: disable=no-explicit-dtype + aidx = xp.tile(xp.arange(nloc)[xp.newaxis, :], (nf, 1)) # pylint: disable=no-explicit-dtype if cell is None: nall = nloc - extend_coord = coord.copy() - extend_atype = atype.copy() - extend_aidx = aidx.copy() + extend_coord = coord + extend_atype = atype + extend_aidx = aidx else: - coord = coord.reshape((nf, nloc, 3)) - cell = cell.reshape((nf, 3, 3)) + coord = xp.reshape(coord, (nf, nloc, 3)) + cell = xp.reshape(cell, (nf, 3, 3)) to_face = to_face_distance(cell) - nbuff = np.ceil(rcut / to_face).astype(int) - nbuff = np.max(nbuff, axis=0) - xi = np.arange(-nbuff[0], nbuff[0] + 1, 1) # pylint: disable=no-explicit-dtype - yi = np.arange(-nbuff[1], nbuff[1] + 1, 1) # pylint: disable=no-explicit-dtype - zi = np.arange(-nbuff[2], nbuff[2] + 1, 1) # pylint: disable=no-explicit-dtype - xyz = np.outer(xi, np.array([1, 0, 0]))[:, np.newaxis, np.newaxis, :] - xyz = xyz + np.outer(yi, np.array([0, 1, 0]))[np.newaxis, :, np.newaxis, :] - xyz = xyz + np.outer(zi, np.array([0, 0, 1]))[np.newaxis, np.newaxis, :, :] - xyz = xyz.reshape(-1, 3) - shift_idx = xyz[np.argsort(np.linalg.norm(xyz, axis=1))] + nbuff = xp.astype(xp.ceil(rcut / to_face), xp.int64) + nbuff = xp.max(nbuff, axis=0) + xi = xp.arange(-int(nbuff[0]), int(nbuff[0]) + 1, 1) # pylint: disable=no-explicit-dtype + yi = xp.arange(-int(nbuff[1]), int(nbuff[1]) + 1, 1) # pylint: disable=no-explicit-dtype + zi = xp.arange(-int(nbuff[2]), int(nbuff[2]) + 1, 1) # pylint: disable=no-explicit-dtype + xyz = xp.linalg.outer(xi, xp.asarray([1, 0, 0]))[:, xp.newaxis, xp.newaxis, :] + xyz = ( + xyz + + xp.linalg.outer(yi, xp.asarray([0, 1, 0]))[xp.newaxis, :, xp.newaxis, :] + ) + xyz = ( + xyz + + xp.linalg.outer(zi, xp.asarray([0, 0, 1]))[xp.newaxis, xp.newaxis, :, :] + ) + xyz = xp.reshape(xyz, (-1, 3)) + xyz = xp.astype(xyz, coord.dtype) + shift_idx = xp.take(xyz, xp.argsort(xp.linalg.vector_norm(xyz, axis=1)), axis=0) ns, _ = shift_idx.shape nall = ns * nloc - shift_vec = np.einsum("sd,fdk->fsk", shift_idx, cell) + # shift_vec = xp.einsum("sd,fdk->fsk", shift_idx, cell) + shift_vec = xp.tensordot(shift_idx, cell, axes=([1], [1])) + shift_vec = xp.permute_dims(shift_vec, (1, 0, 2)) extend_coord = coord[:, None, :, :] + shift_vec[:, :, None, :] - extend_atype = np.tile(atype[:, :, np.newaxis], (1, ns, 1)) - extend_aidx = np.tile(aidx[:, :, np.newaxis], (1, ns, 1)) + extend_atype = xp.tile(atype[:, :, xp.newaxis], (1, ns, 1)) + extend_aidx = xp.tile(aidx[:, :, xp.newaxis], (1, ns, 1)) return ( - extend_coord.reshape((nf, nall * 3)), - extend_atype.reshape((nf, nall)), - extend_aidx.reshape((nf, nall)), + xp.reshape(extend_coord, (nf, nall * 3)), + xp.reshape(extend_atype, (nf, nall)), + xp.reshape(extend_aidx, (nf, nall)), ) diff --git a/deepmd/dpmodel/utils/region.py b/deepmd/dpmodel/utils/region.py index ddbc4b29b8..8102020827 100644 --- a/deepmd/dpmodel/utils/region.py +++ b/deepmd/dpmodel/utils/region.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: LGPL-3.0-or-later +import array_api_compat import numpy as np @@ -21,8 +22,9 @@ def phys2inter( the internal coordinates """ - rec_cell = np.linalg.inv(cell) - return np.matmul(coord, rec_cell) + xp = array_api_compat.array_namespace(coord, cell) + rec_cell = xp.linalg.inv(cell) + return xp.matmul(coord, rec_cell) def inter2phys( @@ -44,7 
+46,8 @@ def inter2phys( the physical coordinates """ - return np.matmul(coord, cell) + xp = array_api_compat.array_namespace(coord, cell) + return xp.matmul(coord, cell) def normalize_coord( @@ -66,8 +69,9 @@ def normalize_coord( wrapped coordinates of shape [*, na, 3]. """ + xp = array_api_compat.array_namespace(coord, cell) icoord = phys2inter(coord, cell) - icoord = np.remainder(icoord, 1.0) + icoord = xp.remainder(icoord, 1.0) return inter2phys(icoord, cell) @@ -87,17 +91,19 @@ def to_face_distance( the to face distances of shape [*, 3] """ + xp = array_api_compat.array_namespace(cell) cshape = cell.shape - dist = b_to_face_distance(cell.reshape([-1, 3, 3])) - return dist.reshape(list(cshape[:-2]) + [3]) # noqa:RUF005 + dist = b_to_face_distance(xp.reshape(cell, [-1, 3, 3])) + return xp.reshape(dist, list(cshape[:-2]) + [3]) # noqa:RUF005 def b_to_face_distance(cell): - volume = np.linalg.det(cell) - c_yz = np.cross(cell[:, 1], cell[:, 2], axis=-1) - _h2yz = volume / np.linalg.norm(c_yz, axis=-1) - c_zx = np.cross(cell[:, 2], cell[:, 0], axis=-1) - _h2zx = volume / np.linalg.norm(c_zx, axis=-1) - c_xy = np.cross(cell[:, 0], cell[:, 1], axis=-1) - _h2xy = volume / np.linalg.norm(c_xy, axis=-1) - return np.stack([_h2yz, _h2zx, _h2xy], axis=1) + xp = array_api_compat.array_namespace(cell) + volume = xp.linalg.det(cell) + c_yz = xp.linalg.cross(cell[:, 1, ...], cell[:, 2, ...], axis=-1) + _h2yz = volume / xp.linalg.vector_norm(c_yz, axis=-1) + c_zx = xp.linalg.cross(cell[:, 2, ...], cell[:, 0, ...], axis=-1) + _h2zx = volume / xp.linalg.vector_norm(c_zx, axis=-1) + c_xy = xp.linalg.cross(cell[:, 0, ...], cell[:, 1, ...], axis=-1) + _h2xy = volume / xp.linalg.vector_norm(c_xy, axis=-1) + return xp.stack([_h2yz, _h2zx, _h2xy], axis=1) diff --git a/deepmd/dpmodel/utils/seed.py b/deepmd/dpmodel/utils/seed.py index 4ceab80066..165ff558b9 100644 --- a/deepmd/dpmodel/utils/seed.py +++ b/deepmd/dpmodel/utils/seed.py @@ -1,6 +1,5 @@ # SPDX-License-Identifier: LGPL-3.0-or-later from typing import ( - List, Optional, Union, overload, @@ -12,10 +11,10 @@ def child_seed(seed: None, idx: int) -> None: ... @overload -def child_seed(seed: Union[int, List[int]], idx: int) -> List[int]: ... +def child_seed(seed: Union[int, list[int]], idx: int) -> list[int]: ... -def child_seed(seed: Optional[Union[int, List[int]]], idx: int) -> Optional[List[int]]: +def child_seed(seed: Optional[Union[int, list[int]]], idx: int) -> Optional[list[int]]: """Generate a child seed from a parent seed. Parameters @@ -27,7 +26,7 @@ def child_seed(seed: Optional[Union[int, List[int]]], idx: int) -> Optional[List Returns ------- - Optional[List[int]] + Optional[list[int]] The child seed. """ # See https://numpy.org/doc/stable/reference/random/parallel.html#sequence-of-integer-seeds diff --git a/deepmd/dpmodel/utils/type_embed.py b/deepmd/dpmodel/utils/type_embed.py index 04c05b6a39..e28b6abb31 100644 --- a/deepmd/dpmodel/utils/type_embed.py +++ b/deepmd/dpmodel/utils/type_embed.py @@ -1,6 +1,5 @@ # SPDX-License-Identifier: LGPL-3.0-or-later from typing import ( - List, Optional, Union, ) @@ -51,7 +50,7 @@ class TypeEmbedNet(NativeOP): Whether to use electronic configuration type embedding. use_tebd_bias : bool, Optional Whether to use bias in the type embedding layer. - type_map: List[str], Optional + type_map: list[str], Optional A list of strings. Give the name to each type of atoms. 
""" @@ -59,16 +58,16 @@ def __init__( self, *, ntypes: int, - neuron: List[int], + neuron: list[int], resnet_dt: bool = False, activation_function: str = "tanh", precision: str = "default", trainable: bool = True, - seed: Optional[Union[int, List[int]]] = None, + seed: Optional[Union[int, list[int]]] = None, padding: bool = False, use_econf_tebd: bool = False, use_tebd_bias: bool = False, - type_map: Optional[List[str]] = None, + type_map: Optional[list[str]] = None, ) -> None: self.ntypes = ntypes self.neuron = neuron @@ -107,7 +106,7 @@ def call(self) -> np.ndarray: embed = self.embedding_net(self.econf_tebd) if self.padding: embed_pad = xp.zeros((1, embed.shape[-1]), dtype=embed.dtype) - embed = xp.concatenate([embed, embed_pad], axis=0) + embed = xp.concat([embed, embed_pad], axis=0) return embed @classmethod @@ -162,7 +161,7 @@ def serialize(self) -> dict: } def change_type_map( - self, type_map: List[str], model_with_new_type_stat=None + self, type_map: list[str], model_with_new_type_stat=None ) -> None: """Change the type related params to new ones, according to `type_map` and the original one in the model. If there are new types in `type_map`, statistics will be updated accordingly to `model_with_new_type_stat` for these new types. diff --git a/deepmd/dpmodel/utils/update_sel.py b/deepmd/dpmodel/utils/update_sel.py index dc38a6a041..3f2900771f 100644 --- a/deepmd/dpmodel/utils/update_sel.py +++ b/deepmd/dpmodel/utils/update_sel.py @@ -1,7 +1,4 @@ # SPDX-License-Identifier: LGPL-3.0-or-later -from typing import ( - Type, -) from deepmd.dpmodel.utils.neighbor_stat import ( NeighborStat, @@ -13,5 +10,5 @@ class UpdateSel(BaseUpdateSel): @property - def neighbor_stat(self) -> Type[NeighborStat]: + def neighbor_stat(self) -> type[NeighborStat]: return NeighborStat diff --git a/deepmd/entrypoints/main.py b/deepmd/entrypoints/main.py index ba2eb90247..05f660cb9a 100644 --- a/deepmd/entrypoints/main.py +++ b/deepmd/entrypoints/main.py @@ -43,7 +43,7 @@ def main(args: argparse.Namespace): Parameters ---------- - args : List[str] or argparse.Namespace, optional + args : list[str] or argparse.Namespace, optional list of command line arguments, used to avoid calling from the subprocess, as it is quite slow to import tensorflow; if Namespace is given, it will be used directly diff --git a/deepmd/entrypoints/neighbor_stat.py b/deepmd/entrypoints/neighbor_stat.py index 8840851b91..62dceb24fd 100644 --- a/deepmd/entrypoints/neighbor_stat.py +++ b/deepmd/entrypoints/neighbor_stat.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: LGPL-3.0-or-later import logging from typing import ( - List, Optional, ) @@ -22,7 +21,7 @@ def neighbor_stat( *, system: str, rcut: float, - type_map: Optional[List[str]], + type_map: Optional[list[str]], mixed_type: bool = False, backend: str = "tensorflow", **kwargs, diff --git a/deepmd/entrypoints/show.py b/deepmd/entrypoints/show.py index 6f72c4614d..4cad5f312c 100644 --- a/deepmd/entrypoints/show.py +++ b/deepmd/entrypoints/show.py @@ -1,8 +1,5 @@ # SPDX-License-Identifier: LGPL-3.0-or-later import logging -from typing import ( - List, -) from deepmd.infer.deep_eval import ( DeepEval, @@ -14,7 +11,7 @@ def show( *, INPUT: str, - ATTRIBUTES: List[str], + ATTRIBUTES: list[str], **kwargs, ): model = DeepEval(INPUT, head=0) diff --git a/deepmd/entrypoints/test.py b/deepmd/entrypoints/test.py index 6709a9cb29..ad445fdea1 100644 --- a/deepmd/entrypoints/test.py +++ b/deepmd/entrypoints/test.py @@ -7,10 +7,7 @@ ) from typing import ( TYPE_CHECKING, - Dict, - List, Optional, - 
Tuple, ) import numpy as np @@ -266,7 +263,7 @@ def test_ener( detail_file: Optional[str], has_atom_ener: bool, append_detail: bool = False, -) -> Tuple[List[np.ndarray], List[int]]: +) -> tuple[list[np.ndarray], list[int]]: """Test energy type model. Parameters @@ -288,7 +285,7 @@ def test_ener( Returns ------- - Tuple[List[np.ndarray], List[int]] + tuple[list[np.ndarray], list[int]] arrays with results and their shapes """ data.add("energy", 1, atomic=False, must=False, high_prec=True) @@ -564,7 +561,7 @@ def test_ener( } -def print_ener_sys_avg(avg: Dict[str, float]): +def print_ener_sys_avg(avg: dict[str, float]): """Print errors summary for energy type potential. Parameters @@ -598,7 +595,7 @@ def test_dos( detail_file: Optional[str], has_atom_dos: bool, append_detail: bool = False, -) -> Tuple[List[np.ndarray], List[int]]: +) -> tuple[list[np.ndarray], list[int]]: """Test DOS type model. Parameters @@ -620,7 +617,7 @@ def test_dos( Returns ------- - Tuple[List[np.ndarray], List[int]] + tuple[list[np.ndarray], list[int]] arrays with results and their shapes """ data.add("dos", dp.numb_dos, atomic=False, must=True, high_prec=True) @@ -736,7 +733,7 @@ def test_dos( } -def print_dos_sys_avg(avg: Dict[str, float]): +def print_dos_sys_avg(avg: dict[str, float]): """Print errors summary for DOS type potential. Parameters @@ -758,7 +755,7 @@ def test_property( detail_file: Optional[str], has_atom_property: bool, append_detail: bool = False, -) -> Tuple[List[np.ndarray], List[int]]: +) -> tuple[list[np.ndarray], list[int]]: """Test Property type model. Parameters @@ -780,7 +777,7 @@ def test_property( Returns ------- - Tuple[List[np.ndarray], List[int]] + tuple[list[np.ndarray], list[int]] arrays with results and their shapes """ data.add("property", dp.task_dim, atomic=False, must=True, high_prec=True) @@ -890,7 +887,7 @@ def test_property( } -def print_property_sys_avg(avg: Dict[str, float]): +def print_property_sys_avg(avg: dict[str, float]): """Print errors summary for Property type potential. Parameters @@ -940,7 +937,7 @@ def test_wfc( data: DeepmdData, numb_test: int, detail_file: Optional[str], -) -> Tuple[List[np.ndarray], List[int]]: +) -> tuple[list[np.ndarray], list[int]]: """Test energy type model. Parameters @@ -956,7 +953,7 @@ def test_wfc( Returns ------- - Tuple[List[np.ndarray], List[int]] + tuple[list[np.ndarray], list[int]] arrays with results and their shapes """ data.add( @@ -1004,7 +1001,7 @@ def test_polar( detail_file: Optional[str], *, atomic: bool, -) -> Tuple[List[np.ndarray], List[int]]: +) -> tuple[list[np.ndarray], list[int]]: """Test energy type model. Parameters @@ -1022,7 +1019,7 @@ def test_polar( Returns ------- - Tuple[List[np.ndarray], List[int]] + tuple[list[np.ndarray], list[int]] arrays with results and their shapes """ data.add( @@ -1145,7 +1142,7 @@ def test_dipole( numb_test: int, detail_file: Optional[str], atomic: bool, -) -> Tuple[List[np.ndarray], List[int]]: +) -> tuple[list[np.ndarray], list[int]]: """Test energy type model. 
Parameters @@ -1163,7 +1160,7 @@ def test_dipole( Returns ------- - Tuple[List[np.ndarray], List[int]] + tuple[list[np.ndarray], list[int]] arrays with results and their shapes """ data.add( diff --git a/deepmd/env.py b/deepmd/env.py index 139e95b824..605dfeed99 100644 --- a/deepmd/env.py +++ b/deepmd/env.py @@ -7,10 +7,6 @@ from pathlib import ( Path, ) -from typing import ( - Dict, - Tuple, -) import numpy as np @@ -105,7 +101,7 @@ def set_default_nthreads(): set_env_if_empty("DP_INTER_OP_PARALLELISM_THREADS", "0", verbose=False) -def get_default_nthreads() -> Tuple[int, int]: +def get_default_nthreads() -> tuple[int, int]: """Get paralellism settings. The method will first read the environment variables with the prefix `DP_`. @@ -114,7 +110,7 @@ def get_default_nthreads() -> Tuple[int, int]: Returns ------- - Tuple[int, int] + tuple[int, int] number of `DP_INTRA_OP_PARALLELISM_THREADS` and `DP_INTER_OP_PARALLELISM_THREADS` """ @@ -133,7 +129,7 @@ def get_default_nthreads() -> Tuple[int, int]: def _get_package_constants( config_file: Path = CONFIG_FILE, -) -> Dict[str, str]: +) -> dict[str, str]: """Read package constants set at compile time by CMake to dictionary. Parameters @@ -143,7 +139,7 @@ def _get_package_constants( Returns ------- - Dict[str, str] + dict[str, str] dictionary with package constants """ if not config_file.is_file(): diff --git a/deepmd/infer/deep_dos.py b/deepmd/infer/deep_dos.py index b26555627f..0d7ccee2b6 100644 --- a/deepmd/infer/deep_dos.py +++ b/deepmd/infer/deep_dos.py @@ -1,9 +1,7 @@ # SPDX-License-Identifier: LGPL-3.0-or-later from typing import ( Any, - List, Optional, - Tuple, Union, ) @@ -64,13 +62,13 @@ def eval( self, coords: np.ndarray, cells: Optional[np.ndarray], - atom_types: Union[List[int], np.ndarray], + atom_types: Union[list[int], np.ndarray], atomic: bool = False, fparam: Optional[np.ndarray] = None, aparam: Optional[np.ndarray] = None, mixed_type: bool = False, **kwargs: Any, - ) -> Tuple[np.ndarray, ...]: + ) -> tuple[np.ndarray, ...]: """Evaluate energy, force, and virial. If atomic is True, also return atomic energy and atomic virial. @@ -81,7 +79,7 @@ def eval( cells : np.ndarray The cell vectors of the system, in shape (nframes, 9). If the system is not periodic, set it to None. - atom_types : List[int] or np.ndarray + atom_types : list[int] or np.ndarray The types of the atoms. If mixed_type is False, the shape is (natoms,); otherwise, the shape is (nframes, natoms). atomic : bool, optional @@ -92,7 +90,7 @@ def eval( The atomic parameters, by default None. mixed_type : bool, optional Whether the atom_types is mixed type, by default False. - **kwargs : Dict[str, Any] + **kwargs : dict[str, Any] Keyword arguments. Returns diff --git a/deepmd/infer/deep_eval.py b/deepmd/infer/deep_eval.py index f35094df3d..4d0134c37c 100644 --- a/deepmd/infer/deep_eval.py +++ b/deepmd/infer/deep_eval.py @@ -7,11 +7,7 @@ TYPE_CHECKING, Any, ClassVar, - Dict, - List, Optional, - Tuple, - Type, Union, ) @@ -111,7 +107,7 @@ def eval( fparam: Optional[np.ndarray] = None, aparam: Optional[np.ndarray] = None, **kwargs: Any, - ) -> Dict[str, np.ndarray]: + ) -> dict[str, np.ndarray]: """Evaluate the energy, force and virial by using this DP. 
Parameters @@ -158,7 +154,7 @@ def get_ntypes(self) -> int: """Get the number of atom types of this model.""" @abstractmethod - def get_type_map(self) -> List[str]: + def get_type_map(self) -> list[str]: """Get the type map (element name of the atom types) of this model.""" @abstractmethod @@ -256,11 +252,11 @@ def _check_mixed_types(self, atom_types: np.ndarray) -> bool: @property @abstractmethod - def model_type(self) -> Type["DeepEval"]: + def model_type(self) -> type["DeepEval"]: """The the evaluator of the model type.""" @abstractmethod - def get_sel_type(self) -> List[int]: + def get_sel_type(self) -> list[int]: """Get the selected atom types of this model. Only atoms with selected atom types have atomic contribution @@ -355,7 +351,7 @@ def get_ntypes(self) -> int: """Get the number of atom types of this model.""" return self.deep_eval.get_ntypes() - def get_type_map(self) -> List[str]: + def get_type_map(self) -> list[str]: """Get the type map (element name of the atom types) of this model.""" return self.deep_eval.get_type_map() @@ -372,7 +368,7 @@ def _get_natoms_and_nframes( coords: np.ndarray, atom_types: np.ndarray, mixed_type: bool = False, - ) -> Tuple[int, int]: + ) -> tuple[int, int]: if mixed_type or atom_types.ndim > 1: natoms = len(atom_types[0]) else: @@ -525,7 +521,7 @@ def _standard_input(self, coords, cells, atom_types, fparam, aparam, mixed_type) ) return coords, cells, atom_types, fparam, aparam, nframes, natoms - def get_sel_type(self) -> List[int]: + def get_sel_type(self) -> list[int]: """Get the selected atom types of this model. Only atoms with selected atom types have atomic contribution diff --git a/deepmd/infer/deep_polar.py b/deepmd/infer/deep_polar.py index 22561a0685..7220e53637 100644 --- a/deepmd/infer/deep_polar.py +++ b/deepmd/infer/deep_polar.py @@ -1,6 +1,5 @@ # SPDX-License-Identifier: LGPL-3.0-or-later from typing import ( - List, Optional, Union, ) @@ -51,7 +50,7 @@ def eval( self, coords: np.ndarray, cells: Optional[np.ndarray], - atom_types: Union[List[int], np.ndarray], + atom_types: Union[list[int], np.ndarray], atomic: bool = False, fparam: Optional[np.ndarray] = None, aparam: Optional[np.ndarray] = None, diff --git a/deepmd/infer/deep_pot.py b/deepmd/infer/deep_pot.py index 0632fd1c84..4755bc276a 100644 --- a/deepmd/infer/deep_pot.py +++ b/deepmd/infer/deep_pot.py @@ -1,10 +1,8 @@ # SPDX-License-Identifier: LGPL-3.0-or-later from typing import ( Any, - List, Literal, Optional, - Tuple, Union, overload, ) @@ -95,13 +93,13 @@ def eval( self, coords: np.ndarray, cells: Optional[np.ndarray], - atom_types: Union[List[int], np.ndarray], + atom_types: Union[list[int], np.ndarray], atomic: Literal[True], fparam: Optional[np.ndarray], aparam: Optional[np.ndarray], mixed_type: bool, **kwargs: Any, - ) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray]: + ) -> tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray]: pass @overload @@ -109,13 +107,13 @@ def eval( self, coords: np.ndarray, cells: Optional[np.ndarray], - atom_types: Union[List[int], np.ndarray], + atom_types: Union[list[int], np.ndarray], atomic: Literal[False], fparam: Optional[np.ndarray], aparam: Optional[np.ndarray], mixed_type: bool, **kwargs: Any, - ) -> Tuple[np.ndarray, np.ndarray, np.ndarray]: + ) -> tuple[np.ndarray, np.ndarray, np.ndarray]: pass @overload @@ -123,26 +121,26 @@ def eval( self, coords: np.ndarray, cells: Optional[np.ndarray], - atom_types: Union[List[int], np.ndarray], + atom_types: Union[list[int], np.ndarray], atomic: bool, fparam: 
Optional[np.ndarray], aparam: Optional[np.ndarray], mixed_type: bool, **kwargs: Any, - ) -> Tuple[np.ndarray, ...]: + ) -> tuple[np.ndarray, ...]: pass def eval( self, coords: np.ndarray, cells: Optional[np.ndarray], - atom_types: Union[List[int], np.ndarray], + atom_types: Union[list[int], np.ndarray], atomic: bool = False, fparam: Optional[np.ndarray] = None, aparam: Optional[np.ndarray] = None, mixed_type: bool = False, **kwargs: Any, - ) -> Tuple[np.ndarray, ...]: + ) -> tuple[np.ndarray, ...]: """Evaluate energy, force, and virial. If atomic is True, also return atomic energy and atomic virial. @@ -153,7 +151,7 @@ def eval( cells : np.ndarray The cell vectors of the system, in shape (nframes, 9). If the system is not periodic, set it to None. - atom_types : List[int] or np.ndarray + atom_types : list[int] or np.ndarray The types of the atoms. If mixed_type is False, the shape is (natoms,); otherwise, the shape is (nframes, natoms). atomic : bool, optional @@ -164,7 +162,7 @@ def eval( The atomic parameters, by default None. mixed_type : bool, optional Whether the atom_types is mixed type, by default False. - **kwargs : Dict[str, Any] + **kwargs : dict[str, Any] Keyword arguments. Returns diff --git a/deepmd/infer/deep_property.py b/deepmd/infer/deep_property.py index 5376fb1efc..4a3283cf32 100644 --- a/deepmd/infer/deep_property.py +++ b/deepmd/infer/deep_property.py @@ -1,10 +1,7 @@ # SPDX-License-Identifier: LGPL-3.0-or-later from typing import ( Any, - Dict, - List, Optional, - Tuple, Union, ) @@ -69,13 +66,13 @@ def eval( self, coords: np.ndarray, cells: Optional[np.ndarray], - atom_types: Union[List[int], np.ndarray], + atom_types: Union[list[int], np.ndarray], atomic: bool = False, fparam: Optional[np.ndarray] = None, aparam: Optional[np.ndarray] = None, mixed_type: bool = False, - **kwargs: Dict[str, Any], - ) -> Tuple[np.ndarray, ...]: + **kwargs: dict[str, Any], + ) -> tuple[np.ndarray, ...]: """Evaluate properties. If atomic is True, also return atomic property. Parameters @@ -85,7 +82,7 @@ def eval( cells : np.ndarray The cell vectors of the system, in shape (nframes, 9). If the system is not periodic, set it to None. - atom_types : List[int] or np.ndarray + atom_types : list[int] or np.ndarray The types of the atoms. If mixed_type is False, the shape is (natoms,); otherwise, the shape is (nframes, natoms). atomic : bool, optional @@ -96,7 +93,7 @@ def eval( The atomic parameters, by default None. mixed_type : bool, optional Whether the atom_types is mixed type, by default False. - **kwargs : Dict[str, Any] + **kwargs : dict[str, Any] Keyword arguments. Returns diff --git a/deepmd/infer/deep_tensor.py b/deepmd/infer/deep_tensor.py index 48918e7c75..bb5bc12697 100644 --- a/deepmd/infer/deep_tensor.py +++ b/deepmd/infer/deep_tensor.py @@ -3,9 +3,7 @@ abstractmethod, ) from typing import ( - List, Optional, - Tuple, Union, ) @@ -44,7 +42,7 @@ def eval( self, coords: np.ndarray, cells: Optional[np.ndarray], - atom_types: Union[List[int], np.ndarray], + atom_types: Union[list[int], np.ndarray], atomic: bool = True, fparam: Optional[np.ndarray] = None, aparam: Optional[np.ndarray] = None, @@ -119,7 +117,7 @@ def eval_full( aparam: Optional[np.ndarray] = None, mixed_type: bool = False, **kwargs: dict, - ) -> Tuple[np.ndarray, ...]: + ) -> tuple[np.ndarray, ...]: """Evaluate the model with interface similar to the energy model. Will return global tensor, component-wise force and virial and optionally atomic tensor and atomic virial. 
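The `model_devi.py` hunk that follows keeps a set of `@overload` declarations in which `Literal[True]`/`Literal[False]` values of `atomic` select different tuple arities; only the return annotations change to `tuple[...]`. A minimal sketch of that typing pattern (the function and its reduction are hypothetical, not the library's definition):

```python
from typing import Literal, overload

import numpy as np

@overload
def devi_stats(
    devi: np.ndarray, atomic: Literal[False] = ...
) -> tuple[np.ndarray, np.ndarray, np.ndarray]: ...
@overload
def devi_stats(
    devi: np.ndarray, atomic: Literal[True] = ...
) -> tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]: ...
@overload
def devi_stats(devi: np.ndarray, atomic: bool = ...) -> tuple[np.ndarray, ...]: ...

def devi_stats(devi: np.ndarray, atomic: bool = False) -> tuple[np.ndarray, ...]:
    """Reduce per-atom deviations (nframes, natoms) to per-frame
    max/min/mean; with atomic=True also return the raw per-atom array."""
    stats = (devi.max(axis=-1), devi.min(axis=-1), devi.mean(axis=-1))
    return (*stats, devi) if atomic else stats
```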
@@ -250,7 +248,7 @@ def eval_full( aparam: Optional[np.ndarray] = None, mixed_type: bool = False, **kwargs: dict, - ) -> Tuple[np.ndarray, ...]: + ) -> tuple[np.ndarray, ...]: """Unsupported method.""" raise RuntimeError( "This model does not support eval_full method. Use eval instead." diff --git a/deepmd/infer/model_devi.py b/deepmd/infer/model_devi.py index 83708c7114..29e1eec741 100644 --- a/deepmd/infer/model_devi.py +++ b/deepmd/infer/model_devi.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: LGPL-3.0-or-later from typing import ( Optional, - Tuple, overload, ) @@ -29,7 +28,7 @@ def calc_model_devi_f( real_f: Optional[np.ndarray] = None, relative: Optional[float] = None, atomic: Literal[False] = ..., -) -> Tuple[np.ndarray, np.ndarray, np.ndarray]: ... +) -> tuple[np.ndarray, np.ndarray, np.ndarray]: ... @overload @@ -38,7 +37,7 @@ def calc_model_devi_f( real_f: Optional[np.ndarray] = None, relative: Optional[float] = None, atomic: Literal[True] = ..., -) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]: ... +) -> tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]: ... @overload @@ -47,7 +46,7 @@ def calc_model_devi_f( real_f: Optional[np.ndarray] = None, relative: Optional[float] = None, atomic: bool = False, -) -> Tuple[np.ndarray, ...]: ... +) -> tuple[np.ndarray, ...]: ... def calc_model_devi_f( @@ -55,7 +54,7 @@ def calc_model_devi_f( real_f: Optional[np.ndarray] = None, relative: Optional[float] = None, atomic: bool = False, -) -> Tuple[np.ndarray, ...]: +) -> tuple[np.ndarray, ...]: """Calculate model deviation of force. Parameters @@ -141,7 +140,7 @@ def calc_model_devi_v( vs: np.ndarray, real_v: Optional[np.ndarray] = None, relative: Optional[float] = None, -) -> Tuple[np.ndarray, np.ndarray, np.ndarray]: +) -> tuple[np.ndarray, np.ndarray, np.ndarray]: """Calculate model deviation of virial. Parameters diff --git a/deepmd/jax/common.py b/deepmd/jax/common.py index 550b168b29..9c144a41d1 100644 --- a/deepmd/jax/common.py +++ b/deepmd/jax/common.py @@ -1,13 +1,18 @@ # SPDX-License-Identifier: LGPL-3.0-or-later from typing import ( - Union, + Any, + Optional, overload, ) import numpy as np +from deepmd.dpmodel.common import ( + NativeOP, +) from deepmd.jax.env import ( jnp, + nnx, ) @@ -19,7 +24,7 @@ def to_jax_array(array: np.ndarray) -> jnp.ndarray: ... def to_jax_array(array: None) -> None: ... -def to_jax_array(array: Union[np.ndarray]) -> Union[jnp.ndarray]: +def to_jax_array(array: Optional[np.ndarray]) -> Optional[jnp.ndarray]: """Convert a numpy array to a JAX array. Parameters @@ -35,3 +40,44 @@ def to_jax_array(array: Union[np.ndarray]) -> Union[jnp.ndarray]: if array is None: return None return jnp.array(array) + + +def flax_module( + module: NativeOP, +) -> nnx.Module: + """Convert a NativeOP to a Flax module. + + Parameters + ---------- + module : NativeOP + The NativeOP to convert. + + Returns + ------- + flax.nnx.Module + The Flax module. + + Examples + -------- + >>> @flax_module + ... class MyModule(NativeOP): + ... 
pass + """ + metas = set() + if not issubclass(type(nnx.Module), type(module)): + metas.add(type(module)) + if not issubclass(type(module), type(nnx.Module)): + metas.add(type(nnx.Module)) + + class MixedMetaClass(*metas): + def __call__(self, *args, **kwargs): + return type(nnx.Module).__call__(self, *args, **kwargs) + + class FlaxModule(module, nnx.Module, metaclass=MixedMetaClass): + def __init_subclass__(cls, **kwargs) -> None: + return super().__init_subclass__(**kwargs) + + def __setattr__(self, name: str, value: Any) -> None: + return super().__setattr__(name, value) + + return FlaxModule diff --git a/deepmd/jax/descriptor/__init__.py b/deepmd/jax/descriptor/__init__.py new file mode 100644 index 0000000000..6ceb116d85 --- /dev/null +++ b/deepmd/jax/descriptor/__init__.py @@ -0,0 +1 @@ +# SPDX-License-Identifier: LGPL-3.0-or-later diff --git a/deepmd/jax/descriptor/dpa1.py b/deepmd/jax/descriptor/dpa1.py new file mode 100644 index 0000000000..a9b0404970 --- /dev/null +++ b/deepmd/jax/descriptor/dpa1.py @@ -0,0 +1,86 @@ +# SPDX-License-Identifier: LGPL-3.0-or-later +from typing import ( + Any, +) + +from deepmd.dpmodel.descriptor.dpa1 import DescrptBlockSeAtten as DescrptBlockSeAttenDP +from deepmd.dpmodel.descriptor.dpa1 import DescrptDPA1 as DescrptDPA1DP +from deepmd.dpmodel.descriptor.dpa1 import GatedAttentionLayer as GatedAttentionLayerDP +from deepmd.dpmodel.descriptor.dpa1 import ( + NeighborGatedAttention as NeighborGatedAttentionDP, +) +from deepmd.dpmodel.descriptor.dpa1 import ( + NeighborGatedAttentionLayer as NeighborGatedAttentionLayerDP, +) +from deepmd.jax.common import ( + flax_module, + to_jax_array, +) +from deepmd.jax.utils.exclude_mask import ( + PairExcludeMask, +) +from deepmd.jax.utils.network import ( + LayerNorm, + NativeLayer, + NetworkCollection, +) +from deepmd.jax.utils.type_embed import ( + TypeEmbedNet, +) + + +@flax_module +class GatedAttentionLayer(GatedAttentionLayerDP): + def __setattr__(self, name: str, value: Any) -> None: + if name in {"in_proj", "out_proj"}: + value = NativeLayer.deserialize(value.serialize()) + return super().__setattr__(name, value) + + +@flax_module +class NeighborGatedAttentionLayer(NeighborGatedAttentionLayerDP): + def __setattr__(self, name: str, value: Any) -> None: + if name == "attention_layer": + value = GatedAttentionLayer.deserialize(value.serialize()) + elif name == "attn_layer_norm": + value = LayerNorm.deserialize(value.serialize()) + return super().__setattr__(name, value) + + +@flax_module +class NeighborGatedAttention(NeighborGatedAttentionDP): + def __setattr__(self, name: str, value: Any) -> None: + if name == "attention_layers": + value = [ + NeighborGatedAttentionLayer.deserialize(ii.serialize()) for ii in value + ] + return super().__setattr__(name, value) + + +@flax_module +class DescrptBlockSeAtten(DescrptBlockSeAttenDP): + def __setattr__(self, name: str, value: Any) -> None: + if name in {"mean", "stddev"}: + value = to_jax_array(value) + elif name in {"embeddings", "embeddings_strip"}: + if value is not None: + value = NetworkCollection.deserialize(value.serialize()) + elif name == "dpa1_attention": + value = NeighborGatedAttention.deserialize(value.serialize()) + elif name == "env_mat": + # env_mat doesn't store any value + pass + elif name == "emask": + value = PairExcludeMask(value.ntypes, value.exclude_types) + + return super().__setattr__(name, value) + + +@flax_module +class DescrptDPA1(DescrptDPA1DP): + def __setattr__(self, name: str, value: Any) -> None: + if name == "se_atten": + value = 
DescrptBlockSeAtten.deserialize(value.serialize()) + elif name == "type_embedding": + value = TypeEmbedNet.deserialize(value.serialize()) + return super().__setattr__(name, value) diff --git a/deepmd/jax/env.py b/deepmd/jax/env.py index 34e4aa6240..5a5a7f6bf0 100644 --- a/deepmd/jax/env.py +++ b/deepmd/jax/env.py @@ -5,10 +5,14 @@ import jax import jax.numpy as jnp +from flax import ( + nnx, +) jax.config.update("jax_enable_x64", True) __all__ = [ "jax", "jnp", + "nnx", ] diff --git a/deepmd/jax/fitting/__init__.py b/deepmd/jax/fitting/__init__.py new file mode 100644 index 0000000000..6ceb116d85 --- /dev/null +++ b/deepmd/jax/fitting/__init__.py @@ -0,0 +1 @@ +# SPDX-License-Identifier: LGPL-3.0-or-later diff --git a/deepmd/jax/fitting/fitting.py b/deepmd/jax/fitting/fitting.py new file mode 100644 index 0000000000..27ad791db9 --- /dev/null +++ b/deepmd/jax/fitting/fitting.py @@ -0,0 +1,39 @@ +# SPDX-License-Identifier: LGPL-3.0-or-later +from typing import ( + Any, +) + +from deepmd.dpmodel.fitting.ener_fitting import EnergyFittingNet as EnergyFittingNetDP +from deepmd.jax.common import ( + flax_module, + to_jax_array, +) +from deepmd.jax.utils.exclude_mask import ( + AtomExcludeMask, +) +from deepmd.jax.utils.network import ( + NetworkCollection, +) + + +def setattr_for_general_fitting(name: str, value: Any) -> Any: + if name in { + "bias_atom_e", + "fparam_avg", + "fparam_inv_std", + "aparam_avg", + "aparam_inv_std", + }: + value = to_jax_array(value) + elif name == "emask": + value = AtomExcludeMask(value.ntypes, value.exclude_types) + elif name == "nets": + value = NetworkCollection.deserialize(value.serialize()) + return value + + +@flax_module +class EnergyFittingNet(EnergyFittingNetDP): + def __setattr__(self, name: str, value: Any) -> None: + value = setattr_for_general_fitting(name, value) + return super().__setattr__(name, value) diff --git a/deepmd/jax/utils/exclude_mask.py b/deepmd/jax/utils/exclude_mask.py new file mode 100644 index 0000000000..a6cf210f94 --- /dev/null +++ b/deepmd/jax/utils/exclude_mask.py @@ -0,0 +1,27 @@ +# SPDX-License-Identifier: LGPL-3.0-or-later +from typing import ( + Any, +) + +from deepmd.dpmodel.utils.exclude_mask import AtomExcludeMask as AtomExcludeMaskDP +from deepmd.dpmodel.utils.exclude_mask import PairExcludeMask as PairExcludeMaskDP +from deepmd.jax.common import ( + flax_module, + to_jax_array, +) + + +@flax_module +class AtomExcludeMask(AtomExcludeMaskDP): + def __setattr__(self, name: str, value: Any) -> None: + if name in {"type_mask"}: + value = to_jax_array(value) + return super().__setattr__(name, value) + + +@flax_module +class PairExcludeMask(PairExcludeMaskDP): + def __setattr__(self, name: str, value: Any) -> None: + if name in {"type_mask"}: + value = to_jax_array(value) + return super().__setattr__(name, value) diff --git a/deepmd/jax/utils/network.py b/deepmd/jax/utils/network.py index 629b51b8cd..2c406095cd 100644 --- a/deepmd/jax/utils/network.py +++ b/deepmd/jax/utils/network.py @@ -1,29 +1,74 @@ # SPDX-License-Identifier: LGPL-3.0-or-later from typing import ( Any, + ClassVar, ) from deepmd.dpmodel.common import ( NativeOP, ) +from deepmd.dpmodel.utils.network import LayerNorm as LayerNormDP from deepmd.dpmodel.utils.network import NativeLayer as NativeLayerDP +from deepmd.dpmodel.utils.network import NetworkCollection as NetworkCollectionDP from deepmd.dpmodel.utils.network import ( make_embedding_network, make_fitting_network, make_multilayer_network, ) from deepmd.jax.common import ( + flax_module, to_jax_array, ) +from 
deepmd.jax.env import (
+    nnx,
+)
+
+
+class ArrayAPIParam(nnx.Param):
+    def __array__(self, *args, **kwargs):
+        return self.value.__array__(*args, **kwargs)
+
+    def __array_namespace__(self, *args, **kwargs):
+        return self.value.__array_namespace__(*args, **kwargs)
+
+    def __dlpack__(self, *args, **kwargs):
+        return self.value.__dlpack__(*args, **kwargs)
+
+    def __dlpack_device__(self, *args, **kwargs):
+        return self.value.__dlpack_device__(*args, **kwargs)
+
+
+@flax_module
 class NativeLayer(NativeLayerDP):
     def __setattr__(self, name: str, value: Any) -> None:
         if name in {"w", "b", "idt"}:
             value = to_jax_array(value)
+            if value is not None:
+                value = ArrayAPIParam(value)
         return super().__setattr__(name, value)
 
 
-NativeNet = make_multilayer_network(NativeLayer, NativeOP)
-EmbeddingNet = make_embedding_network(NativeNet, NativeLayer)
-FittingNet = make_fitting_network(EmbeddingNet, NativeNet, NativeLayer)
+@flax_module
+class NativeNet(make_multilayer_network(NativeLayer, NativeOP)):
+    pass
+
+
+class EmbeddingNet(make_embedding_network(NativeNet, NativeLayer)):
+    pass
+
+
+class FittingNet(make_fitting_network(EmbeddingNet, NativeNet, NativeLayer)):
+    pass
+
+
+@flax_module
+class NetworkCollection(NetworkCollectionDP):
+    NETWORK_TYPE_MAP: ClassVar[dict[str, type]] = {
+        "network": NativeNet,
+        "embedding_network": EmbeddingNet,
+        "fitting_network": FittingNet,
+    }
+
+
+class LayerNorm(LayerNormDP, NativeLayer):
+    pass
diff --git a/deepmd/jax/utils/type_embed.py b/deepmd/jax/utils/type_embed.py
index bc7c469524..3143460244 100644
--- a/deepmd/jax/utils/type_embed.py
+++ b/deepmd/jax/utils/type_embed.py
@@ -5,6 +5,7 @@
 from deepmd.dpmodel.utils.type_embed import TypeEmbedNet as TypeEmbedNetDP
 from deepmd.jax.common import (
+    flax_module,
     to_jax_array,
 )
 from deepmd.jax.utils.network import (
@@ -12,6 +13,7 @@
 )
 
 
+@flax_module
 class TypeEmbedNet(TypeEmbedNetDP):
     def __setattr__(self, name: str, value: Any) -> None:
         if name in {"econf_tebd"}:
diff --git a/deepmd/loggers/training.py b/deepmd/loggers/training.py
index 954473e309..b2fff4788b 100644
--- a/deepmd/loggers/training.py
+++ b/deepmd/loggers/training.py
@@ -1,6 +1,5 @@
 # SPDX-License-Identifier: LGPL-3.0-or-later
 from typing import (
-    Dict,
     Optional,
 )
@@ -16,7 +15,7 @@ def format_training_message(
 def format_training_message_per_task(
     batch: int,
     task_name: str,
-    rmse: Dict[str, float],
+    rmse: dict[str, float],
     learning_rate: Optional[float],
 ):
     if task_name:
diff --git a/deepmd/main.py b/deepmd/main.py
index c271152a06..60b8da2850 100644
--- a/deepmd/main.py
+++ b/deepmd/main.py
@@ -14,10 +14,7 @@
     defaultdict,
 )
 from typing import (
-    Dict,
-    List,
     Optional,
-    Type,
 )
 
 from deepmd.backend.backend import (
@@ -57,10 +54,10 @@ class RawTextArgumentDefaultsHelpFormatter(
     """This formatter is used to print multi-line help message with default value."""
 
 
-BACKENDS: Dict[str, Type[Backend]] = Backend.get_backends_by_feature(
+BACKENDS: dict[str, type[Backend]] = Backend.get_backends_by_feature(
     Backend.Feature.ENTRY_POINT
 )
-BACKEND_TABLE: Dict[str, str] = {kk: vv.name.lower() for kk, vv in BACKENDS.items()}
+BACKEND_TABLE: dict[str, str] = {kk: vv.name.lower() for kk, vv in BACKENDS.items()}
 
 
 class BackendOption(argparse.Action):
@@ -130,7 +127,7 @@ def main_parser() -> argparse.ArgumentParser:
             ),
         )
 
-    BACKEND_ALIAS: Dict[str, List[str]] = defaultdict(list)
+    BACKEND_ALIAS: dict[str, list[str]] = defaultdict(list)
     for alias, backend in BACKEND_TABLE.items():
         BACKEND_ALIAS[backend].append(alias)
     for backend, alias in BACKEND_ALIAS.items():
@@ -856,12 +853,12 @@ def main_parser() -> argparse.ArgumentParser:
     return parser
 
 
-def parse_args(args: Optional[List[str]] = None) -> argparse.Namespace:
+def parse_args(args: Optional[list[str]] = None) -> argparse.Namespace:
     """Parse arguments and convert argument strings to objects.
 
     Parameters
     ----------
-    args : List[str]
+    args : list[str]
         list of command line arguments, main purpose is testing default option None
         takes arguments from sys.argv
@@ -880,12 +877,12 @@ def parse_args(args: Optional[List[str]] = None) -> argparse.Namespace:
     return parsed_args
 
 
-def main(args: Optional[List[str]] = None):
+def main(args: Optional[list[str]] = None):
     """DeePMD-kit new entry point.
 
     Parameters
     ----------
-    args : List[str]
+    args : list[str]
         list of command line arguments, main purpose is testing default option None
         takes arguments from sys.argv
diff --git a/deepmd/pt/entrypoints/main.py b/deepmd/pt/entrypoints/main.py
index 3df05cbb47..a0694c41c5 100644
--- a/deepmd/pt/entrypoints/main.py
+++ b/deepmd/pt/entrypoints/main.py
@@ -8,7 +8,6 @@
     Path,
 )
 from typing import (
-    List,
     Optional,
     Union,
 )
@@ -485,7 +484,7 @@ def change_bias(FLAGS):
 
 
 @record
-def main(args: Optional[Union[List[str], argparse.Namespace]] = None):
+def main(args: Optional[Union[list[str], argparse.Namespace]] = None):
     if not isinstance(args, argparse.Namespace):
         FLAGS = parse_args(args=args)
     else:
diff --git a/deepmd/pt/infer/deep_eval.py b/deepmd/pt/infer/deep_eval.py
index d5eae71731..0a77a38135 100644
--- a/deepmd/pt/infer/deep_eval.py
+++ b/deepmd/pt/infer/deep_eval.py
@@ -4,11 +4,7 @@
     TYPE_CHECKING,
     Any,
     Callable,
-    Dict,
-    List,
     Optional,
-    Tuple,
-    Type,
     Union,
 )
@@ -170,7 +166,7 @@ def get_ntypes(self) -> int:
         """Get the number of atom types of this model."""
         return len(self.type_map)
 
-    def get_type_map(self) -> List[str]:
+    def get_type_map(self) -> list[str]:
         """Get the type map (element name of the atom types) of this model."""
         return self.type_map
 
@@ -186,7 +182,7 @@ def get_intensive(self) -> bool:
         return self.dp.model["Default"].get_intensive()
 
     @property
-    def model_type(self) -> Type["DeepEvalWrapper"]:
+    def model_type(self) -> type["DeepEvalWrapper"]:
         """The evaluator of the model type."""
         model_output_type = self.dp.model["Default"].model_output_type()
         if "energy" in model_output_type:
@@ -206,7 +202,7 @@ def model_type(self) -> type["DeepEvalWrapper"]:
         else:
             raise RuntimeError("Unknown model type")
 
-    def get_sel_type(self) -> List[int]:
+    def get_sel_type(self) -> list[int]:
         """Get the selected atom types of this model.
 
         Only atoms with selected atom types have atomic contribution
@@ -244,7 +240,7 @@ def eval(
         fparam: Optional[np.ndarray] = None,
         aparam: Optional[np.ndarray] = None,
         **kwargs: Any,
-    ) -> Dict[str, np.ndarray]:
+    ) -> dict[str, np.ndarray]:
         """Evaluate the energy, force and virial by using this DP.
 
         Parameters
@@ -311,7 +307,7 @@ def eval(
             )
         )
 
-    def _get_request_defs(self, atomic: bool) -> List[OutputVariableDef]:
+    def _get_request_defs(self, atomic: bool) -> list[OutputVariableDef]:
         """Get the requested output definitions.
 
         When atomic is True, all output_def are requested.
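The hunks that follow add descriptor evaluation to the PyTorch DeepEval. The mechanism is a small capture hook on the atomic model: a flag enables caching, each forward pass appends its descriptor tensor to a list, and the caller concatenates the list and clears the cache. A minimal, self-contained sketch of that pattern (ToyAtomicModel and its fake descriptor computation are illustrative stand-ins, not part of this patch):

    import torch


    class ToyAtomicModel:
        """Toy stand-in mirroring the hook added to DPAtomicModel in this patch."""

        def __init__(self) -> None:
            self.enable_eval_descriptor_hook = False
            self.eval_descriptor_list: list[torch.Tensor] = []

        def set_eval_descriptor_hook(self, enable: bool) -> None:
            # Enabling (or disabling) the hook also clears the cache.
            self.enable_eval_descriptor_hook = enable
            self.eval_descriptor_list = []

        def eval_descriptor(self) -> torch.Tensor:
            # Concatenate everything captured since the hook was enabled.
            return torch.concat(self.eval_descriptor_list)

        def forward(self, coord: torch.Tensor) -> torch.Tensor:
            descriptor = coord * 2.0  # stand-in for the real descriptor network
            if self.enable_eval_descriptor_hook:
                self.eval_descriptor_list.append(descriptor)
            return descriptor.sum()


    model = ToyAtomicModel()
    model.set_eval_descriptor_hook(True)
    for frame in (torch.ones(3, 3), torch.zeros(2, 3)):
        model.forward(frame)
    captured = model.eval_descriptor()  # shape (5, 3)
    model.set_eval_descriptor_hook(False)  # turn the hook off and clear the cache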
@@ -376,7 +372,7 @@ def _get_natoms_and_nframes(
         coords: np.ndarray,
         atom_types: np.ndarray,
         mixed_type: bool = False,
-    ) -> Tuple[int, int]:
+    ) -> tuple[int, int]:
         if mixed_type:
             natoms = len(atom_types[0])
         else:
@@ -395,7 +391,7 @@ def _eval_model(
         atom_types: np.ndarray,
         fparam: Optional[np.ndarray],
         aparam: Optional[np.ndarray],
-        request_defs: List[OutputVariableDef],
+        request_defs: list[OutputVariableDef],
     ):
         model = self.dp.to(DEVICE)
@@ -476,7 +472,7 @@ def _eval_model_spin(
         spins: np.ndarray,
         fparam: Optional[np.ndarray],
         aparam: Optional[np.ndarray],
-        request_defs: List[OutputVariableDef],
+        request_defs: list[OutputVariableDef],
     ):
         model = self.dp.to(DEVICE)
@@ -602,3 +598,58 @@ def eval_typeebd(self) -> np.ndarray:
     def get_model_def_script(self) -> str:
         """Get model definition script."""
         return self.model_def_script
+
+    def eval_descriptor(
+        self,
+        coords: np.ndarray,
+        cells: Optional[np.ndarray],
+        atom_types: np.ndarray,
+        fparam: Optional[np.ndarray] = None,
+        aparam: Optional[np.ndarray] = None,
+        **kwargs: Any,
+    ) -> np.ndarray:
+        """Evaluate descriptors by using this DP.
+
+        Parameters
+        ----------
+        coords
+            The coordinates of atoms.
+            The array should be of size nframes x natoms x 3
+        cells
+            The cell of the region.
+            If None then non-PBC is assumed, otherwise using PBC.
+            The array should be of size nframes x 9
+        atom_types
+            The atom types
+            The list should contain natoms ints
+        fparam
+            The frame parameter.
+            The array can be of size :
+            - nframes x dim_fparam.
+            - dim_fparam. Then all frames are assumed to be provided with the same fparam.
+        aparam
+            The atomic parameter
+            The array can be of size :
+            - nframes x natoms x dim_aparam.
+            - natoms x dim_aparam. Then all frames are assumed to be provided with the same aparam.
+            - dim_aparam. Then all frames and atoms are provided with the same aparam.
+
+        Returns
+        -------
+        descriptor
+            Descriptors.
+ """ + model = self.dp.model["Default"] + model.set_eval_descriptor_hook(True) + self.eval( + coords, + cells, + atom_types, + atomic=False, + fparam=fparam, + aparam=aparam, + **kwargs, + ) + descriptor = model.eval_descriptor() + model.set_eval_descriptor_hook(False) + return to_numpy_array(descriptor) diff --git a/deepmd/pt/loss/dos.py b/deepmd/pt/loss/dos.py index 7fd2e04ff2..84513b6bf9 100644 --- a/deepmd/pt/loss/dos.py +++ b/deepmd/pt/loss/dos.py @@ -1,7 +1,4 @@ # SPDX-License-Identifier: LGPL-3.0-or-later -from typing import ( - List, -) import torch @@ -230,7 +227,7 @@ def forward(self, input_dict, model, label, natoms, learning_rate=0.0, mae=False return model_pred, loss, more_loss @property - def label_requirement(self) -> List[DataRequirementItem]: + def label_requirement(self) -> list[DataRequirementItem]: """Return data label requirements needed for this loss calculation.""" label_requirement = [] if self.has_ados or self.has_acdf: diff --git a/deepmd/pt/loss/ener.py b/deepmd/pt/loss/ener.py index 092fbc1f76..f40110a749 100644 --- a/deepmd/pt/loss/ener.py +++ b/deepmd/pt/loss/ener.py @@ -1,6 +1,5 @@ # SPDX-License-Identifier: LGPL-3.0-or-later from typing import ( - List, Optional, ) @@ -336,7 +335,7 @@ def forward(self, input_dict, model, label, natoms, learning_rate, mae=False): return model_pred, loss, more_loss @property - def label_requirement(self) -> List[DataRequirementItem]: + def label_requirement(self) -> list[DataRequirementItem]: """Return data label requirements needed for this loss calculation.""" label_requirement = [] if self.has_e: diff --git a/deepmd/pt/loss/ener_spin.py b/deepmd/pt/loss/ener_spin.py index 78210a778b..09a053451f 100644 --- a/deepmd/pt/loss/ener_spin.py +++ b/deepmd/pt/loss/ener_spin.py @@ -1,7 +1,4 @@ # SPDX-License-Identifier: LGPL-3.0-or-later -from typing import ( - List, -) import torch import torch.nn.functional as F @@ -276,7 +273,7 @@ def forward(self, input_dict, model, label, natoms, learning_rate, mae=False): return model_pred, loss, more_loss @property - def label_requirement(self) -> List[DataRequirementItem]: + def label_requirement(self) -> list[DataRequirementItem]: """Return data label requirements needed for this loss calculation.""" label_requirement = [] if self.has_e: diff --git a/deepmd/pt/loss/loss.py b/deepmd/pt/loss/loss.py index 7e26f6571a..1a091e074e 100644 --- a/deepmd/pt/loss/loss.py +++ b/deepmd/pt/loss/loss.py @@ -3,9 +3,6 @@ ABC, abstractmethod, ) -from typing import ( - List, -) import torch @@ -25,7 +22,7 @@ def forward(self, input_dict, model, label, natoms, learning_rate): @property @abstractmethod - def label_requirement(self) -> List[DataRequirementItem]: + def label_requirement(self) -> list[DataRequirementItem]: """Return data label requirements needed for this loss calculation.""" pass diff --git a/deepmd/pt/loss/property.py b/deepmd/pt/loss/property.py index e4f86091bc..ba120e3d6c 100644 --- a/deepmd/pt/loss/property.py +++ b/deepmd/pt/loss/property.py @@ -1,8 +1,5 @@ # SPDX-License-Identifier: LGPL-3.0-or-later import logging -from typing import ( - List, -) import torch import torch.nn.functional as F @@ -138,7 +135,7 @@ def forward(self, input_dict, model, label, natoms, learning_rate=0.0, mae=False return model_pred, loss, more_loss @property - def label_requirement(self) -> List[DataRequirementItem]: + def label_requirement(self) -> list[DataRequirementItem]: """Return data label requirements needed for this loss calculation.""" label_requirement = [] label_requirement.append( diff --git 
a/deepmd/pt/loss/tensor.py b/deepmd/pt/loss/tensor.py index 3dcf21af1d..32d25cc9f1 100644 --- a/deepmd/pt/loss/tensor.py +++ b/deepmd/pt/loss/tensor.py @@ -1,7 +1,4 @@ # SPDX-License-Identifier: LGPL-3.0-or-later -from typing import ( - List, -) import torch @@ -151,7 +148,7 @@ def forward(self, input_dict, model, label, natoms, learning_rate=0.0, mae=False return model_pred, loss, more_loss @property - def label_requirement(self) -> List[DataRequirementItem]: + def label_requirement(self) -> list[DataRequirementItem]: """Return data label requirements needed for this loss calculation.""" label_requirement = [] if self.has_local_weight: diff --git a/deepmd/pt/model/atomic_model/base_atomic_model.py b/deepmd/pt/model/atomic_model/base_atomic_model.py index 4742fe66a3..bd3c2b49ab 100644 --- a/deepmd/pt/model/atomic_model/base_atomic_model.py +++ b/deepmd/pt/model/atomic_model/base_atomic_model.py @@ -4,10 +4,7 @@ import logging from typing import ( Callable, - Dict, - List, Optional, - Tuple, Union, ) @@ -67,7 +64,7 @@ class BaseAtomicModel(torch.nn.Module, BaseAtomicModel_): of the atomic model. Implemented by removing the pairs from the nlist. rcond : float, optional The condition number for the regression of atomic energy. - preset_out_bias : Dict[str, List[Optional[np.ndarray]]], optional + preset_out_bias : dict[str, list[Optional[np.ndarray]]], optional Specifying atomic energy contribution in vacuum. Given by key:value pairs. The value is a list specifying the bias. the elements can be None or np.ndarray of output shape. For example: [None, [2.]] means type 0 is not set, type 1 is set to [2.] @@ -77,11 +74,11 @@ class BaseAtomicModel(torch.nn.Module, BaseAtomicModel_): def __init__( self, - type_map: List[str], - atom_exclude_types: List[int] = [], - pair_exclude_types: List[Tuple[int, int]] = [], + type_map: list[str], + atom_exclude_types: list[int] = [], + pair_exclude_types: list[tuple[int, int]] = [], rcond: Optional[float] = None, - preset_out_bias: Optional[Dict[str, np.ndarray]] = None, + preset_out_bias: Optional[dict[str, np.ndarray]] = None, ): torch.nn.Module.__init__(self) BaseAtomicModel_.__init__(self) @@ -94,7 +91,7 @@ def __init__( def init_out_stat(self): """Initialize the output bias.""" ntypes = self.get_ntypes() - self.bias_keys: List[str] = list(self.fitting_output_def().keys()) + self.bias_keys: list[str] = list(self.fitting_output_def().keys()) self.max_out_size = max( [self.atomic_output_def()[kk].size for kk in self.bias_keys] ) @@ -124,13 +121,13 @@ def __getitem__(self, key): raise KeyError(key) @torch.jit.export - def get_type_map(self) -> List[str]: + def get_type_map(self) -> list[str]: """Get the type map.""" return self.type_map def reinit_atom_exclude( self, - exclude_types: List[int] = [], + exclude_types: list[int] = [], ): self.atom_exclude_types = exclude_types if exclude_types == []: @@ -140,7 +137,7 @@ def reinit_atom_exclude( def reinit_pair_exclude( self, - exclude_types: List[Tuple[int, int]] = [], + exclude_types: list[tuple[int, int]] = [], ): self.pair_exclude_types = exclude_types if exclude_types == []: @@ -195,8 +192,8 @@ def forward_common_atomic( mapping: Optional[torch.Tensor] = None, fparam: Optional[torch.Tensor] = None, aparam: Optional[torch.Tensor] = None, - comm_dict: Optional[Dict[str, torch.Tensor]] = None, - ) -> Dict[str, torch.Tensor]: + comm_dict: Optional[dict[str, torch.Tensor]] = None, + ) -> dict[str, torch.Tensor]: """Common interface for atomic inference. 
This method accepts extended coordinates, extended atom types, neighbor list,
@@ -276,8 +273,8 @@ def forward(
         mapping: Optional[torch.Tensor] = None,
         fparam: Optional[torch.Tensor] = None,
         aparam: Optional[torch.Tensor] = None,
-        comm_dict: Optional[Dict[str, torch.Tensor]] = None,
-    ) -> Dict[str, torch.Tensor]:
+        comm_dict: Optional[dict[str, torch.Tensor]] = None,
+    ) -> dict[str, torch.Tensor]:
         return self.forward_common_atomic(
             extended_coord,
             extended_atype,
@@ -289,7 +286,7 @@ def forward(
         )
 
     def change_type_map(
-        self, type_map: List[str], model_with_new_type_stat=None
+        self, type_map: list[str], model_with_new_type_stat=None
     ) -> None:
         """Change the type related params to new ones, according to `type_map` and the original one in the model.
         If there are new types in `type_map`, statistics will be updated accordingly to `model_with_new_type_stat` for these new types.
@@ -354,7 +351,7 @@ def deserialize(cls, data: dict) -> "BaseAtomicModel":
 
     def compute_or_load_stat(
         self,
-        merged: Union[Callable[[], List[dict]], List[dict]],
+        merged: Union[Callable[[], list[dict]], list[dict]],
         stat_file_path: Optional[DPPath] = None,
     ):
         """
@@ -362,11 +359,11 @@
         Parameters
         ----------
-        merged : Union[Callable[[], List[dict]], List[dict]]
-            - List[dict]: A list of data samples from various data systems.
+        merged : Union[Callable[[], list[dict]], list[dict]]
+            - list[dict]: A list of data samples from various data systems.
                 Each element, `merged[i]`, is a data dictionary containing `keys`: `torch.Tensor`
                 originating from the `i`-th data system.
-            - Callable[[], List[dict]]: A lazy function that returns data samples in the above format
+            - Callable[[], list[dict]]: A lazy function that returns data samples in the above format
                 only when needed. Since the sampling process can be slow and memory-intensive,
                 the lazy function helps by only sampling once.
         stat_file_path : Optional[DPPath]
@@ -377,7 +374,7 @@
 
     def compute_or_load_out_stat(
         self,
-        merged: Union[Callable[[], List[dict]], List[dict]],
+        merged: Union[Callable[[], list[dict]], list[dict]],
         stat_file_path: Optional[DPPath] = None,
     ):
         """
@@ -385,11 +382,11 @@
         Parameters
         ----------
-        merged : Union[Callable[[], List[dict]], List[dict]]
-            - List[dict]: A list of data samples from various data systems.
+        merged : Union[Callable[[], list[dict]], list[dict]]
+            - list[dict]: A list of data samples from various data systems.
                 Each element, `merged[i]`, is a data dictionary containing `keys`: `torch.Tensor`
                 originating from the `i`-th data system.
-            - Callable[[], List[dict]]: A lazy function that returns data samples in the above format
+            - Callable[[], list[dict]]: A lazy function that returns data samples in the above format
                 only when needed. Since the sampling process can be slow and memory-intensive,
                 the lazy function helps by only sampling once.
         stat_file_path : Optional[DPPath]
@@ -404,7 +401,7 @@
 
     def apply_out_stat(
         self,
-        ret: Dict[str, torch.Tensor],
+        ret: dict[str, torch.Tensor],
         atype: torch.Tensor,
     ):
         """Apply the stat to each atomic output.
@@ -435,11 +432,11 @@ def change_out_bias(
 
         Parameters
         ----------
-        sample_merged : Union[Callable[[], List[dict]], List[dict]]
-            - List[dict]: A list of data samples from various data systems.
+        sample_merged : Union[Callable[[], list[dict]], list[dict]]
+            - list[dict]: A list of data samples from various data systems.
Each element, `merged[i]`, is a data dictionary containing `keys`: `torch.Tensor` originating from the `i`-th data system. - - Callable[[], List[dict]]: A lazy function that returns data samples in the above format + - Callable[[], list[dict]]: A lazy function that returns data samples in the above format only when needed. Since the sampling process can be slow and memory-intensive, the lazy function helps by only sampling once. bias_adjust_mode : str @@ -480,7 +477,9 @@ def _get_forward_wrapper_func(self) -> Callable[..., torch.Tensor]: """Get a forward wrapper of the atomic model for output bias calculation.""" def model_forward(coord, atype, box, fparam=None, aparam=None): - with torch.no_grad(): # it's essential for pure torch forward function to use auto_batchsize + with ( + torch.no_grad() + ): # it's essential for pure torch forward function to use auto_batchsize ( extended_coord, extended_atype, @@ -520,7 +519,7 @@ def _default_std(self): def _varsize( self, - shape: List[int], + shape: list[int], ) -> int: output_size = 1 len_shape = len(shape) @@ -532,7 +531,7 @@ def _get_bias_index( self, kk: str, ) -> int: - res: List[int] = [] + res: list[int] = [] for i, e in enumerate(self.bias_keys): if e == kk: res.append(i) @@ -541,8 +540,8 @@ def _get_bias_index( def _store_out_stat( self, - out_bias: Dict[str, torch.Tensor], - out_std: Dict[str, torch.Tensor], + out_bias: dict[str, torch.Tensor], + out_std: dict[str, torch.Tensor], add: bool = False, ): ntypes = self.get_ntypes() @@ -562,8 +561,8 @@ def _store_out_stat( def _fetch_out_stat( self, - keys: List[str], - ) -> Tuple[Dict[str, torch.Tensor], Dict[str, torch.Tensor]]: + keys: list[str], + ) -> tuple[dict[str, torch.Tensor], dict[str, torch.Tensor]]: ret_bias = {} ret_std = {} ntypes = self.get_ntypes() diff --git a/deepmd/pt/model/atomic_model/dipole_atomic_model.py b/deepmd/pt/model/atomic_model/dipole_atomic_model.py index 1723a30f2d..aa28294cc5 100644 --- a/deepmd/pt/model/atomic_model/dipole_atomic_model.py +++ b/deepmd/pt/model/atomic_model/dipole_atomic_model.py @@ -1,7 +1,4 @@ # SPDX-License-Identifier: LGPL-3.0-or-later -from typing import ( - Dict, -) import torch @@ -21,7 +18,7 @@ def __init__(self, descriptor, fitting, type_map, **kwargs): def apply_out_stat( self, - ret: Dict[str, torch.Tensor], + ret: dict[str, torch.Tensor], atype: torch.Tensor, ): # dipole not applying bias diff --git a/deepmd/pt/model/atomic_model/dp_atomic_model.py b/deepmd/pt/model/atomic_model/dp_atomic_model.py index 8def2e48de..edb1253234 100644 --- a/deepmd/pt/model/atomic_model/dp_atomic_model.py +++ b/deepmd/pt/model/atomic_model/dp_atomic_model.py @@ -3,8 +3,6 @@ import functools import logging from typing import ( - Dict, - List, Optional, ) @@ -52,7 +50,7 @@ def __init__( self, descriptor, fitting, - type_map: List[str], + type_map: list[str], **kwargs, ): super().__init__(type_map, **kwargs) @@ -64,6 +62,19 @@ def __init__( self.sel = self.descriptor.get_sel() self.fitting_net = fitting super().init_out_stat() + self.enable_eval_descriptor_hook = False + self.eval_descriptor_list = [] + + eval_descriptor_list: list[torch.Tensor] + + def set_eval_descriptor_hook(self, enable: bool) -> None: + """Set the hook for evaluating descriptor and clear the cache for descriptor list.""" + self.enable_eval_descriptor_hook = enable + self.eval_descriptor_list = [] + + def eval_descriptor(self) -> torch.Tensor: + """Evaluate the descriptor.""" + return torch.concat(self.eval_descriptor_list) @torch.jit.export def fitting_output_def(self) -> 
FittingOutputDef: @@ -79,7 +90,7 @@ def get_rcut(self) -> float: """Get the cut-off radius.""" return self.rcut - def get_sel(self) -> List[int]: + def get_sel(self) -> list[int]: """Get the neighbor selection.""" return self.sel @@ -96,7 +107,7 @@ def mixed_types(self) -> bool: return self.descriptor.mixed_types() def change_type_map( - self, type_map: List[str], model_with_new_type_stat=None + self, type_map: list[str], model_with_new_type_stat=None ) -> None: """Change the type related params to new ones, according to `type_map` and the original one in the model. If there are new types in `type_map`, statistics will be updated accordingly to `model_with_new_type_stat` for these new types. @@ -157,8 +168,8 @@ def forward_atomic( mapping: Optional[torch.Tensor] = None, fparam: Optional[torch.Tensor] = None, aparam: Optional[torch.Tensor] = None, - comm_dict: Optional[Dict[str, torch.Tensor]] = None, - ) -> Dict[str, torch.Tensor]: + comm_dict: Optional[dict[str, torch.Tensor]] = None, + ) -> dict[str, torch.Tensor]: """Return atomic prediction. Parameters @@ -194,6 +205,8 @@ def forward_atomic( comm_dict=comm_dict, ) assert descriptor is not None + if self.enable_eval_descriptor_hook: + self.eval_descriptor_list.append(descriptor) # energy, force fit_ret = self.fitting_net( descriptor, @@ -258,7 +271,7 @@ def get_dim_aparam(self) -> int: """Get the number (dimension) of atomic parameters of this atomic model.""" return self.fitting_net.get_dim_aparam() - def get_sel_type(self) -> List[int]: + def get_sel_type(self) -> list[int]: """Get the selected atom types of this model. Only atoms with selected atom types have atomic contribution diff --git a/deepmd/pt/model/atomic_model/linear_atomic_model.py b/deepmd/pt/model/atomic_model/linear_atomic_model.py index 3c7692212e..8d27fbcac4 100644 --- a/deepmd/pt/model/atomic_model/linear_atomic_model.py +++ b/deepmd/pt/model/atomic_model/linear_atomic_model.py @@ -2,10 +2,7 @@ import copy from typing import ( Callable, - Dict, - List, Optional, - Tuple, Union, ) @@ -51,12 +48,15 @@ class LinearEnergyAtomicModel(BaseAtomicModel): type_map : list[str] Mapping atom type to the name (str) of the type. For example `type_map[1]` gives the name of the type 1. + weights : Optional[Union[str,list[float]]] + Weights of the models. If str, must be `sum` or `mean`. If list, must be a list of float. """ def __init__( self, - models: List[BaseAtomicModel], - type_map: List[str], + models: list[BaseAtomicModel], + type_map: list[str], + weights: Optional[Union[str, list[float]]] = "mean", **kwargs, ): super().__init__(type_map, **kwargs) @@ -92,6 +92,16 @@ def __init__( ) self.nsels = torch.tensor(self.get_model_nsels(), device=env.DEVICE) # pylint: disable=no-explicit-dtype + if isinstance(weights, str): + assert weights in ["sum", "mean"] + elif isinstance(weights, list): + assert len(weights) == len(models) + else: + raise ValueError( + f"'weights' must be a string ('sum' or 'mean') or a list of float of length {len(models)}." + ) + self.weights = weights + def mixed_types(self) -> bool: """If true, the model 1. 
assumes total number of atoms aligned across frames;
@@ -119,12 +129,12 @@ def get_rcut(self) -> float:
         """Get the cut-off radius."""
         return max(self.get_model_rcuts())
 
-    def get_type_map(self) -> List[str]:
+    def get_type_map(self) -> list[str]:
         """Get the type map."""
         return self.type_map
 
     def change_type_map(
-        self, type_map: List[str], model_with_new_type_stat=None
+        self, type_map: list[str], model_with_new_type_stat=None
     ) -> None:
         """Change the type related params to new ones, according to `type_map` and the original one in the model.
         If there are new types in `type_map`, statistics will be updated accordingly to `model_with_new_type_stat` for these new types.
@@ -140,22 +150,22 @@ def change_type_map(
             else None,
         )
 
-    def get_model_rcuts(self) -> List[float]:
+    def get_model_rcuts(self) -> list[float]:
         """Get the cut-off radius for each individual model."""
         return [model.get_rcut() for model in self.models]
 
-    def get_sel(self) -> List[int]:
+    def get_sel(self) -> list[int]:
         return [max([model.get_nsel() for model in self.models])]
 
-    def get_model_nsels(self) -> List[int]:
+    def get_model_nsels(self) -> list[int]:
         """Get the processed sels for each individual model. Not distinguishing types."""
         return [model.get_nsel() for model in self.models]
 
-    def get_model_sels(self) -> List[List[int]]:
+    def get_model_sels(self) -> list[list[int]]:
         """Get the sels for each individual model."""
         return [model.get_sel() for model in self.models]
 
-    def _sort_rcuts_sels(self) -> Tuple[List[float], List[int]]:
+    def _sort_rcuts_sels(self) -> tuple[list[float], list[int]]:
         # sort the pair of rcut and sels in ascending order, first based on sel, then on rcut.
         zipped = torch.stack(
             [
@@ -168,8 +178,8 @@ def _sort_rcuts_sels(self) -> Tuple[List[float], List[int]]:
         inner_sorted = zipped[inner_sorting]
         outer_sorting = torch.argsort(inner_sorted[:, 0], stable=True)
         outer_sorted = inner_sorted[outer_sorting]
-        sorted_rcuts: List[float] = outer_sorted[:, 0].tolist()
-        sorted_sels: List[int] = outer_sorted[:, 1].to(torch.int64).tolist()
+        sorted_rcuts: list[float] = outer_sorted[:, 0].tolist()
+        sorted_sels: list[int] = outer_sorted[:, 1].to(torch.int64).tolist()
         return sorted_rcuts, sorted_sels
 
     def forward_atomic(
@@ -180,8 +190,8 @@ def forward_atomic(
         mapping: Optional[torch.Tensor] = None,
         fparam: Optional[torch.Tensor] = None,
         aparam: Optional[torch.Tensor] = None,
-        comm_dict: Optional[Dict[str, torch.Tensor]] = None,
-    ) -> Dict[str, torch.Tensor]:
+        comm_dict: Optional[dict[str, torch.Tensor]] = None,
+    ) -> dict[str, torch.Tensor]:
         """Return atomic prediction.
 
         Parameters
@@ -252,7 +262,7 @@ def forward_atomic(
 
     def apply_out_stat(
         self,
-        ret: Dict[str, torch.Tensor],
+        ret: dict[str, torch.Tensor],
         atype: torch.Tensor,
     ):
         """Apply the stat to each atomic output.
@@ -270,16 +280,16 @@ def apply_out_stat(
         return ret
 
     @staticmethod
-    def remap_atype(ori_map: List[str], new_map: List[str]) -> torch.Tensor:
+    def remap_atype(ori_map: list[str], new_map: list[str]) -> torch.Tensor:
         """
         This method is used to map the atype from the common type_map to the original type_map of individual AtomicModels. It creates an index mapping for the conversion.
 
         Parameters
         ----------
-        ori_map : List[str]
+        ori_map : list[str]
             The original type map of an AtomicModel.
-        new_map : List[str]
+        new_map : list[str]
             The common type map of the DPZBLLinearEnergyAtomicModel, created by the `get_type_map` method,
             must be a subset of the ori_map.
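The `weights` option introduced in this file maps to per-model combination coefficients in `_compute_weight` (shown in the hunks that follow): "sum" gives every model a weight of 1, "mean" gives 1/nmodels, and an explicit list is used as-is. A plain-float sketch of that bookkeeping (illustrative values only; the real code broadcasts these coefficients as (nframes, nloc, 1) tensors):

    def combination_weights(weights, nmodels: int) -> list[float]:
        # Mirrors the three branches of LinearEnergyAtomicModel._compute_weight.
        if weights == "sum":
            return [1.0] * nmodels
        if weights == "mean":
            return [1.0 / nmodels] * nmodels
        if isinstance(weights, list):
            assert len(weights) == nmodels
            return [float(w) for w in weights]
        raise ValueError("`weights` must be 'sum', 'mean', or a list of float.")


    energies = [1.0, 3.0]  # illustrative per-model atomic energies
    for spec in ("sum", "mean", [0.25, 0.75]):
        ww = combination_weights(spec, len(energies))
        print(spec, sum(w * e for w, e in zip(ww, energies)))
    # sum -> 4.0, mean -> 2.0, [0.25, 0.75] -> 2.5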
@@ -323,7 +333,7 @@ def serialize(self) -> dict: @classmethod def deserialize(cls, data: dict) -> "LinearEnergyAtomicModel": data = copy.deepcopy(data) - check_version_compatibility(data.get("@version", 2), 2, 1) + check_version_compatibility(data.pop("@version", 2), 2, 1) data.pop("@class", None) data.pop("type", None) models = [ @@ -334,16 +344,42 @@ def deserialize(cls, data: dict) -> "LinearEnergyAtomicModel": return super().deserialize(data) def _compute_weight( - self, extended_coord, extended_atype, nlists_ - ) -> List[torch.Tensor]: + self, + extended_coord: torch.Tensor, + extended_atype: torch.Tensor, + nlists_: list[torch.Tensor], + ) -> list[torch.Tensor]: """This should be a list of user defined weights that matches the number of models to be combined.""" nmodels = len(self.models) nframes, nloc, _ = nlists_[0].shape - return [ - torch.ones((nframes, nloc, 1), dtype=torch.float64, device=env.DEVICE) - / nmodels - for _ in range(nmodels) - ] + if isinstance(self.weights, str): + if self.weights == "sum": + return [ + torch.ones( + (nframes, nloc, 1), dtype=torch.float64, device=env.DEVICE + ) + for _ in range(nmodels) + ] + elif self.weights == "mean": + return [ + torch.ones( + (nframes, nloc, 1), dtype=torch.float64, device=env.DEVICE + ) + / nmodels + for _ in range(nmodels) + ] + else: + raise ValueError( + "`weights` must be 'sum' or 'mean' when provided as a string." + ) + elif isinstance(self.weights, list): + return [ + torch.ones((nframes, nloc, 1), dtype=torch.float64, device=env.DEVICE) + * w + for w in self.weights + ] + else: + raise NotImplementedError def get_dim_fparam(self) -> int: """Get the number (dimension) of frame parameters of this atomic model.""" @@ -354,7 +390,7 @@ def get_dim_aparam(self) -> int: """Get the number (dimension) of atomic parameters of this atomic model.""" return max([model.get_dim_aparam() for model in self.models]) - def get_sel_type(self) -> List[int]: + def get_sel_type(self) -> list[int]: """Get the selected atom types of this model. Only atoms with selected atom types have atomic contribution @@ -368,7 +404,9 @@ def get_sel_type(self) -> List[int]: return torch.unique( torch.cat( [ - torch.as_tensor(model.get_sel_type(), dtype=torch.int32) + torch.as_tensor( + model.get_sel_type(), dtype=torch.int64, device=env.DEVICE + ) for model in self.models ] ) @@ -383,7 +421,7 @@ def is_aparam_nall(self) -> bool: def compute_or_load_out_stat( self, - merged: Union[Callable[[], List[dict]], List[dict]], + merged: Union[Callable[[], list[dict]], list[dict]], stat_file_path: Optional[DPPath] = None, ): """ @@ -391,11 +429,11 @@ def compute_or_load_out_stat( Parameters ---------- - merged : Union[Callable[[], List[dict]], List[dict]] - - List[dict]: A list of data samples from various data systems. + merged : Union[Callable[[], list[dict]], list[dict]] + - list[dict]: A list of data samples from various data systems. Each element, `merged[i]`, is a data dictionary containing `keys`: `torch.Tensor` originating from the `i`-th data system. - - Callable[[], List[dict]]: A lazy function that returns data samples in the above format + - Callable[[], list[dict]]: A lazy function that returns data samples in the above format only when needed. Since the sampling process can be slow and memory-intensive, the lazy function helps by only sampling once. 
stat_file_path : Optional[DPPath]
@@ -456,7 +494,7 @@ def __init__(
         zbl_model: PairTabAtomicModel,
         sw_rmin: float,
         sw_rmax: float,
-        type_map: List[str],
+        type_map: list[str],
         smin_alpha: Optional[float] = 0.1,
         **kwargs,
     ):
@@ -503,13 +541,13 @@ def _compute_weight(
         self,
         extended_coord: torch.Tensor,
         extended_atype: torch.Tensor,
-        nlists_: List[torch.Tensor],
-    ) -> List[torch.Tensor]:
+        nlists_: list[torch.Tensor],
+    ) -> list[torch.Tensor]:
         """ZBL weight.
 
         Returns
         -------
-        List[torch.Tensor]
+        list[torch.Tensor]
             the atomic ZBL weight for interpolation. (nframes, nloc, 1)
         """
         assert (
diff --git a/deepmd/pt/model/atomic_model/pairtab_atomic_model.py b/deepmd/pt/model/atomic_model/pairtab_atomic_model.py
index 7ef87524dd..2918bba947 100644
--- a/deepmd/pt/model/atomic_model/pairtab_atomic_model.py
+++ b/deepmd/pt/model/atomic_model/pairtab_atomic_model.py
@@ -2,8 +2,6 @@
 import copy
 from typing import (
     Callable,
-    Dict,
-    List,
     Optional,
     Union,
 )
@@ -55,7 +53,7 @@ class PairTabAtomicModel(BaseAtomicModel):
         The cutoff radius.
     sel : int or list[int]
         The maximum number of atoms in the cut-off radius.
-    type_map : List[str]
+    type_map : list[str]
         Mapping atom type to the name (str) of the type.
         For example `type_map[1]` gives the name of the type 1.
     rcond : float, optional
@@ -69,8 +67,8 @@ def __init__(
         self,
         tab_file: str,
         rcut: float,
-        sel: Union[int, List[int]],
-        type_map: List[str],
+        sel: Union[int, list[int]],
+        type_map: list[str],
         **kwargs,
     ):
         super().__init__(type_map, **kwargs)
@@ -87,7 +85,7 @@ def __init__(
         (
             tab_info,
             tab_data,
-        ) = self.tab.get()  # this returns -> Tuple[np.array, np.array]
+        ) = self.tab.get()  # this returns -> tuple[np.array, np.array]
         nspline, ntypes_tab = tab_info[-2:].astype(int)
         self.register_buffer("tab_info", torch.from_numpy(tab_info))
         self.register_buffer(
@@ -138,10 +136,10 @@ def get_out_bias(self) -> torch.Tensor:
     def get_rcut(self) -> float:
         return self.rcut
 
-    def get_type_map(self) -> List[str]:
+    def get_type_map(self) -> list[str]:
         return self.type_map
 
-    def get_sel(self) -> List[int]:
+    def get_sel(self) -> list[int]:
         return [self.sel]
 
     def get_nsel(self) -> int:
@@ -169,7 +167,7 @@ def need_sorted_nlist_for_lower(self) -> bool:
         return False
 
     def change_type_map(
-        self, type_map: List[str], model_with_new_type_stat=None
+        self, type_map: list[str], model_with_new_type_stat=None
     ) -> None:
         """Change the type related params to new ones, according to `type_map` and the original one in the model.
         If there are new types in `type_map`, statistics will be updated accordingly to `model_with_new_type_stat` for these new types.
@@ -218,7 +216,7 @@ def deserialize(cls, data) -> "PairTabAtomicModel":
 
     def compute_or_load_stat(
         self,
-        merged: Union[Callable[[], List[dict]], List[dict]],
+        merged: Union[Callable[[], list[dict]], list[dict]],
         stat_file_path: Optional[DPPath] = None,
     ):
         """
@@ -226,11 +224,11 @@
         Parameters
         ----------
-        merged : Union[Callable[[], List[dict]], List[dict]]
-            - List[dict]: A list of data samples from various data systems.
+        merged : Union[Callable[[], list[dict]], list[dict]]
+            - list[dict]: A list of data samples from various data systems.
                 Each element, `merged[i]`, is a data dictionary containing `keys`: `torch.Tensor`
                 originating from the `i`-th data system.
-            - Callable[[], List[dict]]: A lazy function that returns data samples in the above format
+            - Callable[[], list[dict]]: A lazy function that returns data samples in the above format
                 only when needed.
Since the sampling process can be slow and memory-intensive, the lazy function helps by only sampling once. stat_file_path : Optional[DPPath] @@ -248,8 +246,8 @@ def forward_atomic( fparam: Optional[torch.Tensor] = None, aparam: Optional[torch.Tensor] = None, do_atomic_virial: bool = False, - comm_dict: Optional[Dict[str, torch.Tensor]] = None, - ) -> Dict[str, torch.Tensor]: + comm_dict: Optional[dict[str, torch.Tensor]] = None, + ) -> dict[str, torch.Tensor]: nframes, nloc, nnei = nlist.shape extended_coord = extended_coord.view(nframes, -1, 3) if self.do_grad_r() or self.do_grad_c(): @@ -470,7 +468,7 @@ def get_dim_aparam(self) -> int: """Get the number (dimension) of atomic parameters of this atomic model.""" return 0 - def get_sel_type(self) -> List[int]: + def get_sel_type(self) -> list[int]: """Get the selected atom types of this model. Only atoms with selected atom types have atomic contribution diff --git a/deepmd/pt/model/atomic_model/polar_atomic_model.py b/deepmd/pt/model/atomic_model/polar_atomic_model.py index 81cf8a23b6..39cda2650d 100644 --- a/deepmd/pt/model/atomic_model/polar_atomic_model.py +++ b/deepmd/pt/model/atomic_model/polar_atomic_model.py @@ -1,7 +1,4 @@ # SPDX-License-Identifier: LGPL-3.0-or-later -from typing import ( - Dict, -) import torch @@ -21,7 +18,7 @@ def __init__(self, descriptor, fitting, type_map, **kwargs): def apply_out_stat( self, - ret: Dict[str, torch.Tensor], + ret: dict[str, torch.Tensor], atype: torch.Tensor, ): """Apply the stat to each atomic output. diff --git a/deepmd/pt/model/atomic_model/property_atomic_model.py b/deepmd/pt/model/atomic_model/property_atomic_model.py index 1fb8a5957f..2fac90100f 100644 --- a/deepmd/pt/model/atomic_model/property_atomic_model.py +++ b/deepmd/pt/model/atomic_model/property_atomic_model.py @@ -1,7 +1,4 @@ # SPDX-License-Identifier: LGPL-3.0-or-later -from typing import ( - Dict, -) import torch @@ -21,7 +18,7 @@ def __init__(self, descriptor, fitting, type_map, **kwargs): def apply_out_stat( self, - ret: Dict[str, torch.Tensor], + ret: dict[str, torch.Tensor], atype: torch.Tensor, ): """Apply the stat to each atomic output. diff --git a/deepmd/pt/model/descriptor/descriptor.py b/deepmd/pt/model/descriptor/descriptor.py index 16c3d96301..78a4608108 100644 --- a/deepmd/pt/model/descriptor/descriptor.py +++ b/deepmd/pt/model/descriptor/descriptor.py @@ -6,8 +6,6 @@ ) from typing import ( Callable, - Dict, - List, Optional, Union, ) @@ -71,7 +69,7 @@ def get_nsel(self) -> int: pass @abstractmethod - def get_sel(self) -> List[int]: + def get_sel(self) -> list[int]: """Returns the number of selected atoms for each type.""" pass @@ -102,7 +100,7 @@ def get_env_protection(self) -> float: def compute_input_stats( self, - merged: Union[Callable[[], List[dict]], List[dict]], + merged: Union[Callable[[], list[dict]], list[dict]], path: Optional[DPPath] = None, ): """ @@ -110,11 +108,11 @@ def compute_input_stats( Parameters ---------- - merged : Union[Callable[[], List[dict]], List[dict]] - - List[dict]: A list of data samples from various data systems. + merged : Union[Callable[[], list[dict]], list[dict]] + - list[dict]: A list of data samples from various data systems. Each element, `merged[i]`, is a data dictionary containing `keys`: `torch.Tensor` originating from the `i`-th data system. - - Callable[[], List[dict]]: A lazy function that returns data samples in the above format + - Callable[[], list[dict]]: A lazy function that returns data samples in the above format only when needed. 
Since the sampling process can be slow and memory-intensive, the lazy function helps by only sampling once. path : Optional[DPPath] @@ -123,7 +121,7 @@ def compute_input_stats( """ raise NotImplementedError - def get_stats(self) -> Dict[str, StatItem]: + def get_stats(self) -> dict[str, StatItem]: """Get the statistics of the descriptor.""" raise NotImplementedError @@ -203,7 +201,7 @@ def extend_descrpt_stat(des, type_map, des_with_stat=None): ---------- des : DescriptorBlock The descriptor block to be extended. - type_map : List[str] + type_map : list[str] The name of each type of atoms to be extended. des_with_stat : DescriptorBlock, Optional The descriptor block has additional statistics of types from newly provided `type_map`. diff --git a/deepmd/pt/model/descriptor/dpa1.py b/deepmd/pt/model/descriptor/dpa1.py index 14767cb100..617e8b49b6 100644 --- a/deepmd/pt/model/descriptor/dpa1.py +++ b/deepmd/pt/model/descriptor/dpa1.py @@ -1,10 +1,7 @@ # SPDX-License-Identifier: LGPL-3.0-or-later from typing import ( Callable, - Dict, - List, Optional, - Tuple, Union, ) @@ -157,7 +154,7 @@ class DescrptDPA1(BaseDescriptor, torch.nn.Module): (Only support False to keep consistent with other backend references.) (Not used in this version. True option is not implemented.) If mask the diagonal of attention weights - exclude_types : List[List[int]] + exclude_types : list[list[int]] The excluded pairs of types which have no interaction with each other. For example, `[[0, 1]]` means no interaction between type 0 and type 1. env_protection: float @@ -191,7 +188,7 @@ class DescrptDPA1(BaseDescriptor, torch.nn.Module): Whether to use electronic configuration type embedding. use_tebd_bias : bool, Optional Whether to use bias in the type embedding layer. - type_map: List[str], Optional + type_map: list[str], Optional A list of strings. Give the name to each type of atoms. spin (Only support None to keep consistent with other backend references.) 
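The `exclude_types` option documented above turns pairs like `[[0, 1]]` into a symmetric type-pair mask, so excluded pairs contribute nothing to the descriptor. A small sketch of just that semantics (this is not the actual PairExcludeMask implementation, which additionally expands the mask over each neighbor-list entry):

    import numpy as np


    def pair_mask(ntypes: int, exclude_types: list[tuple[int, int]]) -> np.ndarray:
        # ntypes x ntypes mask: 1 = interacting pair, 0 = excluded pair.
        mask = np.ones((ntypes, ntypes), dtype=np.int32)
        for ti, tj in exclude_types:
            mask[ti, tj] = 0
            mask[tj, ti] = 0  # exclusion is symmetric
        return mask


    print(pair_mask(3, [(0, 1)]))
    # [[1 0 1]
    #  [0 1 1]
    #  [1 1 1]]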
@@ -215,7 +212,7 @@ def __init__( self, rcut: float, rcut_smth: float, - sel: Union[List[int], int], + sel: Union[list[int], int], ntypes: int, neuron: list = [25, 50, 100], axis_neuron: int = 16, @@ -229,7 +226,7 @@ def __init__( activation_function: str = "tanh", precision: str = "float64", resnet_dt: bool = False, - exclude_types: List[Tuple[int, int]] = [], + exclude_types: list[tuple[int, int]] = [], env_protection: float = 0.0, scaling_factor: int = 1.0, normalize=True, @@ -241,10 +238,10 @@ def __init__( smooth_type_embedding: bool = True, type_one_side: bool = False, stripped_type_embedding: Optional[bool] = None, - seed: Optional[Union[int, List[int]]] = None, + seed: Optional[Union[int, list[int]]] = None, use_econf_tebd: bool = False, use_tebd_bias: bool = False, - type_map: Optional[List[str]] = None, + type_map: Optional[list[str]] = None, # not implemented spin=None, type: Optional[str] = None, @@ -326,7 +323,7 @@ def get_nsel(self) -> int: """Returns the number of selected atoms in the cut-off radius.""" return self.se_atten.get_nsel() - def get_sel(self) -> List[int]: + def get_sel(self) -> list[int]: """Returns the number of selected atoms for each type.""" return self.se_atten.get_sel() @@ -334,7 +331,7 @@ def get_ntypes(self) -> int: """Returns the number of element types.""" return self.se_atten.get_ntypes() - def get_type_map(self) -> List[str]: + def get_type_map(self) -> list[str]: """Get the name to each type of atoms.""" return self.type_map @@ -405,7 +402,7 @@ def dim_emb(self): def compute_input_stats( self, - merged: Union[Callable[[], List[dict]], List[dict]], + merged: Union[Callable[[], list[dict]], list[dict]], path: Optional[DPPath] = None, ): """ @@ -413,11 +410,11 @@ def compute_input_stats( Parameters ---------- - merged : Union[Callable[[], List[dict]], List[dict]] - - List[dict]: A list of data samples from various data systems. + merged : Union[Callable[[], list[dict]], list[dict]] + - list[dict]: A list of data samples from various data systems. Each element, `merged[i]`, is a data dictionary containing `keys`: `torch.Tensor` originating from the `i`-th data system. - - Callable[[], List[dict]]: A lazy function that returns data samples in the above format + - Callable[[], list[dict]]: A lazy function that returns data samples in the above format only when needed. Since the sampling process can be slow and memory-intensive, the lazy function helps by only sampling once. path : Optional[DPPath] @@ -435,12 +432,12 @@ def set_stat_mean_and_stddev( self.se_atten.mean = mean self.se_atten.stddev = stddev - def get_stat_mean_and_stddev(self) -> Tuple[torch.Tensor, torch.Tensor]: + def get_stat_mean_and_stddev(self) -> tuple[torch.Tensor, torch.Tensor]: """Get mean and stddev for descriptor.""" return self.se_atten.mean, self.se_atten.stddev def change_type_map( - self, type_map: List[str], model_with_new_type_stat=None + self, type_map: list[str], model_with_new_type_stat=None ) -> None: """Change the type related params to new ones, according to `type_map` and the original one in the model. If there are new types in `type_map`, statistics will be updated accordingly to `model_with_new_type_stat` for these new types. @@ -562,7 +559,7 @@ def forward( extended_atype: torch.Tensor, nlist: torch.Tensor, mapping: Optional[torch.Tensor] = None, - comm_dict: Optional[Dict[str, torch.Tensor]] = None, + comm_dict: Optional[dict[str, torch.Tensor]] = None, ): """Compute the descriptor. 
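Hunks like the ones surrounding this point repeat one mechanical change: container annotations imported from typing (List, Dict, Tuple, Type) become the built-in generics standardized by PEP 585, which require Python 3.9 or newer, while Optional and Union still come from typing. A before/after sketch (the function name and body are hypothetical, chosen only to show both styles):

    # Before: container generics imported from typing (deprecated since 3.9).
    from typing import Dict, List, Optional, Tuple


    def get_stats_old(names: List[str]) -> Tuple[Dict[str, float], Optional[int]]:
        return {name: 0.0 for name in names}, None


    # After: built-in generics (PEP 585); only Optional still needs typing.
    from typing import Optional


    def get_stats_new(names: list[str]) -> tuple[dict[str, float], Optional[int]]:
        return {name: 0.0 for name in names}, None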
@@ -617,9 +614,9 @@ def forward( def update_sel( cls, train_data: DeepmdDataSystem, - type_map: Optional[List[str]], + type_map: Optional[list[str]], local_jdata: dict, - ) -> Tuple[dict, Optional[float]]: + ) -> tuple[dict, Optional[float]]: """Update the selection and perform neighbor statistics. Parameters diff --git a/deepmd/pt/model/descriptor/dpa2.py b/deepmd/pt/model/descriptor/dpa2.py index 9fc4fc4a21..f1ef200b09 100644 --- a/deepmd/pt/model/descriptor/dpa2.py +++ b/deepmd/pt/model/descriptor/dpa2.py @@ -1,10 +1,7 @@ # SPDX-License-Identifier: LGPL-3.0-or-later from typing import ( Callable, - Dict, - List, Optional, - Tuple, Union, ) @@ -87,14 +84,14 @@ def __init__( concat_output_tebd: bool = True, precision: str = "float64", smooth: bool = True, - exclude_types: List[Tuple[int, int]] = [], + exclude_types: list[tuple[int, int]] = [], env_protection: float = 0.0, trainable: bool = True, - seed: Optional[Union[int, List[int]]] = None, + seed: Optional[Union[int, list[int]]] = None, add_tebd_to_repinit_out: bool = False, use_econf_tebd: bool = False, use_tebd_bias: bool = False, - type_map: Optional[List[str]] = None, + type_map: Optional[list[str]] = None, old_impl: bool = False, ): r"""The DPA-2 descriptor. see https://arxiv.org/abs/2312.15492. @@ -111,7 +108,7 @@ def __init__( The precision of the embedding net parameters. smooth : bool, optional Whether to use smoothness in processes such as attention weights calculation. - exclude_types : List[List[int]], optional + exclude_types : list[list[int]], optional The excluded pairs of types which have no interaction with each other. For example, `[[0, 1]]` means no interaction between type 0 and type 1. env_protection : float, optional @@ -127,7 +124,7 @@ def __init__( Whether to use electronic configuration type embedding. use_tebd_bias : bool, Optional Whether to use bias in the type embedding layer. - type_map : List[str], Optional + type_map : list[str], Optional A list of strings. Give the name to each type of atoms. Returns @@ -324,7 +321,7 @@ def get_nsel(self) -> int: """Returns the number of selected atoms in the cut-off radius.""" return sum(self.sel) - def get_sel(self) -> List[int]: + def get_sel(self) -> list[int]: """Returns the number of selected atoms for each type.""" return self.sel @@ -332,7 +329,7 @@ def get_ntypes(self) -> int: """Returns the number of element types.""" return self.ntypes - def get_type_map(self) -> List[str]: + def get_type_map(self) -> list[str]: """Get the name to each type of atoms.""" return self.type_map @@ -423,7 +420,7 @@ def share_params(self, base_class, shared_level, resume=False): raise NotImplementedError def change_type_map( - self, type_map: List[str], model_with_new_type_stat=None + self, type_map: list[str], model_with_new_type_stat=None ) -> None: """Change the type related params to new ones, according to `type_map` and the original one in the model. If there are new types in `type_map`, statistics will be updated accordingly to `model_with_new_type_stat` for these new types. @@ -488,7 +485,7 @@ def dim_emb(self): def compute_input_stats( self, - merged: Union[Callable[[], List[dict]], List[dict]], + merged: Union[Callable[[], list[dict]], list[dict]], path: Optional[DPPath] = None, ): """ @@ -496,11 +493,11 @@ def compute_input_stats( Parameters ---------- - merged : Union[Callable[[], List[dict]], List[dict]] - - List[dict]: A list of data samples from various data systems. 
+ merged : Union[Callable[[], list[dict]], list[dict]] + - list[dict]: A list of data samples from various data systems. Each element, `merged[i]`, is a data dictionary containing `keys`: `torch.Tensor` originating from the `i`-th data system. - - Callable[[], List[dict]]: A lazy function that returns data samples in the above format + - Callable[[], list[dict]]: A lazy function that returns data samples in the above format only when needed. Since the sampling process can be slow and memory-intensive, the lazy function helps by only sampling once. path : Optional[DPPath] @@ -515,8 +512,8 @@ def compute_input_stats( def set_stat_mean_and_stddev( self, - mean: List[torch.Tensor], - stddev: List[torch.Tensor], + mean: list[torch.Tensor], + stddev: list[torch.Tensor], ) -> None: """Update mean and stddev for descriptor.""" descrpt_list = [self.repinit, self.repformers] @@ -526,7 +523,7 @@ def set_stat_mean_and_stddev( descrpt.mean = mean[ii] descrpt.stddev = stddev[ii] - def get_stat_mean_and_stddev(self) -> Tuple[List[torch.Tensor], List[torch.Tensor]]: + def get_stat_mean_and_stddev(self) -> tuple[list[torch.Tensor], list[torch.Tensor]]: """Get mean and stddev for descriptor.""" mean_list = [self.repinit.mean, self.repformers.mean] stddev_list = [ @@ -711,7 +708,7 @@ def forward( extended_atype: torch.Tensor, nlist: torch.Tensor, mapping: Optional[torch.Tensor] = None, - comm_dict: Optional[Dict[str, torch.Tensor]] = None, + comm_dict: Optional[dict[str, torch.Tensor]] = None, ): """Compute the descriptor. @@ -816,9 +813,9 @@ def forward( def update_sel( cls, train_data: DeepmdDataSystem, - type_map: Optional[List[str]], + type_map: Optional[list[str]], local_jdata: dict, - ) -> Tuple[dict, Optional[float]]: + ) -> tuple[dict, Optional[float]]: """Update the selection and perform neighbor statistics. Parameters diff --git a/deepmd/pt/model/descriptor/gaussian_lcc.py b/deepmd/pt/model/descriptor/gaussian_lcc.py index 2ae14bd432..8ac52215c0 100644 --- a/deepmd/pt/model/descriptor/gaussian_lcc.py +++ b/deepmd/pt/model/descriptor/gaussian_lcc.py @@ -1,6 +1,5 @@ # SPDX-License-Identifier: LGPL-3.0-or-later from typing import ( - List, Optional, ) @@ -162,7 +161,7 @@ def dim_emb(self): """Returns the output dimension of pair representation.""" return self.pair_embed_dim - def compute_input_stats(self, merged: List[dict], path: Optional[DPPath] = None): + def compute_input_stats(self, merged: list[dict], path: Optional[DPPath] = None): """Update mean and stddev for descriptor elements.""" pass diff --git a/deepmd/pt/model/descriptor/hybrid.py b/deepmd/pt/model/descriptor/hybrid.py index 7156396c48..c8730e3465 100644 --- a/deepmd/pt/model/descriptor/hybrid.py +++ b/deepmd/pt/model/descriptor/hybrid.py @@ -2,10 +2,7 @@ import math from typing import ( Any, - Dict, - List, Optional, - Tuple, Union, ) @@ -38,16 +35,16 @@ class DescrptHybrid(BaseDescriptor, torch.nn.Module): Parameters ---------- - list : list : List[Union[BaseDescriptor, Dict[str, Any]]] + list : list : list[Union[BaseDescriptor, dict[str, Any]]] Build a descriptor from the concatenation of the list of descriptors. The descriptor can be either an object or a dictionary. """ - nlist_cut_idx: List[torch.Tensor] + nlist_cut_idx: list[torch.Tensor] def __init__( self, - list: List[Union[BaseDescriptor, Dict[str, Any]]], + list: list[Union[BaseDescriptor, dict[str, Any]]], **kwargs, ) -> None: super().__init__() @@ -57,7 +54,7 @@ def __init__( raise RuntimeError( "cannot build descriptor from an empty list of descriptors." 
)
-        formatted_descript_list: List[BaseDescriptor] = []
+        formatted_descript_list: list[BaseDescriptor] = []
         for ii in descrpt_list:
             if isinstance(ii, BaseDescriptor):
                 formatted_descript_list.append(ii)
@@ -75,7 +72,7 @@ def __init__(
                 self.descrpt_list[ii].get_ntypes() == self.descrpt_list[0].get_ntypes()
             ), f"number of atom types in {ii}th descriptor does not match others"
         # if hybrid sel is larger than sub sel, the nlist needs to be cut for each type
-        self.nlist_cut_idx: List[torch.Tensor] = []
+        self.nlist_cut_idx: list[torch.Tensor] = []
         if self.mixed_types() and not all(
             descrpt.mixed_types() for descrpt in self.descrpt_list
         ):
@@ -114,7 +111,7 @@ def get_rcut_smth(self) -> float:
         # Note: Using the minimum rcut_smth might not be appropriate in all scenarios. Consider using a different approach or provide detailed documentation on why the minimum value is chosen.
         return min([descrpt.get_rcut_smth() for descrpt in self.descrpt_list])
 
-    def get_sel(self) -> List[int]:
+    def get_sel(self) -> list[int]:
         """Returns the number of selected atoms for each type."""
         if self.mixed_types():
             return [
@@ -131,7 +128,7 @@ def get_ntypes(self) -> int:
         """Returns the number of element types."""
         return self.descrpt_list[0].get_ntypes()
 
-    def get_type_map(self) -> List[str]:
+    def get_type_map(self) -> list[str]:
         """Get the name to each type of atoms."""
         return self.descrpt_list[0].get_type_map()
@@ -185,7 +182,7 @@ def share_params(self, base_class, shared_level, resume=False):
         raise NotImplementedError
 
     def change_type_map(
-        self, type_map: List[str], model_with_new_type_stat=None
+        self, type_map: list[str], model_with_new_type_stat=None
     ) -> None:
         """Change the type related params to new ones, according to `type_map` and the original one in the model.
         If there are new types in `type_map`, statistics will be updated accordingly to `model_with_new_type_stat` for these new types.
@@ -198,15 +195,15 @@ def change_type_map(
             else None,
         )
 
-    def compute_input_stats(self, merged: List[dict], path: Optional[DPPath] = None):
+    def compute_input_stats(self, merged: list[dict], path: Optional[DPPath] = None):
         """Update mean and stddev for descriptor elements."""
         for descrpt in self.descrpt_list:
             descrpt.compute_input_stats(merged, path)
 
     def set_stat_mean_and_stddev(
         self,
-        mean: List[Union[torch.Tensor, List[torch.Tensor]]],
-        stddev: List[Union[torch.Tensor, List[torch.Tensor]]],
+        mean: list[Union[torch.Tensor, list[torch.Tensor]]],
+        stddev: list[Union[torch.Tensor, list[torch.Tensor]]],
     ) -> None:
         """Update mean and stddev for descriptor."""
         for ii, descrpt in enumerate(self.descrpt_list):
@@ -214,9 +211,9 @@ def set_stat_mean_and_stddev(
 
     def get_stat_mean_and_stddev(
         self,
-    ) -> Tuple[
-        List[Union[torch.Tensor, List[torch.Tensor]]],
-        List[Union[torch.Tensor, List[torch.Tensor]]],
+    ) -> tuple[
+        list[Union[torch.Tensor, list[torch.Tensor]]],
+        list[Union[torch.Tensor, list[torch.Tensor]]],
     ]:
         """Get mean and stddev for descriptor."""
         mean_list = []
@@ -233,7 +230,7 @@ def forward(
         atype_ext: torch.Tensor,
         nlist: torch.Tensor,
         mapping: Optional[torch.Tensor] = None,
-        comm_dict: Optional[Dict[str, torch.Tensor]] = None,
+        comm_dict: Optional[dict[str, torch.Tensor]] = None,
     ):
         """Compute the descriptor.
 
@@ -303,9 +300,9 @@ def forward(
     def update_sel(
         cls,
         train_data: DeepmdDataSystem,
-        type_map: Optional[List[str]],
+        type_map: Optional[list[str]],
         local_jdata: dict,
-    ) -> Tuple[dict, Optional[float]]:
+    ) -> tuple[dict, Optional[float]]:
         """Update the selection and perform neighbor statistics.
Parameters diff --git a/deepmd/pt/model/descriptor/repformer_layer.py b/deepmd/pt/model/descriptor/repformer_layer.py index 579dc0c81e..92e2404469 100644 --- a/deepmd/pt/model/descriptor/repformer_layer.py +++ b/deepmd/pt/model/descriptor/repformer_layer.py @@ -1,6 +1,5 @@ # SPDX-License-Identifier: LGPL-3.0-or-later from typing import ( - List, Optional, Union, ) @@ -44,7 +43,7 @@ def get_residual( _mode: str = "norm", trainable: bool = True, precision: str = "float64", - seed: Optional[Union[int, List[int]]] = None, + seed: Optional[Union[int, list[int]]] = None, ) -> torch.Tensor: r""" Get residual tensor for one update vector. @@ -160,7 +159,7 @@ def __init__( smooth: bool = True, attnw_shift: float = 20.0, precision: str = "float64", - seed: Optional[Union[int, List[int]]] = None, + seed: Optional[Union[int, list[int]]] = None, ): """Return neighbor-wise multi-head self-attention maps, with gate mechanism.""" super().__init__() @@ -285,7 +284,7 @@ def __init__( input_dim: int, head_num: int, precision: str = "float64", - seed: Optional[Union[int, List[int]]] = None, + seed: Optional[Union[int, list[int]]] = None, ): super().__init__() self.input_dim = input_dim @@ -370,7 +369,7 @@ def __init__( input_dim: int, head_num: int, precision: str = "float64", - seed: Optional[Union[int, List[int]]] = None, + seed: Optional[Union[int, list[int]]] = None, ): super().__init__() self.input_dim = input_dim @@ -443,7 +442,7 @@ def __init__( smooth: bool = True, attnw_shift: float = 20.0, precision: str = "float64", - seed: Optional[Union[int, List[int]]] = None, + seed: Optional[Union[int, list[int]]] = None, ): super().__init__() self.input_dim = input_dim @@ -602,7 +601,7 @@ def __init__( use_sqrt_nnei: bool = True, g1_out_conv: bool = True, g1_out_mlp: bool = True, - seed: Optional[Union[int, List[int]]] = None, + seed: Optional[Union[int, list[int]]] = None, ): super().__init__() self.epsilon = 1e-4 # protection of 1./nnei @@ -1132,10 +1131,10 @@ def forward( assert (nb, nloc) == g1.shape[:2] assert (nb, nloc, nnei) == h2.shape[:3] - g2_update: List[torch.Tensor] = [g2] - h2_update: List[torch.Tensor] = [h2] - g1_update: List[torch.Tensor] = [g1] - g1_mlp: List[torch.Tensor] = [g1] if not self.g1_out_mlp else [] + g2_update: list[torch.Tensor] = [g2] + h2_update: list[torch.Tensor] = [h2] + g1_update: list[torch.Tensor] = [g1] + g1_mlp: list[torch.Tensor] = [g1] if not self.g1_out_mlp else [] if self.g1_out_mlp: assert self.g1_self_mlp is not None g1_self_mlp = self.act(self.g1_self_mlp(g1)) @@ -1236,7 +1235,7 @@ def forward( @torch.jit.export def list_update_res_avg( self, - update_list: List[torch.Tensor], + update_list: list[torch.Tensor], ) -> torch.Tensor: nitem = len(update_list) uu = update_list[0] @@ -1245,7 +1244,7 @@ def list_update_res_avg( return uu / (float(nitem) ** 0.5) @torch.jit.export - def list_update_res_incr(self, update_list: List[torch.Tensor]) -> torch.Tensor: + def list_update_res_incr(self, update_list: list[torch.Tensor]) -> torch.Tensor: nitem = len(update_list) uu = update_list[0] scale = 1.0 / (float(nitem - 1) ** 0.5) if nitem > 1 else 0.0 @@ -1255,7 +1254,7 @@ def list_update_res_incr(self, update_list: List[torch.Tensor]) -> torch.Tensor: @torch.jit.export def list_update_res_residual( - self, update_list: List[torch.Tensor], update_name: str = "g1" + self, update_list: list[torch.Tensor], update_name: str = "g1" ) -> torch.Tensor: nitem = len(update_list) uu = update_list[0] @@ -1275,7 +1274,7 @@ def list_update_res_residual( @torch.jit.export def list_update( 
- self, update_list: List[torch.Tensor], update_name: str = "g1" + self, update_list: list[torch.Tensor], update_name: str = "g1" ) -> torch.Tensor: if self.update_style == "res_avg": return self.list_update_res_avg(update_list) diff --git a/deepmd/pt/model/descriptor/repformer_layer_old_impl.py b/deepmd/pt/model/descriptor/repformer_layer_old_impl.py index 81ee35c9ab..47b20f7b03 100644 --- a/deepmd/pt/model/descriptor/repformer_layer_old_impl.py +++ b/deepmd/pt/model/descriptor/repformer_layer_old_impl.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: LGPL-3.0-or-later from typing import ( Callable, - List, ) import torch @@ -634,10 +633,10 @@ def forward( if self.update_h2: h2 = _apply_h_norm(h2) - g2_update: List[torch.Tensor] = [g2] - h2_update: List[torch.Tensor] = [h2] - g1_update: List[torch.Tensor] = [g1] - g1_mlp: List[torch.Tensor] = [g1] + g2_update: list[torch.Tensor] = [g2] + h2_update: list[torch.Tensor] = [h2] + g1_update: list[torch.Tensor] = [g1] + g1_mlp: list[torch.Tensor] = [g1] if cal_gg1: gg1 = _make_nei_g1(g1_ext, nlist) @@ -704,7 +703,7 @@ def forward( @torch.jit.export def list_update_res_avg( self, - update_list: List[torch.Tensor], + update_list: list[torch.Tensor], ) -> torch.Tensor: nitem = len(update_list) uu = update_list[0] @@ -713,7 +712,7 @@ def list_update_res_avg( return uu / (float(nitem) ** 0.5) @torch.jit.export - def list_update_res_incr(self, update_list: List[torch.Tensor]) -> torch.Tensor: + def list_update_res_incr(self, update_list: list[torch.Tensor]) -> torch.Tensor: nitem = len(update_list) uu = update_list[0] scale = 1.0 / (float(nitem - 1) ** 0.5) if nitem > 1 else 0.0 @@ -722,7 +721,7 @@ def list_update_res_incr(self, update_list: List[torch.Tensor]) -> torch.Tensor: return uu @torch.jit.export - def list_update(self, update_list: List[torch.Tensor]) -> torch.Tensor: + def list_update(self, update_list: list[torch.Tensor]) -> torch.Tensor: if self.update_style == "res_avg": return self.list_update_res_avg(update_list) elif self.update_style == "res_incr": diff --git a/deepmd/pt/model/descriptor/repformers.py b/deepmd/pt/model/descriptor/repformers.py index a9e4ef7893..64965825a0 100644 --- a/deepmd/pt/model/descriptor/repformers.py +++ b/deepmd/pt/model/descriptor/repformers.py @@ -1,10 +1,7 @@ # SPDX-License-Identifier: LGPL-3.0-or-later from typing import ( Callable, - Dict, - List, Optional, - Tuple, Union, ) @@ -60,7 +57,8 @@ def border_op( argument8, ) -> torch.Tensor: raise NotImplementedError( - "border_op is not available since customized PyTorch OP library is not built when freezing the model." + "border_op is not available since customized PyTorch OP library is not built when freezing the model. " + "See documentation for DPA-2 for details." ) # Note: this hack cannot actually save a model that can be runned using LAMMPS. @@ -99,12 +97,12 @@ def __init__( update_residual_init: str = "norm", set_davg_zero: bool = True, smooth: bool = True, - exclude_types: List[Tuple[int, int]] = [], + exclude_types: list[tuple[int, int]] = [], env_protection: float = 0.0, precision: str = "float64", trainable_ln: bool = True, ln_eps: Optional[float] = 1e-5, - seed: Optional[Union[int, List[int]]] = None, + seed: Optional[Union[int, list[int]]] = None, use_sqrt_nnei: bool = True, g1_out_conv: bool = True, g1_out_mlp: bool = True, @@ -177,7 +175,7 @@ def __init__( The precision of the embedding net parameters. smooth : bool, optional Whether to use smoothness in processes such as attention weights calculation. 
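Aside: the `res_avg` and `res_incr` update styles dispatched by `list_update` above reduce a list of update tensors with 1/sqrt(n)-type scaling so the variance of the accumulated update stays controlled. A self-contained restatement of the two reductions on toy tensors (outside the module, for illustration only):

import torch

def res_avg(update_list: list[torch.Tensor]) -> torch.Tensor:
    # Sum all updates, then scale by 1/sqrt(n), as in list_update_res_avg.
    out = update_list[0]
    for uu in update_list[1:]:
        out = out + uu
    return out / float(len(update_list)) ** 0.5

def res_incr(update_list: list[torch.Tensor]) -> torch.Tensor:
    # Keep the first item at full weight and scale the increments by
    # 1/sqrt(n - 1), as in list_update_res_incr.
    nitem = len(update_list)
    scale = 1.0 / float(nitem - 1) ** 0.5 if nitem > 1 else 0.0
    out = update_list[0]
    for uu in update_list[1:]:
        out = out + scale * uu
    return out

ups = [torch.ones(2, 2), torch.ones(2, 2)]
print(res_avg(ups))   # every element ~1.414 (= 2 / sqrt(2))
print(res_incr(ups))  # every element 2.0 (increment scale is 1 for n = 2)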
- exclude_types : List[List[int]], optional + exclude_types : list[list[int]], optional The excluded pairs of types which have no interaction with each other. For example, `[[0, 1]]` means no interaction between type 0 and type 1. env_protection : float, optional @@ -339,7 +337,7 @@ def get_nsel(self) -> int: """Returns the number of selected atoms in the cut-off radius.""" return sum(self.sel) - def get_sel(self) -> List[int]: + def get_sel(self) -> list[int]: """Returns the number of selected atoms for each type.""" return self.sel @@ -408,7 +406,7 @@ def dim_emb(self): def reinit_exclude( self, - exclude_types: List[Tuple[int, int]] = [], + exclude_types: list[tuple[int, int]] = [], ): self.exclude_types = exclude_types self.emask = PairExcludeMask(self.ntypes, exclude_types=exclude_types) @@ -420,7 +418,7 @@ def forward( extended_atype: torch.Tensor, extended_atype_embd: Optional[torch.Tensor] = None, mapping: Optional[torch.Tensor] = None, - comm_dict: Optional[Dict[str, torch.Tensor]] = None, + comm_dict: Optional[dict[str, torch.Tensor]] = None, ): if comm_dict is None: assert mapping is not None @@ -530,7 +528,7 @@ def forward( def compute_input_stats( self, - merged: Union[Callable[[], List[dict]], List[dict]], + merged: Union[Callable[[], list[dict]], list[dict]], path: Optional[DPPath] = None, ): """ @@ -538,11 +536,11 @@ def compute_input_stats( Parameters ---------- - merged : Union[Callable[[], List[dict]], List[dict]] - - List[dict]: A list of data samples from various data systems. + merged : Union[Callable[[], list[dict]], list[dict]] + - list[dict]: A list of data samples from various data systems. Each element, `merged[i]`, is a data dictionary containing `keys`: `torch.Tensor` originating from the `i`-th data system. - - Callable[[], List[dict]]: A lazy function that returns data samples in the above format + - Callable[[], list[dict]]: A lazy function that returns data samples in the above format only when needed. Since the sampling process can be slow and memory-intensive, the lazy function helps by only sampling once. 
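Aside on the `merged` contract documented here: a minimal sketch, with a toy sampler standing in for the real data systems, of why both a materialized list and a zero-argument callable are accepted; the callable form lets `compute_input_stats` trigger the slow, memory-hungry sampling at most once and only when needed.

from typing import Callable, Union

import torch

def resolve_merged(
    merged: Union[Callable[[], list[dict]], list[dict]],
) -> list[dict]:
    # Call the lazy sampler only when the samples are actually needed;
    # an already-materialized list passes through unchanged.
    return merged() if callable(merged) else merged

def lazy_sampler() -> list[dict]:
    # Toy stand-in for the expensive per-data-system sampling step.
    return [
        {"coord": torch.zeros(1, 4, 3), "atype": torch.zeros(1, 4, dtype=torch.long)}
    ]

print(len(resolve_merged(lazy_sampler)))    # 1, sampled on demand
print(len(resolve_merged(lazy_sampler())))  # 1, sampled eagerly by the caller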
path : Optional[DPPath] @@ -567,7 +565,7 @@ def compute_input_stats( self.mean.copy_(torch.tensor(mean, device=env.DEVICE)) # pylint: disable=no-explicit-dtype self.stddev.copy_(torch.tensor(stddev, device=env.DEVICE)) # pylint: disable=no-explicit-dtype - def get_stats(self) -> Dict[str, StatItem]: + def get_stats(self) -> dict[str, StatItem]: """Get the statistics of the descriptor.""" if self.stats is None: raise RuntimeError( diff --git a/deepmd/pt/model/descriptor/se_a.py b/deepmd/pt/model/descriptor/se_a.py index 44564a6fd3..1b51acfa21 100644 --- a/deepmd/pt/model/descriptor/se_a.py +++ b/deepmd/pt/model/descriptor/se_a.py @@ -3,10 +3,7 @@ from typing import ( Callable, ClassVar, - Dict, - List, Optional, - Tuple, Union, ) @@ -84,14 +81,14 @@ def __init__( activation_function: str = "tanh", precision: str = "float64", resnet_dt: bool = False, - exclude_types: List[Tuple[int, int]] = [], + exclude_types: list[tuple[int, int]] = [], env_protection: float = 0.0, old_impl: bool = False, type_one_side: bool = True, trainable: bool = True, - seed: Optional[Union[int, List[int]]] = None, + seed: Optional[Union[int, list[int]]] = None, ntypes: Optional[int] = None, # to be compat with input - type_map: Optional[List[str]] = None, + type_map: Optional[list[str]] = None, # not implemented spin=None, ): @@ -130,7 +127,7 @@ def get_nsel(self) -> int: """Returns the number of selected atoms in the cut-off radius.""" return self.sea.get_nsel() - def get_sel(self) -> List[int]: + def get_sel(self) -> list[int]: """Returns the number of selected atoms for each type.""" return self.sea.get_sel() @@ -138,7 +135,7 @@ def get_ntypes(self) -> int: """Returns the number of element types.""" return self.sea.get_ntypes() - def get_type_map(self) -> List[str]: + def get_type_map(self) -> list[str]: """Get the name to each type of atoms.""" return self.type_map @@ -192,7 +189,7 @@ def dim_out(self): return self.sea.dim_out def change_type_map( - self, type_map: List[str], model_with_new_type_stat=None + self, type_map: list[str], model_with_new_type_stat=None ) -> None: """Change the type related params to new ones, according to `type_map` and the original one in the model. If there are new types in `type_map`, statistics will be updated accordingly to `model_with_new_type_stat` for these new types. @@ -205,7 +202,7 @@ def change_type_map( def compute_input_stats( self, - merged: Union[Callable[[], List[dict]], List[dict]], + merged: Union[Callable[[], list[dict]], list[dict]], path: Optional[DPPath] = None, ): """ @@ -213,11 +210,11 @@ def compute_input_stats( Parameters ---------- - merged : Union[Callable[[], List[dict]], List[dict]] - - List[dict]: A list of data samples from various data systems. + merged : Union[Callable[[], list[dict]], list[dict]] + - list[dict]: A list of data samples from various data systems. Each element, `merged[i]`, is a data dictionary containing `keys`: `torch.Tensor` originating from the `i`-th data system. - - Callable[[], List[dict]]: A lazy function that returns data samples in the above format + - Callable[[], list[dict]]: A lazy function that returns data samples in the above format only when needed. Since the sampling process can be slow and memory-intensive, the lazy function helps by only sampling once. 
path : Optional[DPPath] @@ -228,7 +225,7 @@ def compute_input_stats( def reinit_exclude( self, - exclude_types: List[Tuple[int, int]] = [], + exclude_types: list[tuple[int, int]] = [], ): """Update the type exclusions.""" self.sea.reinit_exclude(exclude_types) @@ -239,7 +236,7 @@ def forward( atype_ext: torch.Tensor, nlist: torch.Tensor, mapping: Optional[torch.Tensor] = None, - comm_dict: Optional[Dict[str, torch.Tensor]] = None, + comm_dict: Optional[dict[str, torch.Tensor]] = None, ): """Compute the descriptor. @@ -284,7 +281,7 @@ def set_stat_mean_and_stddev( self.sea.mean = mean self.sea.stddev = stddev - def get_stat_mean_and_stddev(self) -> Tuple[torch.Tensor, torch.Tensor]: + def get_stat_mean_and_stddev(self) -> tuple[torch.Tensor, torch.Tensor]: """Get mean and stddev for descriptor.""" return self.sea.mean, self.sea.stddev @@ -342,9 +339,9 @@ def t_cvt(xx): def update_sel( cls, train_data: DeepmdDataSystem, - type_map: Optional[List[str]], + type_map: Optional[list[str]], local_jdata: dict, - ) -> Tuple[dict, Optional[float]]: + ) -> tuple[dict, Optional[float]]: """Update the selection and perform neighbor statistics. Parameters @@ -386,12 +383,12 @@ def __init__( activation_function: str = "tanh", precision: str = "float64", resnet_dt: bool = False, - exclude_types: List[Tuple[int, int]] = [], + exclude_types: list[tuple[int, int]] = [], env_protection: float = 0.0, old_impl: bool = False, type_one_side: bool = True, trainable: bool = True, - seed: Optional[Union[int, List[int]]] = None, + seed: Optional[Union[int, list[int]]] = None, **kwargs, ): """Construct an embedding net of type `se_a`. @@ -484,7 +481,7 @@ def get_nsel(self) -> int: """Returns the number of selected atoms in the cut-off radius.""" return sum(self.sel) - def get_sel(self) -> List[int]: + def get_sel(self) -> list[int]: """Returns the number of selected atoms for each type.""" return self.sel @@ -548,7 +545,7 @@ def __getitem__(self, key): def compute_input_stats( self, - merged: Union[Callable[[], List[dict]], List[dict]], + merged: Union[Callable[[], list[dict]], list[dict]], path: Optional[DPPath] = None, ): """ @@ -556,11 +553,11 @@ def compute_input_stats( Parameters ---------- - merged : Union[Callable[[], List[dict]], List[dict]] - - List[dict]: A list of data samples from various data systems. + merged : Union[Callable[[], list[dict]], list[dict]] + - list[dict]: A list of data samples from various data systems. Each element, `merged[i]`, is a data dictionary containing `keys`: `torch.Tensor` originating from the `i`-th data system. - - Callable[[], List[dict]]: A lazy function that returns data samples in the above format + - Callable[[], list[dict]]: A lazy function that returns data samples in the above format only when needed. Since the sampling process can be slow and memory-intensive, the lazy function helps by only sampling once. 
path : Optional[DPPath] @@ -585,7 +582,7 @@ def compute_input_stats( self.mean.copy_(torch.tensor(mean, device=env.DEVICE)) # pylint: disable=no-explicit-dtype self.stddev.copy_(torch.tensor(stddev, device=env.DEVICE)) # pylint: disable=no-explicit-dtype - def get_stats(self) -> Dict[str, StatItem]: + def get_stats(self) -> dict[str, StatItem]: """Get the statistics of the descriptor.""" if self.stats is None: raise RuntimeError( @@ -595,7 +592,7 @@ def get_stats(self) -> Dict[str, StatItem]: def reinit_exclude( self, - exclude_types: List[Tuple[int, int]] = [], + exclude_types: list[tuple[int, int]] = [], ): self.exclude_types = exclude_types self.emask = PairExcludeMask(self.ntypes, exclude_types=exclude_types) diff --git a/deepmd/pt/model/descriptor/se_atten.py b/deepmd/pt/model/descriptor/se_atten.py index 92d6e223e4..c760f7330b 100644 --- a/deepmd/pt/model/descriptor/se_atten.py +++ b/deepmd/pt/model/descriptor/se_atten.py @@ -1,10 +1,7 @@ # SPDX-License-Identifier: LGPL-3.0-or-later from typing import ( Callable, - Dict, - List, Optional, - Tuple, Union, ) @@ -63,7 +60,7 @@ def __init__( self, rcut: float, rcut_smth: float, - sel: Union[List[int], int], + sel: Union[list[int], int], ntypes: int, neuron: list = [25, 50, 100], axis_neuron: int = 16, @@ -82,11 +79,11 @@ def __init__( temperature=None, smooth: bool = True, type_one_side: bool = False, - exclude_types: List[Tuple[int, int]] = [], + exclude_types: list[tuple[int, int]] = [], env_protection: float = 0.0, trainable_ln: bool = True, ln_eps: Optional[float] = 1e-5, - seed: Optional[Union[int, List[int]]] = None, + seed: Optional[Union[int, list[int]]] = None, type: Optional[str] = None, old_impl: bool = False, ): @@ -134,7 +131,7 @@ def __init__( (Only support False to keep consistent with other backend references.) (Not used in this version.) If mask the diagonal of attention weights - exclude_types : List[List[int]] + exclude_types : list[list[int]] The excluded pairs of types which have no interaction with each other. For example, `[[0, 1]]` means no interaction between type 0 and type 1. env_protection : float @@ -304,7 +301,7 @@ def get_nsel(self) -> int: """Returns the number of selected atoms in the cut-off radius.""" return sum(self.sel) - def get_sel(self) -> List[int]: + def get_sel(self) -> list[int]: """Returns the number of selected atoms for each type.""" return self.sel @@ -373,7 +370,7 @@ def dim_emb(self): def compute_input_stats( self, - merged: Union[Callable[[], List[dict]], List[dict]], + merged: Union[Callable[[], list[dict]], list[dict]], path: Optional[DPPath] = None, ): """ @@ -381,11 +378,11 @@ def compute_input_stats( Parameters ---------- - merged : Union[Callable[[], List[dict]], List[dict]] - - List[dict]: A list of data samples from various data systems. + merged : Union[Callable[[], list[dict]], list[dict]] + - list[dict]: A list of data samples from various data systems. Each element, `merged[i]`, is a data dictionary containing `keys`: `torch.Tensor` originating from the `i`-th data system. - - Callable[[], List[dict]]: A lazy function that returns data samples in the above format + - Callable[[], list[dict]]: A lazy function that returns data samples in the above format only when needed. Since the sampling process can be slow and memory-intensive, the lazy function helps by only sampling once. 
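Aside: `reinit_exclude` stores the excluded type pairs and rebuilds a `PairExcludeMask`. A toy illustration of the symmetric pair-mask convention described in the docstrings (a simplified stand-in, not the actual `PairExcludeMask` implementation):

import torch

def toy_pair_mask(ntypes: int, exclude_types: list[tuple[int, int]]) -> torch.Tensor:
    # 1 where the (type_i, type_j) pair interacts, 0 where it is excluded;
    # exclusions are applied symmetrically, mirroring the documented
    # example that [[0, 1]] removes all interaction between types 0 and 1.
    mask = torch.ones(ntypes, ntypes, dtype=torch.int32)
    for ti, tj in exclude_types:
        mask[ti, tj] = 0
        mask[tj, ti] = 0
    return mask

print(toy_pair_mask(2, [(0, 1)]))
# tensor([[1, 0],
#         [0, 1]], dtype=torch.int32)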
path : Optional[DPPath] @@ -410,7 +407,7 @@ def compute_input_stats( self.mean.copy_(torch.tensor(mean, device=env.DEVICE)) # pylint: disable=no-explicit-dtype self.stddev.copy_(torch.tensor(stddev, device=env.DEVICE)) # pylint: disable=no-explicit-dtype - def get_stats(self) -> Dict[str, StatItem]: + def get_stats(self) -> dict[str, StatItem]: """Get the statistics of the descriptor.""" if self.stats is None: raise RuntimeError( @@ -420,7 +417,7 @@ def get_stats(self) -> Dict[str, StatItem]: def reinit_exclude( self, - exclude_types: List[Tuple[int, int]] = [], + exclude_types: list[tuple[int, int]] = [], ): self.exclude_types = exclude_types self.emask = PairExcludeMask(self.ntypes, exclude_types=exclude_types) @@ -612,7 +609,7 @@ def __init__( ln_eps: float = 1e-5, smooth: bool = True, precision: str = DEFAULT_PRECISION, - seed: Optional[Union[int, List[int]]] = None, + seed: Optional[Union[int, list[int]]] = None, ): """Construct a neighbor-wise attention net.""" super().__init__() @@ -755,7 +752,7 @@ def __init__( trainable_ln: bool = True, ln_eps: float = 1e-5, precision: str = DEFAULT_PRECISION, - seed: Optional[Union[int, List[int]]] = None, + seed: Optional[Union[int, list[int]]] = None, ): """Construct a neighbor-wise attention layer.""" super().__init__() @@ -862,7 +859,7 @@ def __init__( bias: bool = True, smooth: bool = True, precision: str = DEFAULT_PRECISION, - seed: Optional[Union[int, List[int]]] = None, + seed: Optional[Union[int, list[int]]] = None, ): """Construct a multi-head neighbor-wise attention net.""" super().__init__() diff --git a/deepmd/pt/model/descriptor/se_atten_v2.py b/deepmd/pt/model/descriptor/se_atten_v2.py index 41e37eb03c..f73ff255e6 100644 --- a/deepmd/pt/model/descriptor/se_atten_v2.py +++ b/deepmd/pt/model/descriptor/se_atten_v2.py @@ -1,8 +1,6 @@ # SPDX-License-Identifier: LGPL-3.0-or-later from typing import ( - List, Optional, - Tuple, Union, ) @@ -42,7 +40,7 @@ def __init__( self, rcut: float, rcut_smth: float, - sel: Union[List[int], int], + sel: Union[list[int], int], ntypes: int, neuron: list = [25, 50, 100], axis_neuron: int = 16, @@ -55,7 +53,7 @@ def __init__( activation_function: str = "tanh", precision: str = "float64", resnet_dt: bool = False, - exclude_types: List[Tuple[int, int]] = [], + exclude_types: list[tuple[int, int]] = [], env_protection: float = 0.0, scaling_factor: int = 1.0, normalize=True, @@ -66,10 +64,10 @@ def __init__( ln_eps: Optional[float] = 1e-5, type_one_side: bool = False, stripped_type_embedding: Optional[bool] = None, - seed: Optional[Union[int, List[int]]] = None, + seed: Optional[Union[int, list[int]]] = None, use_econf_tebd: bool = False, use_tebd_bias: bool = False, - type_map: Optional[List[str]] = None, + type_map: Optional[list[str]] = None, # not implemented spin=None, type: Optional[str] = None, @@ -113,7 +111,7 @@ def __init__( resnet_dt : bool Time-step `dt` in the resnet construction: y = x + dt * \phi (Wx + b) - exclude_types : List[List[int]] + exclude_types : list[list[int]] The excluded pairs of types which have no interaction with each other. For example, `[[0, 1]]` means no interaction between type 0 and type 1. env_protection : float @@ -149,7 +147,7 @@ def __init__( Whether to use electronic configuration type embedding. use_tebd_bias : bool, Optional Whether to use bias in the type embedding layer. - type_map : List[str], Optional + type_map : list[str], Optional A list of strings. Give the name to each type of atoms. 
spin (Only support None to keep consistent with other backend references.) diff --git a/deepmd/pt/model/descriptor/se_r.py b/deepmd/pt/model/descriptor/se_r.py index da8d422444..b873ee20b8 100644 --- a/deepmd/pt/model/descriptor/se_r.py +++ b/deepmd/pt/model/descriptor/se_r.py @@ -1,10 +1,7 @@ # SPDX-License-Identifier: LGPL-3.0-or-later from typing import ( Callable, - Dict, - List, Optional, - Tuple, Union, ) @@ -69,12 +66,12 @@ def __init__( activation_function: str = "tanh", precision: str = "float64", resnet_dt: bool = False, - exclude_types: List[Tuple[int, int]] = [], + exclude_types: list[tuple[int, int]] = [], env_protection: float = 0.0, old_impl: bool = False, trainable: bool = True, - seed: Optional[Union[int, List[int]]] = None, - type_map: Optional[List[str]] = None, + seed: Optional[Union[int, list[int]]] = None, + type_map: Optional[list[str]] = None, **kwargs, ): super().__init__() @@ -143,7 +140,7 @@ def get_nsel(self) -> int: """Returns the number of selected atoms in the cut-off radius.""" return sum(self.sel) - def get_sel(self) -> List[int]: + def get_sel(self) -> list[int]: """Returns the number of selected atoms for each type.""" return self.sel @@ -151,7 +148,7 @@ def get_ntypes(self) -> int: """Returns the number of element types.""" return self.ntypes - def get_type_map(self) -> List[str]: + def get_type_map(self) -> list[str]: """Get the name to each type of atoms.""" return self.type_map @@ -225,7 +222,7 @@ def share_params(self, base_class, shared_level, resume=False): raise NotImplementedError def change_type_map( - self, type_map: List[str], model_with_new_type_stat=None + self, type_map: list[str], model_with_new_type_stat=None ) -> None: """Change the type related params to new ones, according to `type_map` and the original one in the model. If there are new types in `type_map`, statistics will be updated accordingly to `model_with_new_type_stat` for these new types. @@ -238,7 +235,7 @@ def change_type_map( def compute_input_stats( self, - merged: Union[Callable[[], List[dict]], List[dict]], + merged: Union[Callable[[], list[dict]], list[dict]], path: Optional[DPPath] = None, ): """ @@ -246,11 +243,11 @@ def compute_input_stats( Parameters ---------- - merged : Union[Callable[[], List[dict]], List[dict]] - - List[dict]: A list of data samples from various data systems. + merged : Union[Callable[[], list[dict]], list[dict]] + - list[dict]: A list of data samples from various data systems. Each element, `merged[i]`, is a data dictionary containing `keys`: `torch.Tensor` originating from the `i`-th data system. - - Callable[[], List[dict]]: A lazy function that returns data samples in the above format + - Callable[[], list[dict]]: A lazy function that returns data samples in the above format only when needed. Since the sampling process can be slow and memory-intensive, the lazy function helps by only sampling once. 
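Aside: the recurring `mean`/`stddev` buffers that `compute_input_stats` fills (and `get_stat_mean_and_stddev` exposes) are used to standardize the environment matrix before it enters the embedding net. A toy sketch of that standardization, with invented shapes and an invented protection constant:

import torch

# Toy stand-in for the descriptor statistics: whiten an
# environment-matrix-like tensor elementwise with precomputed statistics.
env_mat = torch.randn(2, 4, 12)                   # (nframes, nloc, feature), toy shape
mean = env_mat.mean(dim=(0, 1))
stddev = env_mat.std(dim=(0, 1)).clamp_min(1e-2)  # guard against near-zero std
normalized = (env_mat - mean) / stddev
print(normalized.shape)  # torch.Size([2, 4, 12])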
path : Optional[DPPath] @@ -275,7 +272,7 @@ def compute_input_stats( self.mean.copy_(torch.tensor(mean, device=env.DEVICE)) # pylint: disable=no-explicit-dtype self.stddev.copy_(torch.tensor(stddev, device=env.DEVICE)) # pylint: disable=no-explicit-dtype - def get_stats(self) -> Dict[str, StatItem]: + def get_stats(self) -> dict[str, StatItem]: """Get the statistics of the descriptor.""" if self.stats is None: raise RuntimeError( @@ -301,7 +298,7 @@ def __getitem__(self, key): def reinit_exclude( self, - exclude_types: List[Tuple[int, int]] = [], + exclude_types: list[tuple[int, int]] = [], ): self.exclude_types = exclude_types self.emask = PairExcludeMask(self.ntypes, exclude_types=exclude_types) @@ -312,7 +309,7 @@ def forward( atype_ext: torch.Tensor, nlist: torch.Tensor, mapping: Optional[torch.Tensor] = None, - comm_dict: Optional[Dict[str, torch.Tensor]] = None, + comm_dict: Optional[dict[str, torch.Tensor]] = None, ): """Compute the descriptor. @@ -404,7 +401,7 @@ def set_stat_mean_and_stddev( self.mean = mean self.stddev = stddev - def get_stat_mean_and_stddev(self) -> Tuple[torch.Tensor, torch.Tensor]: + def get_stat_mean_and_stddev(self) -> tuple[torch.Tensor, torch.Tensor]: """Get mean and stddev for descriptor.""" return self.mean, self.stddev @@ -458,9 +455,9 @@ def t_cvt(xx): def update_sel( cls, train_data: DeepmdDataSystem, - type_map: Optional[List[str]], + type_map: Optional[list[str]], local_jdata: dict, - ) -> Tuple[dict, Optional[float]]: + ) -> tuple[dict, Optional[float]]: """Update the selection and perform neighbor statistics. Parameters diff --git a/deepmd/pt/model/descriptor/se_t.py b/deepmd/pt/model/descriptor/se_t.py index 5e7e507fbf..072457b48f 100644 --- a/deepmd/pt/model/descriptor/se_t.py +++ b/deepmd/pt/model/descriptor/se_t.py @@ -3,10 +3,7 @@ from typing import ( Callable, ClassVar, - Dict, - List, Optional, - Tuple, Union, ) @@ -95,7 +92,7 @@ class DescrptSeT(BaseDescriptor, torch.nn.Module): The activation function in the embedding net. Supported options are |ACTIVATION_FN| env_protection : float Protection parameter to prevent division by zero errors during environment matrix calculations. - exclude_types : List[List[int]] + exclude_types : list[list[int]] The excluded pairs of types which have no interaction with each other. For example, `[[0, 1]]` means no interaction between type 0 and type 1. precision : str @@ -104,7 +101,7 @@ class DescrptSeT(BaseDescriptor, torch.nn.Module): If the weights of embedding net are trainable. seed : int, Optional Random seed for initializing the network parameters. - type_map: List[str], Optional + type_map: list[str], Optional A list of strings. Give the name to each type of atoms. 
""" @@ -112,17 +109,17 @@ def __init__( self, rcut: float, rcut_smth: float, - sel: List[int], - neuron: List[int] = [24, 48, 96], + sel: list[int], + neuron: list[int] = [24, 48, 96], resnet_dt: bool = False, set_davg_zero: bool = False, activation_function: str = "tanh", env_protection: float = 0.0, - exclude_types: List[Tuple[int, int]] = [], + exclude_types: list[tuple[int, int]] = [], precision: str = "float64", trainable: bool = True, - seed: Optional[Union[int, List[int]]] = None, - type_map: Optional[List[str]] = None, + seed: Optional[Union[int, list[int]]] = None, + type_map: Optional[list[str]] = None, ntypes: Optional[int] = None, # to be compat with input # not implemented spin=None, @@ -159,7 +156,7 @@ def get_nsel(self) -> int: """Returns the number of selected atoms in the cut-off radius.""" return self.seat.get_nsel() - def get_sel(self) -> List[int]: + def get_sel(self) -> list[int]: """Returns the number of selected atoms for each type.""" return self.seat.get_sel() @@ -167,7 +164,7 @@ def get_ntypes(self) -> int: """Returns the number of element types.""" return self.seat.get_ntypes() - def get_type_map(self) -> List[str]: + def get_type_map(self) -> list[str]: """Get the name to each type of atoms.""" return self.type_map @@ -221,7 +218,7 @@ def dim_out(self): return self.seat.dim_out def change_type_map( - self, type_map: List[str], model_with_new_type_stat=None + self, type_map: list[str], model_with_new_type_stat=None ) -> None: """Change the type related params to new ones, according to `type_map` and the original one in the model. If there are new types in `type_map`, statistics will be updated accordingly to `model_with_new_type_stat` for these new types. @@ -234,7 +231,7 @@ def change_type_map( def compute_input_stats( self, - merged: Union[Callable[[], List[dict]], List[dict]], + merged: Union[Callable[[], list[dict]], list[dict]], path: Optional[DPPath] = None, ): """ @@ -242,11 +239,11 @@ def compute_input_stats( Parameters ---------- - merged : Union[Callable[[], List[dict]], List[dict]] - - List[dict]: A list of data samples from various data systems. + merged : Union[Callable[[], list[dict]], list[dict]] + - list[dict]: A list of data samples from various data systems. Each element, `merged[i]`, is a data dictionary containing `keys`: `torch.Tensor` originating from the `i`-th data system. - - Callable[[], List[dict]]: A lazy function that returns data samples in the above format + - Callable[[], list[dict]]: A lazy function that returns data samples in the above format only when needed. Since the sampling process can be slow and memory-intensive, the lazy function helps by only sampling once. path : Optional[DPPath] @@ -257,7 +254,7 @@ def compute_input_stats( def reinit_exclude( self, - exclude_types: List[Tuple[int, int]] = [], + exclude_types: list[tuple[int, int]] = [], ): """Update the type exclusions.""" self.seat.reinit_exclude(exclude_types) @@ -268,7 +265,7 @@ def forward( atype_ext: torch.Tensor, nlist: torch.Tensor, mapping: Optional[torch.Tensor] = None, - comm_dict: Optional[Dict[str, torch.Tensor]] = None, + comm_dict: Optional[dict[str, torch.Tensor]] = None, ): """Compute the descriptor. 
@@ -314,7 +311,7 @@ def set_stat_mean_and_stddev( self.seat.mean = mean self.seat.stddev = stddev - def get_stat_mean_and_stddev(self) -> Tuple[torch.Tensor, torch.Tensor]: + def get_stat_mean_and_stddev(self) -> tuple[torch.Tensor, torch.Tensor]: """Get mean and stddev for descriptor.""" return self.seat.mean, self.seat.stddev @@ -367,9 +364,9 @@ def t_cvt(xx): def update_sel( cls, train_data: DeepmdDataSystem, - type_map: Optional[List[str]], + type_map: Optional[list[str]], local_jdata: dict, - ) -> Tuple[dict, Optional[float]]: + ) -> tuple[dict, Optional[float]]: """Update the selection and perform neighbor statistics. Parameters @@ -404,16 +401,16 @@ def __init__( self, rcut: float, rcut_smth: float, - sel: List[int], - neuron: List[int] = [24, 48, 96], + sel: list[int], + neuron: list[int] = [24, 48, 96], resnet_dt: bool = False, set_davg_zero: bool = False, activation_function: str = "tanh", env_protection: float = 0.0, - exclude_types: List[Tuple[int, int]] = [], + exclude_types: list[tuple[int, int]] = [], precision: str = "float64", trainable: bool = True, - seed: Optional[Union[int, List[int]]] = None, + seed: Optional[Union[int, list[int]]] = None, ): r"""Construct an embedding net of type `se_e3`. @@ -438,7 +435,7 @@ def __init__( The activation function in the embedding net. Supported options are |ACTIVATION_FN| env_protection : float Protection parameter to prevent division by zero errors during environment matrix calculations. - exclude_types : List[List[int]] + exclude_types : list[list[int]] The excluded pairs of types which have no interaction with each other. For example, `[[0, 1]]` means no interaction between type 0 and type 1. precision : str @@ -511,7 +508,7 @@ def get_nsel(self) -> int: """Returns the number of selected atoms in the cut-off radius.""" return sum(self.sel) - def get_sel(self) -> List[int]: + def get_sel(self) -> list[int]: """Returns the number of selected atoms for each type.""" return self.sel @@ -575,7 +572,7 @@ def __getitem__(self, key): def compute_input_stats( self, - merged: Union[Callable[[], List[dict]], List[dict]], + merged: Union[Callable[[], list[dict]], list[dict]], path: Optional[DPPath] = None, ): """ @@ -583,11 +580,11 @@ def compute_input_stats( Parameters ---------- - merged : Union[Callable[[], List[dict]], List[dict]] - - List[dict]: A list of data samples from various data systems. + merged : Union[Callable[[], list[dict]], list[dict]] + - list[dict]: A list of data samples from various data systems. Each element, `merged[i]`, is a data dictionary containing `keys`: `torch.Tensor` originating from the `i`-th data system. - - Callable[[], List[dict]]: A lazy function that returns data samples in the above format + - Callable[[], list[dict]]: A lazy function that returns data samples in the above format only when needed. Since the sampling process can be slow and memory-intensive, the lazy function helps by only sampling once. 
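Aside: every descriptor touched in this diff exposes the same `update_sel` classmethod contract: take the training data, an optional type map, and the local config dict, and return an updated copy of the config plus the minimum neighbor distance. A toy restatement of just that contract (invented values; no neighbor statistics are actually computed here):

from typing import Optional

def toy_update_sel(
    local_jdata: dict,
    suggested_sel: list[int],
    min_nbor_dist: Optional[float],
) -> tuple[dict, Optional[float]]:
    # Mirror the documented contract: return a *copy* of the local
    # config with "sel" filled in, plus the minimum neighbor distance.
    out = local_jdata.copy()
    out["sel"] = suggested_sel
    return out, min_nbor_dist

jdata, dmin = toy_update_sel({"rcut": 6.0, "sel": "auto"}, [46, 92], 0.9)
print(jdata, dmin)  # {'rcut': 6.0, 'sel': [46, 92]} 0.9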
path : Optional[DPPath] @@ -612,7 +609,7 @@ def compute_input_stats( self.mean.copy_(torch.tensor(mean, device=env.DEVICE)) # pylint: disable=no-explicit-dtype self.stddev.copy_(torch.tensor(stddev, device=env.DEVICE)) # pylint: disable=no-explicit-dtype - def get_stats(self) -> Dict[str, StatItem]: + def get_stats(self) -> dict[str, StatItem]: """Get the statistics of the descriptor.""" if self.stats is None: raise RuntimeError( @@ -622,7 +619,7 @@ def get_stats(self) -> Dict[str, StatItem]: def reinit_exclude( self, - exclude_types: List[Tuple[int, int]] = [], + exclude_types: list[tuple[int, int]] = [], ): self.exclude_types = exclude_types self.emask = PairExcludeMask(self.ntypes, exclude_types=exclude_types) diff --git a/deepmd/pt/model/descriptor/se_t_tebd.py b/deepmd/pt/model/descriptor/se_t_tebd.py index 774a9154de..437a464709 100644 --- a/deepmd/pt/model/descriptor/se_t_tebd.py +++ b/deepmd/pt/model/descriptor/se_t_tebd.py @@ -1,10 +1,7 @@ # SPDX-License-Identifier: LGPL-3.0-or-later from typing import ( Callable, - Dict, - List, Optional, - Tuple, Union, ) @@ -79,7 +76,7 @@ class DescrptSeTTebd(BaseDescriptor, torch.nn.Module): The cut-off radius rcut_smth From where the environment matrix should be smoothed - sel : Union[List[int], int] + sel : Union[list[int], int] list[int]: sel[i] specifies the maxmum number of type i atoms in the cut-off radius int: the total maxmum number of atoms in the cut-off radius ntypes : int @@ -101,7 +98,7 @@ class DescrptSeTTebd(BaseDescriptor, torch.nn.Module): The activation function in the embedding net. Supported options are |ACTIVATION_FN| env_protection: float Protection parameter to prevent division by zero errors during environment matrix calculations. - exclude_types : List[Tuple[int, int]] + exclude_types : list[tuple[int, int]] The excluded pairs of types which have no interaction with each other. For example, `[[0, 1]]` means no interaction between type 0 and type 1. precision @@ -110,7 +107,7 @@ class DescrptSeTTebd(BaseDescriptor, torch.nn.Module): If the weights of embedding net are trainable. seed Random seed for initializing the network parameters. - type_map: List[str], Optional + type_map: list[str], Optional A list of strings. Give the name to each type of atoms. concat_output_tebd: bool Whether to concat type embedding at the output of the descriptor. 
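Aside on the `sel` parameter documented above: both spellings bound the neighborhood size, and they reduce to a total neighbor count the way `get_nsel` does (summing a per-type list, or taking an int as the total directly). A tiny sketch:

from typing import Union

def toy_nsel(sel: Union[list[int], int]) -> int:
    # A list gives a per-type cap within the cutoff radius;
    # an int is already the total cap.
    return sum(sel) if isinstance(sel, list) else sel

print(toy_nsel([46, 92]))  # 138
print(toy_nsel(120))       # 120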
@@ -127,7 +124,7 @@ def __init__( self, rcut: float, rcut_smth: float, - sel: Union[List[int], int], + sel: Union[list[int], int], ntypes: int, neuron: list = [2, 4, 8], tebd_dim: int = 8, @@ -136,11 +133,11 @@ def __init__( set_davg_zero: bool = True, activation_function: str = "tanh", env_protection: float = 0.0, - exclude_types: List[Tuple[int, int]] = [], + exclude_types: list[tuple[int, int]] = [], precision: str = "float64", trainable: bool = True, - seed: Optional[Union[int, List[int]]] = None, - type_map: Optional[List[str]] = None, + seed: Optional[Union[int, list[int]]] = None, + type_map: Optional[list[str]] = None, concat_output_tebd: bool = True, use_econf_tebd: bool = False, use_tebd_bias=False, @@ -195,7 +192,7 @@ def get_nsel(self) -> int: """Returns the number of selected atoms in the cut-off radius.""" return self.se_ttebd.get_nsel() - def get_sel(self) -> List[int]: + def get_sel(self) -> list[int]: """Returns the number of selected atoms for each type.""" return self.se_ttebd.get_sel() @@ -203,7 +200,7 @@ def get_ntypes(self) -> int: """Returns the number of element types.""" return self.se_ttebd.get_ntypes() - def get_type_map(self) -> List[str]: + def get_type_map(self) -> list[str]: """Get the name to each type of atoms.""" return self.type_map @@ -274,7 +271,7 @@ def dim_emb(self): def compute_input_stats( self, - merged: Union[Callable[[], List[dict]], List[dict]], + merged: Union[Callable[[], list[dict]], list[dict]], path: Optional[DPPath] = None, ): """ @@ -282,11 +279,11 @@ def compute_input_stats( Parameters ---------- - merged : Union[Callable[[], List[dict]], List[dict]] - - List[dict]: A list of data samples from various data systems. + merged : Union[Callable[[], list[dict]], list[dict]] + - list[dict]: A list of data samples from various data systems. Each element, `merged[i]`, is a data dictionary containing `keys`: `torch.Tensor` originating from the `i`-th data system. - - Callable[[], List[dict]]: A lazy function that returns data samples in the above format + - Callable[[], list[dict]]: A lazy function that returns data samples in the above format only when needed. Since the sampling process can be slow and memory-intensive, the lazy function helps by only sampling once. path : Optional[DPPath] @@ -304,12 +301,12 @@ def set_stat_mean_and_stddev( self.se_ttebd.mean = mean self.se_ttebd.stddev = stddev - def get_stat_mean_and_stddev(self) -> Tuple[torch.Tensor, torch.Tensor]: + def get_stat_mean_and_stddev(self) -> tuple[torch.Tensor, torch.Tensor]: """Get mean and stddev for descriptor.""" return self.se_ttebd.mean, self.se_ttebd.stddev def change_type_map( - self, type_map: List[str], model_with_new_type_stat=None + self, type_map: list[str], model_with_new_type_stat=None ) -> None: """Change the type related params to new ones, according to `type_map` and the original one in the model. If there are new types in `type_map`, statistics will be updated accordingly to `model_with_new_type_stat` for these new types. @@ -410,7 +407,7 @@ def forward( extended_atype: torch.Tensor, nlist: torch.Tensor, mapping: Optional[torch.Tensor] = None, - comm_dict: Optional[Dict[str, torch.Tensor]] = None, + comm_dict: Optional[dict[str, torch.Tensor]] = None, ): """Compute the descriptor. 
@@ -465,9 +462,9 @@ def forward( def update_sel( cls, train_data: DeepmdDataSystem, - type_map: Optional[List[str]], + type_map: Optional[list[str]], local_jdata: dict, - ) -> Tuple[dict, Optional[float]]: + ) -> tuple[dict, Optional[float]]: """Update the selection and perform neighbor statistics. Parameters @@ -500,7 +497,7 @@ def __init__( self, rcut: float, rcut_smth: float, - sel: Union[List[int], int], + sel: Union[list[int], int], ntypes: int, neuron: list = [25, 50, 100], tebd_dim: int = 8, @@ -509,10 +506,10 @@ def __init__( activation_function="tanh", precision: str = "float64", resnet_dt: bool = False, - exclude_types: List[Tuple[int, int]] = [], + exclude_types: list[tuple[int, int]] = [], env_protection: float = 0.0, smooth: bool = True, - seed: Optional[Union[int, List[int]]] = None, + seed: Optional[Union[int, list[int]]] = None, ): super().__init__() self.rcut = rcut @@ -598,7 +595,7 @@ def get_nsel(self) -> int: """Returns the number of selected atoms in the cut-off radius.""" return sum(self.sel) - def get_sel(self) -> List[int]: + def get_sel(self) -> list[int]: """Returns the number of selected atoms for each type.""" return self.sel @@ -667,7 +664,7 @@ def dim_emb(self): def compute_input_stats( self, - merged: Union[Callable[[], List[dict]], List[dict]], + merged: Union[Callable[[], list[dict]], list[dict]], path: Optional[DPPath] = None, ): """ @@ -675,11 +672,11 @@ def compute_input_stats( Parameters ---------- - merged : Union[Callable[[], List[dict]], List[dict]] - - List[dict]: A list of data samples from various data systems. + merged : Union[Callable[[], list[dict]], list[dict]] + - list[dict]: A list of data samples from various data systems. Each element, `merged[i]`, is a data dictionary containing `keys`: `torch.Tensor` originating from the `i`-th data system. - - Callable[[], List[dict]]: A lazy function that returns data samples in the above format + - Callable[[], list[dict]]: A lazy function that returns data samples in the above format only when needed. Since the sampling process can be slow and memory-intensive, the lazy function helps by only sampling once. 
path : Optional[DPPath] @@ -704,7 +701,7 @@ def compute_input_stats( self.mean.copy_(torch.tensor(mean, device=env.DEVICE)) # pylint: disable=no-explicit-dtype self.stddev.copy_(torch.tensor(stddev, device=env.DEVICE)) # pylint: disable=no-explicit-dtype - def get_stats(self) -> Dict[str, StatItem]: + def get_stats(self) -> dict[str, StatItem]: """Get the statistics of the descriptor.""" if self.stats is None: raise RuntimeError( @@ -714,7 +711,7 @@ def get_stats(self) -> Dict[str, StatItem]: def reinit_exclude( self, - exclude_types: List[Tuple[int, int]] = [], + exclude_types: list[tuple[int, int]] = [], ): self.exclude_types = exclude_types self.emask = PairExcludeMask(self.ntypes, exclude_types=exclude_types) diff --git a/deepmd/pt/model/model/__init__.py b/deepmd/pt/model/model/__init__.py index 1c81d42013..613baf440e 100644 --- a/deepmd/pt/model/model/__init__.py +++ b/deepmd/pt/model/model/__init__.py @@ -36,6 +36,9 @@ from .dos_model import ( DOSModel, ) +from .dp_linear_model import ( + LinearEnergyModel, +) from .dp_model import ( DPModelCommon, ) @@ -69,6 +72,29 @@ ) +def _get_standard_model_components(model_params, ntypes): + # descriptor + model_params["descriptor"]["ntypes"] = ntypes + model_params["descriptor"]["type_map"] = copy.deepcopy(model_params["type_map"]) + descriptor = BaseDescriptor(**model_params["descriptor"]) + # fitting + fitting_net = model_params.get("fitting_net", {}) + fitting_net["type"] = fitting_net.get("type", "ener") + fitting_net["ntypes"] = descriptor.get_ntypes() + fitting_net["type_map"] = copy.deepcopy(model_params["type_map"]) + fitting_net["mixed_types"] = descriptor.mixed_types() + if fitting_net["type"] in ["dipole", "polar"]: + fitting_net["embedding_width"] = descriptor.get_dim_emb() + fitting_net["dim_descrpt"] = descriptor.get_dim_out() + grad_force = "direct" not in fitting_net["type"] + if not grad_force: + fitting_net["out_dim"] = descriptor.get_dim_emb() + if "ener" in fitting_net["type"]: + fitting_net["return_energy"] = True + fitting = BaseFitting(**fitting_net) + return descriptor, fitting, fitting_net["type"] + + def get_spin_model(model_params): model_params = copy.deepcopy(model_params) if not model_params["spin"]["use_spin"] or isinstance( @@ -105,27 +131,50 @@ def get_spin_model(model_params): return SpinEnergyModel(backbone_model=backbone_model, spin=spin) +def get_linear_model(model_params): + model_params = copy.deepcopy(model_params) + weights = model_params.get("weights", "mean") + list_of_models = [] + ntypes = len(model_params["type_map"]) + for sub_model_params in model_params["models"]: + if "descriptor" in sub_model_params: + # descriptor + sub_model_params["descriptor"]["ntypes"] = ntypes + descriptor, fitting, _ = _get_standard_model_components( + sub_model_params, ntypes + ) + list_of_models.append( + DPAtomicModel(descriptor, fitting, type_map=model_params["type_map"]) + ) + + else: # must be pairtab + assert ( + "type" in sub_model_params and sub_model_params["type"] == "pairtab" + ), "Sub-models in LinearEnergyModel must be a DPModel or a PairTable Model" + list_of_models.append( + PairTabAtomicModel( + sub_model_params["tab_file"], + sub_model_params["rcut"], + sub_model_params["sel"], + type_map=model_params["type_map"], + ) + ) + + atom_exclude_types = model_params.get("atom_exclude_types", []) + pair_exclude_types = model_params.get("pair_exclude_types", []) + return LinearEnergyModel( + models=list_of_models, + type_map=model_params["type_map"], + weights=weights, + atom_exclude_types=atom_exclude_types, 
+ pair_exclude_types=pair_exclude_types, + ) + + def get_zbl_model(model_params): model_params = copy.deepcopy(model_params) ntypes = len(model_params["type_map"]) - # descriptor - model_params["descriptor"]["ntypes"] = ntypes - model_params["descriptor"]["type_map"] = copy.deepcopy(model_params["type_map"]) - descriptor = BaseDescriptor(**model_params["descriptor"]) - # fitting - fitting_net = model_params.get("fitting_net", None) - fitting_net["type"] = fitting_net.get("type", "ener") - fitting_net["ntypes"] = descriptor.get_ntypes() - fitting_net["type_map"] = copy.deepcopy(model_params["type_map"]) - fitting_net["mixed_types"] = descriptor.mixed_types() - fitting_net["embedding_width"] = descriptor.get_dim_out() - fitting_net["dim_descrpt"] = descriptor.get_dim_out() - grad_force = "direct" not in fitting_net["type"] - if not grad_force: - fitting_net["out_dim"] = descriptor.get_dim_emb() - if "ener" in fitting_net["type"]: - fitting_net["return_energy"] = True - fitting = BaseFitting(**fitting_net) + descriptor, fitting, _ = _get_standard_model_components(model_params, ntypes) dp_model = DPAtomicModel(descriptor, fitting, type_map=model_params["type_map"]) # pairtab filepath = model_params["use_srtab"] @@ -187,25 +236,9 @@ def get_standard_model(model_params): model_params_old = model_params model_params = copy.deepcopy(model_params) ntypes = len(model_params["type_map"]) - # descriptor - model_params["descriptor"]["ntypes"] = ntypes - model_params["descriptor"]["type_map"] = copy.deepcopy(model_params["type_map"]) - descriptor = BaseDescriptor(**model_params["descriptor"]) - # fitting - fitting_net = model_params.get("fitting_net", {}) - fitting_net["type"] = fitting_net.get("type", "ener") - fitting_net["ntypes"] = descriptor.get_ntypes() - fitting_net["type_map"] = copy.deepcopy(model_params["type_map"]) - fitting_net["mixed_types"] = descriptor.mixed_types() - if fitting_net["type"] in ["dipole", "polar"]: - fitting_net["embedding_width"] = descriptor.get_dim_emb() - fitting_net["dim_descrpt"] = descriptor.get_dim_out() - grad_force = "direct" not in fitting_net["type"] - if not grad_force: - fitting_net["out_dim"] = descriptor.get_dim_emb() - if "ener" in fitting_net["type"]: - fitting_net["return_energy"] = True - fitting = BaseFitting(**fitting_net) + descriptor, fitting, fitting_net_type = _get_standard_model_components( + model_params, ntypes + ) atom_exclude_types = model_params.get("atom_exclude_types", []) pair_exclude_types = model_params.get("pair_exclude_types", []) preset_out_bias = model_params.get("preset_out_bias") @@ -213,18 +246,18 @@ def get_standard_model(model_params): preset_out_bias, model_params["type_map"] ) - if fitting_net["type"] == "dipole": + if fitting_net_type == "dipole": modelcls = DipoleModel - elif fitting_net["type"] == "polar": + elif fitting_net_type == "polar": modelcls = PolarModel - elif fitting_net["type"] == "dos": + elif fitting_net_type == "dos": modelcls = DOSModel - elif fitting_net["type"] in ["ener", "direct_force_ener"]: + elif fitting_net_type in ["ener", "direct_force_ener"]: modelcls = EnergyModel - elif fitting_net["type"] == "property": + elif fitting_net_type == "property": modelcls = PropertyModel else: - raise RuntimeError(f"Unknown fitting type: {fitting_net['type']}") + raise RuntimeError(f"Unknown fitting type: {fitting_net_type}") model = modelcls( descriptor=descriptor, @@ -247,6 +280,8 @@ def get_model(model_params): return get_zbl_model(model_params) else: return get_standard_model(model_params) + elif model_type == 
"linear_ener": + return get_linear_model(model_params) else: return BaseModel.get_class_by_type(model_type).get_model(model_params) @@ -265,4 +300,5 @@ def get_model(model_params): "DPZBLModel", "make_model", "make_hessian_model", + "LinearEnergyModel", ] diff --git a/deepmd/pt/model/model/dipole_model.py b/deepmd/pt/model/model/dipole_model.py index 0d4a53a850..c83d1f0bf7 100644 --- a/deepmd/pt/model/model/dipole_model.py +++ b/deepmd/pt/model/model/dipole_model.py @@ -3,7 +3,6 @@ deepcopy, ) from typing import ( - Dict, Optional, ) @@ -64,7 +63,7 @@ def forward( fparam: Optional[torch.Tensor] = None, aparam: Optional[torch.Tensor] = None, do_atomic_virial: bool = False, - ) -> Dict[str, torch.Tensor]: + ) -> dict[str, torch.Tensor]: model_ret = self.forward_common( coord, atype, diff --git a/deepmd/pt/model/model/dos_model.py b/deepmd/pt/model/model/dos_model.py index 27d62fa882..abfcd4a2b4 100644 --- a/deepmd/pt/model/model/dos_model.py +++ b/deepmd/pt/model/model/dos_model.py @@ -3,7 +3,6 @@ deepcopy, ) from typing import ( - Dict, Optional, ) @@ -56,7 +55,7 @@ def forward( fparam: Optional[torch.Tensor] = None, aparam: Optional[torch.Tensor] = None, do_atomic_virial: bool = False, - ) -> Dict[str, torch.Tensor]: + ) -> dict[str, torch.Tensor]: model_ret = self.forward_common( coord, atype, diff --git a/deepmd/pt/model/model/dp_linear_model.py b/deepmd/pt/model/model/dp_linear_model.py new file mode 100644 index 0000000000..ef2e84bd19 --- /dev/null +++ b/deepmd/pt/model/model/dp_linear_model.py @@ -0,0 +1,166 @@ +# SPDX-License-Identifier: LGPL-3.0-or-later +from copy import ( + deepcopy, +) +from typing import ( + Optional, +) + +import torch + +from deepmd.pt.model.atomic_model import ( + LinearEnergyAtomicModel, +) +from deepmd.pt.model.model.model import ( + BaseModel, +) +from deepmd.utils.data_system import ( + DeepmdDataSystem, +) + +from .dp_model import ( + DPModelCommon, +) +from .make_model import ( + make_model, +) + +DPLinearModel_ = make_model(LinearEnergyAtomicModel) + + +@BaseModel.register("linear_ener") +class LinearEnergyModel(DPLinearModel_): + model_type = "ener" + + def __init__( + self, + *args, + **kwargs, + ): + super().__init__(*args, **kwargs) + + def translated_output_def(self): + out_def_data = self.model_output_def().get_data() + output_def = { + "atom_energy": deepcopy(out_def_data["energy"]), + "energy": deepcopy(out_def_data["energy_redu"]), + } + if self.do_grad_r("energy"): + output_def["force"] = deepcopy(out_def_data["energy_derv_r"]) + output_def["force"].squeeze(-2) + if self.do_grad_c("energy"): + output_def["virial"] = deepcopy(out_def_data["energy_derv_c_redu"]) + output_def["virial"].squeeze(-2) + output_def["atom_virial"] = deepcopy(out_def_data["energy_derv_c"]) + output_def["atom_virial"].squeeze(-3) + if "mask" in out_def_data: + output_def["mask"] = deepcopy(out_def_data["mask"]) + return output_def + + def forward( + self, + coord, + atype, + box: Optional[torch.Tensor] = None, + fparam: Optional[torch.Tensor] = None, + aparam: Optional[torch.Tensor] = None, + do_atomic_virial: bool = False, + ) -> dict[str, torch.Tensor]: + model_ret = self.forward_common( + coord, + atype, + box, + fparam=fparam, + aparam=aparam, + do_atomic_virial=do_atomic_virial, + ) + + model_predict = {} + model_predict["atom_energy"] = model_ret["energy"] + model_predict["energy"] = model_ret["energy_redu"] + if self.do_grad_r("energy"): + model_predict["force"] = model_ret["energy_derv_r"].squeeze(-2) + if self.do_grad_c("energy"): + model_predict["virial"] = 
model_ret["energy_derv_c_redu"].squeeze(-2) + if do_atomic_virial: + model_predict["atom_virial"] = model_ret["energy_derv_c"].squeeze(-3) + else: + model_predict["force"] = model_ret["dforce"] + if "mask" in model_ret: + model_predict["mask"] = model_ret["mask"] + return model_predict + + @torch.jit.export + def forward_lower( + self, + extended_coord, + extended_atype, + nlist, + mapping: Optional[torch.Tensor] = None, + fparam: Optional[torch.Tensor] = None, + aparam: Optional[torch.Tensor] = None, + do_atomic_virial: bool = False, + ): + model_ret = self.forward_common_lower( + extended_coord, + extended_atype, + nlist, + mapping=mapping, + fparam=fparam, + aparam=aparam, + do_atomic_virial=do_atomic_virial, + extra_nlist_sort=self.need_sorted_nlist_for_lower(), + ) + + model_predict = {} + model_predict["atom_energy"] = model_ret["energy"] + model_predict["energy"] = model_ret["energy_redu"] + if self.do_grad_r("energy"): + model_predict["extended_force"] = model_ret["energy_derv_r"].squeeze(-2) + if self.do_grad_c("energy"): + model_predict["virial"] = model_ret["energy_derv_c_redu"].squeeze(-2) + if do_atomic_virial: + model_predict["extended_virial"] = model_ret["energy_derv_c"].squeeze( + -3 + ) + else: + assert model_ret["dforce"] is not None + model_predict["dforce"] = model_ret["dforce"] + return model_predict + + @classmethod + def update_sel( + cls, + train_data: DeepmdDataSystem, + type_map: Optional[list[str]], + local_jdata: dict, + ) -> tuple[dict, Optional[float]]: + """Update the selection and perform neighbor statistics. + + Parameters + ---------- + train_data : DeepmdDataSystem + data used to do neighbor statistics + type_map : list[str], optional + The name of each type of atoms + local_jdata : dict + The local data referring to the current class + + Returns + ------- + dict + The updated local data + float + The minimum distance between two atoms + """ + local_jdata_cpy = local_jdata.copy() + type_map = local_jdata_cpy["type_map"] + min_nbor_dist = None + for idx, sub_model in enumerate(local_jdata_cpy["models"]): + if "tab_file" not in sub_model: + sub_model, temp_min = DPModelCommon.update_sel( + train_data, type_map, local_jdata["models"][idx] + ) + if min_nbor_dist is None or temp_min <= min_nbor_dist: + min_nbor_dist = temp_min + return local_jdata_cpy, min_nbor_dist diff --git a/deepmd/pt/model/model/dp_model.py b/deepmd/pt/model/model/dp_model.py index d3a65db287..bd278ed787 100644 --- a/deepmd/pt/model/model/dp_model.py +++ b/deepmd/pt/model/model/dp_model.py @@ -1,10 +1,10 @@ # SPDX-License-Identifier: LGPL-3.0-or-later from typing import ( - List, Optional, - Tuple, ) +import torch + from deepmd.pt.model.descriptor.base_descriptor import ( BaseDescriptor, ) @@ -20,9 +20,9 @@ class DPModelCommon: def update_sel( cls, train_data: DeepmdDataSystem, - type_map: Optional[List[str]], + type_map: Optional[list[str]], local_jdata: dict, - ) -> Tuple[dict, Optional[float]]: + ) -> tuple[dict, Optional[float]]: """Update the selection and perform neighbor statistics.
Parameters @@ -54,3 +54,13 @@ def get_fitting_net(self): def get_descriptor(self): """Get the descriptor.""" return self.atomic_model.descriptor + + @torch.jit.export + def set_eval_descriptor_hook(self, enable: bool) -> None: + """Set the hook for evaluating descriptor and clear the cache for descriptor list.""" + self.atomic_model.set_eval_descriptor_hook(enable) + + @torch.jit.export + def eval_descriptor(self) -> torch.Tensor: + """Evaluate the descriptor.""" + return self.atomic_model.eval_descriptor() diff --git a/deepmd/pt/model/model/dp_zbl_model.py b/deepmd/pt/model/model/dp_zbl_model.py index 4016f0eb35..59147e1d4c 100644 --- a/deepmd/pt/model/model/dp_zbl_model.py +++ b/deepmd/pt/model/model/dp_zbl_model.py @@ -3,10 +3,7 @@ deepcopy, ) from typing import ( - Dict, - List, Optional, - Tuple, ) import torch @@ -68,7 +65,7 @@ def forward( fparam: Optional[torch.Tensor] = None, aparam: Optional[torch.Tensor] = None, do_atomic_virial: bool = False, - ) -> Dict[str, torch.Tensor]: + ) -> dict[str, torch.Tensor]: model_ret = self.forward_common( coord, atype, @@ -135,9 +132,9 @@ def forward_lower( def update_sel( cls, train_data: DeepmdDataSystem, - type_map: Optional[List[str]], + type_map: Optional[list[str]], local_jdata: dict, - ) -> Tuple[dict, Optional[float]]: + ) -> tuple[dict, Optional[float]]: """Update the selection and perform neighbor statistics. Parameters diff --git a/deepmd/pt/model/model/ener_model.py b/deepmd/pt/model/model/ener_model.py index e58ba1df62..82f429c4ab 100644 --- a/deepmd/pt/model/model/ener_model.py +++ b/deepmd/pt/model/model/ener_model.py @@ -3,7 +3,6 @@ deepcopy, ) from typing import ( - Dict, Optional, ) @@ -64,7 +63,7 @@ def forward( fparam: Optional[torch.Tensor] = None, aparam: Optional[torch.Tensor] = None, do_atomic_virial: bool = False, - ) -> Dict[str, torch.Tensor]: + ) -> dict[str, torch.Tensor]: model_ret = self.forward_common( coord, atype, @@ -104,7 +103,7 @@ def forward_lower( fparam: Optional[torch.Tensor] = None, aparam: Optional[torch.Tensor] = None, do_atomic_virial: bool = False, - comm_dict: Optional[Dict[str, torch.Tensor]] = None, + comm_dict: Optional[dict[str, torch.Tensor]] = None, ): model_ret = self.forward_common_lower( extended_coord, diff --git a/deepmd/pt/model/model/frozen.py b/deepmd/pt/model/model/frozen.py index 395d81c217..431c035339 100644 --- a/deepmd/pt/model/model/frozen.py +++ b/deepmd/pt/model/model/frozen.py @@ -2,10 +2,7 @@ import json import tempfile from typing import ( - Dict, - List, Optional, - Tuple, ) import torch @@ -56,12 +53,12 @@ def get_rcut(self) -> float: return self.model.get_rcut() @torch.jit.export - def get_type_map(self) -> List[str]: + def get_type_map(self) -> list[str]: """Get the type map.""" return self.model.get_type_map() @torch.jit.export - def get_sel(self) -> List[int]: + def get_sel(self) -> list[int]: """Returns the number of selected atoms for each type.""" return self.model.get_sel() @@ -76,7 +73,7 @@ def get_dim_aparam(self) -> int: return self.model.get_dim_aparam() @torch.jit.export - def get_sel_type(self) -> List[int]: + def get_sel_type(self) -> list[int]: """Get the selected atom types of this model. 
Only atoms with selected atom types have atomic contribution @@ -124,7 +121,7 @@ def forward( fparam: Optional[torch.Tensor] = None, aparam: Optional[torch.Tensor] = None, do_atomic_virial: bool = False, - ) -> Dict[str, torch.Tensor]: + ) -> dict[str, torch.Tensor]: return self.model.forward( coord, atype, @@ -177,9 +174,9 @@ def get_nsel(self) -> int: def update_sel( cls, train_data: DeepmdDataSystem, - type_map: Optional[List[str]], + type_map: Optional[list[str]], local_jdata: dict, - ) -> Tuple[dict, Optional[float]]: + ) -> tuple[dict, Optional[float]]: """Update the selection and perform neighbor statistics. Parameters diff --git a/deepmd/pt/model/model/make_hessian_model.py b/deepmd/pt/model/model/make_hessian_model.py index 9588348f53..d2541a815e 100644 --- a/deepmd/pt/model/model/make_hessian_model.py +++ b/deepmd/pt/model/model/make_hessian_model.py @@ -2,8 +2,6 @@ import copy import math from typing import ( - Dict, - List, Optional, Union, ) @@ -47,7 +45,7 @@ def __init__( def requires_hessian( self, - keys: Union[str, List[str]], + keys: Union[str, list[str]], ): """Set which output variable(s) requires hessian.""" if isinstance(keys, str): @@ -68,7 +66,7 @@ def forward_common( fparam: Optional[torch.Tensor] = None, aparam: Optional[torch.Tensor] = None, do_atomic_virial: bool = False, - ) -> Dict[str, torch.Tensor]: + ) -> dict[str, torch.Tensor]: """Return model prediction. Parameters @@ -90,7 +88,7 @@ def forward_common( Returns ------- ret_dict - The result dict of type Dict[str,torch.Tensor]. + The result dict of type dict[str,torch.Tensor]. The keys are defined by the `ModelOutputDef`. """ @@ -122,7 +120,7 @@ def _cal_hessian_all( box: Optional[torch.Tensor] = None, fparam: Optional[torch.Tensor] = None, aparam: Optional[torch.Tensor] = None, - ) -> Dict[str, torch.Tensor]: + ) -> dict[str, torch.Tensor]: nf, nloc = atype.shape coord = coord.view([nf, (nloc * 3)]) box = box.view([nf, 9]) if box is not None else None @@ -130,7 +128,7 @@ def _cal_hessian_all( aparam = aparam.view([nf, nloc, -1]) if aparam is not None else None fdef = self.atomic_output_def() # keys of values that require hessian - hess_keys: List[str] = [] + hess_keys: list[str] = [] for kk in fdef.keys(): if fdef[kk].r_hessian: hess_keys.append(kk) diff --git a/deepmd/pt/model/model/make_model.py b/deepmd/pt/model/model/make_model.py index 8207f4961e..46b7e51109 100644 --- a/deepmd/pt/model/model/make_model.py +++ b/deepmd/pt/model/model/make_model.py @@ -1,10 +1,6 @@ # SPDX-License-Identifier: LGPL-3.0-or-later from typing import ( - Dict, - List, Optional, - Tuple, - Type, ) import torch @@ -43,7 +39,7 @@ ) -def make_model(T_AtomicModel: Type[BaseAtomicModel]): +def make_model(T_AtomicModel: type[BaseAtomicModel]): """Make a model as a derived class of an atomic model. The model provide two interfaces. 
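The `make_model` factory above now takes `type[BaseAtomicModel]` instead of `typing.Type[...]`. A minimal sketch of the pattern, runnable on Python 3.9+; the stand-in base class and `make_model_sketch` below are illustrative, not DeePMD-kit code:

```python
class BaseAtomicModel:
    """Stand-in for the real atomic model base class (illustrative only)."""


def make_model_sketch(t_atomic_model: type[BaseAtomicModel]):
    # `type[...]` (PEP 585) replaces `typing.Type[...]`: the argument is
    # the class object itself, and the factory derives a new class from it.
    class CM(t_atomic_model):
        def model_output_type(self) -> list[str]:  # was List[str]
            return ["energy"]

    return CM


ModelCls = make_model_sketch(BaseAtomicModel)
print(ModelCls().model_output_type())  # ['energy']
```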
@@ -89,13 +85,13 @@ def model_output_def(self): return ModelOutputDef(self.atomic_output_def()) @torch.jit.export - def model_output_type(self) -> List[str]: + def model_output_type(self) -> list[str]: """Get the output type for the model.""" output_def = self.model_output_def() var_defs = output_def.var_defs # jit: Comprehension ifs are not supported yet # type hint is critical for JIT - vars: List[str] = [] + vars: list[str] = [] for kk, vv in var_defs.items(): # .value is critical for JIT if vv.category == OutputVariableCategory.OUT.value: @@ -111,7 +107,7 @@ def forward_common( fparam: Optional[torch.Tensor] = None, aparam: Optional[torch.Tensor] = None, do_atomic_virial: bool = False, - ) -> Dict[str, torch.Tensor]: + ) -> dict[str, torch.Tensor]: """Return model prediction. Parameters @@ -133,7 +129,7 @@ def forward_common( Returns ------- ret_dict - The result dict of type Dict[str,torch.Tensor]. + The result dict of type dict[str,torch.Tensor]. The keys are defined by the `ModelOutputDef`. """ @@ -187,11 +183,11 @@ def change_out_bias( Parameters ---------- - merged : Union[Callable[[], List[dict]], List[dict]] - - List[dict]: A list of data samples from various data systems. + merged : Union[Callable[[], list[dict]], list[dict]] + - list[dict]: A list of data samples from various data systems. Each element, `merged[i]`, is a data dictionary containing `keys`: `torch.Tensor` originating from the `i`-th data system. - - Callable[[], List[dict]]: A lazy function that returns data samples in the above format + - Callable[[], list[dict]]: A lazy function that returns data samples in the above format only when needed. Since the sampling process can be slow and memory-intensive, the lazy function helps by only sampling once. bias_adjust_mode : str @@ -214,7 +210,7 @@ def forward_common_lower( fparam: Optional[torch.Tensor] = None, aparam: Optional[torch.Tensor] = None, do_atomic_virial: bool = False, - comm_dict: Optional[Dict[str, torch.Tensor]] = None, + comm_dict: Optional[dict[str, torch.Tensor]] = None, extra_nlist_sort: bool = False, ): """Return model prediction. Lower interface that takes @@ -283,7 +279,7 @@ def input_type_cast( box: Optional[torch.Tensor] = None, fparam: Optional[torch.Tensor] = None, aparam: Optional[torch.Tensor] = None, - ) -> Tuple[ + ) -> tuple[ torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor], @@ -302,7 +298,7 @@ def input_type_cast( # " does not match" # f" that of the coordinate {input_prec}" # ) - _lst: List[Optional[torch.Tensor]] = [ + _lst: list[Optional[torch.Tensor]] = [ vv.to(coord.dtype) if vv is not None else None for vv in [box, fparam, aparam] ] @@ -324,9 +320,9 @@ def input_type_cast( def output_type_cast( self, - model_ret: Dict[str, torch.Tensor], + model_ret: dict[str, torch.Tensor], input_prec: str, - ) -> Dict[str, torch.Tensor]: + ) -> dict[str, torch.Tensor]: """Convert the model output to the input prec.""" do_cast = ( input_prec @@ -469,7 +465,7 @@ def do_grad_c( return self.atomic_model.do_grad_c(var_name) def change_type_map( - self, type_map: List[str], model_with_new_type_stat=None + self, type_map: list[str], model_with_new_type_stat=None ) -> None: """Change the type related params to new ones, according to `type_map` and the original one in the model. If there are new types in `type_map`, statistics will be updated accordingly to `model_with_new_type_stat` for these new types. 
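The `# jit: Comprehension ifs are not supported yet` comment in the hunk above refers to a TorchScript limitation: filtered list comprehensions are rejected, so the list is built in an explicit loop with a declared element type. A small self-contained sketch of the same constraint on a recent PyTorch; the function name is illustrative:

```python
import torch


@torch.jit.script
def filter_out_vars(names: list[str], categories: list[int], keep: int) -> list[str]:
    # TorchScript rejects `[n for n, c in ... if c == keep]`, so build the
    # list in a loop; the explicit `list[str]` annotation is required.
    result: list[str] = []
    for i in range(len(names)):
        if categories[i] == keep:
            result.append(names[i])
    return result


print(filter_out_vars(["energy", "mask"], [0, 1], 0))  # ['energy']
```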
@@ -499,7 +495,7 @@ def get_dim_aparam(self) -> int: return self.atomic_model.get_dim_aparam() @torch.jit.export - def get_sel_type(self) -> List[int]: + def get_sel_type(self) -> list[int]: """Get the selected atom types of this model. Only atoms with selected atom types have atomic contribution @@ -522,7 +518,7 @@ def get_rcut(self) -> float: return self.atomic_model.get_rcut() @torch.jit.export - def get_type_map(self) -> List[str]: + def get_type_map(self) -> list[str]: """Get the type map.""" return self.atomic_model.get_type_map() @@ -548,7 +544,7 @@ def compute_or_load_stat( """Compute or load the statistics.""" return self.atomic_model.compute_or_load_stat(sampled_func, stat_file_path) - def get_sel(self) -> List[int]: + def get_sel(self) -> list[int]: """Returns the number of selected atoms for each type.""" return self.atomic_model.get_sel() @@ -581,7 +577,7 @@ def forward( fparam: Optional[torch.Tensor] = None, aparam: Optional[torch.Tensor] = None, do_atomic_virial: bool = False, - ) -> Dict[str, torch.Tensor]: + ) -> dict[str, torch.Tensor]: # directly call the forward_common method when no specific transform rule return self.forward_common( coord, diff --git a/deepmd/pt/model/model/polar_model.py b/deepmd/pt/model/model/polar_model.py index 7fbb7bdcf4..57379ba372 100644 --- a/deepmd/pt/model/model/polar_model.py +++ b/deepmd/pt/model/model/polar_model.py @@ -3,7 +3,6 @@ deepcopy, ) from typing import ( - Dict, Optional, ) @@ -56,7 +55,7 @@ def forward( fparam: Optional[torch.Tensor] = None, aparam: Optional[torch.Tensor] = None, do_atomic_virial: bool = False, - ) -> Dict[str, torch.Tensor]: + ) -> dict[str, torch.Tensor]: model_ret = self.forward_common( coord, atype, diff --git a/deepmd/pt/model/model/property_model.py b/deepmd/pt/model/model/property_model.py index a5b52139fe..164331f44c 100644 --- a/deepmd/pt/model/model/property_model.py +++ b/deepmd/pt/model/model/property_model.py @@ -3,7 +3,6 @@ deepcopy, ) from typing import ( - Dict, Optional, ) @@ -56,7 +55,7 @@ def forward( fparam: Optional[torch.Tensor] = None, aparam: Optional[torch.Tensor] = None, do_atomic_virial: bool = False, - ) -> Dict[str, torch.Tensor]: + ) -> dict[str, torch.Tensor]: model_ret = self.forward_common( coord, atype, @@ -92,7 +91,7 @@ def forward_lower( fparam: Optional[torch.Tensor] = None, aparam: Optional[torch.Tensor] = None, do_atomic_virial: bool = False, - comm_dict: Optional[Dict[str, torch.Tensor]] = None, + comm_dict: Optional[dict[str, torch.Tensor]] = None, ): model_ret = self.forward_common_lower( extended_coord, diff --git a/deepmd/pt/model/model/spin_model.py b/deepmd/pt/model/model/spin_model.py index 717a7ee7c8..a9f6e4d75a 100644 --- a/deepmd/pt/model/model/spin_model.py +++ b/deepmd/pt/model/model/spin_model.py @@ -4,8 +4,6 @@ deepcopy, ) from typing import ( - Dict, - List, Optional, ) @@ -258,7 +256,7 @@ def expand_aparam(aparam, nloc: int): return aparam @torch.jit.export - def get_type_map(self) -> List[str]: + def get_type_map(self) -> list[str]: """Get the type map.""" tmap = self.backbone_model.get_type_map() ntypes = len(tmap) // 2 # ignore the virtual type @@ -285,7 +283,7 @@ def get_dim_aparam(self): return self.backbone_model.get_dim_aparam() @torch.jit.export - def get_sel_type(self) -> List[int]: + def get_sel_type(self) -> list[int]: """Get the selected atom types of this model. Only atoms with selected atom types have atomic contribution to the result of the model. 
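The `merged` argument convention documented in `change_out_bias` above (either the sampled data itself, or a zero-argument callable that produces it lazily) can be sketched as follows; `sample_all_systems` and `resolve_merged` are hypothetical names, not library API:

```python
from typing import Callable, Union

import torch


def sample_all_systems() -> list[dict]:
    # hypothetical, expensive sampling step; runs at most once
    return [{"energy": torch.zeros(10, 1)}]


def resolve_merged(
    merged: Union[Callable[[], list[dict]], list[dict]],
) -> list[dict]:
    # Accept either the data itself or a callable producing it.
    return merged() if callable(merged) else merged


data = resolve_merged(sample_all_systems)  # sampling deferred until here
print(len(data))  # 1
```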
@@ -301,7 +299,7 @@ def is_aparam_nall(self) -> bool: return self.backbone_model.is_aparam_nall() @torch.jit.export - def model_output_type(self) -> List[str]: + def model_output_type(self) -> list[str]: """Get the output type for the model.""" return self.backbone_model.model_output_type() @@ -422,7 +420,7 @@ def forward_common( fparam: Optional[torch.Tensor] = None, aparam: Optional[torch.Tensor] = None, do_atomic_virial: bool = False, - ) -> Dict[str, torch.Tensor]: + ) -> dict[str, torch.Tensor]: nframes, nloc = atype.shape coord_updated, atype_updated = self.process_spin_input(coord, atype, spin) if aparam is not None: @@ -576,7 +574,7 @@ def forward( fparam: Optional[torch.Tensor] = None, aparam: Optional[torch.Tensor] = None, do_atomic_virial: bool = False, - ) -> Dict[str, torch.Tensor]: + ) -> dict[str, torch.Tensor]: model_ret = self.forward_common( coord, atype, diff --git a/deepmd/pt/model/model/transform_output.py b/deepmd/pt/model/model/transform_output.py index e8afab15c4..e15eda6a1d 100644 --- a/deepmd/pt/model/model/transform_output.py +++ b/deepmd/pt/model/model/transform_output.py @@ -1,7 +1,5 @@ # SPDX-License-Identifier: LGPL-3.0-or-later from typing import ( - Dict, - List, Optional, ) @@ -31,7 +29,7 @@ def atomic_virial_corr( ce = coord * atom_energy sumce0, sumce1, sumce2 = torch.split(torch.sum(ce, dim=1), [1, 1, 1], dim=-1) faked_grad = torch.ones_like(sumce0) - lst = torch.jit.annotate(List[Optional[torch.Tensor]], [faked_grad]) + lst = torch.jit.annotate(list[Optional[torch.Tensor]], [faked_grad]) extended_virial_corr0 = torch.autograd.grad( [sumce0], [extended_coord], @@ -76,7 +74,7 @@ def task_deriv_one( create_graph: bool = True, ): faked_grad = torch.ones_like(energy) - lst = torch.jit.annotate(List[Optional[torch.Tensor]], [faked_grad]) + lst = torch.jit.annotate(list[Optional[torch.Tensor]], [faked_grad]) extended_force = torch.autograd.grad( [energy], [extended_coord], @@ -153,12 +151,12 @@ def take_deriv( def fit_output_to_model_output( - fit_ret: Dict[str, torch.Tensor], + fit_ret: dict[str, torch.Tensor], fit_output_def: FittingOutputDef, coord_ext: torch.Tensor, do_atomic_virial: bool = False, create_graph: bool = True, -) -> Dict[str, torch.Tensor]: +) -> dict[str, torch.Tensor]: """Transform the output of the fitting network to the model output. @@ -197,11 +195,11 @@ def fit_output_to_model_output( def communicate_extended_output( - model_ret: Dict[str, torch.Tensor], + model_ret: dict[str, torch.Tensor], model_output_def: ModelOutputDef, mapping: torch.Tensor, # nf x nloc do_atomic_virial: bool = False, -) -> Dict[str, torch.Tensor]: +) -> dict[str, torch.Tensor]: """Transform the output of the model network defined on local and ghost (extended) atoms to local atoms. 
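The `torch.jit.annotate(list[Optional[torch.Tensor]], [faked_grad])` lines in `transform_output.py` above seed `torch.autograd.grad` with a ones tensor while pinning the list-of-optional-tensors type for TorchScript. A minimal eager-mode sketch of that pattern, with toy inputs:

```python
from typing import Optional

import torch

coord = torch.randn(4, 3, requires_grad=True)
energy = (coord**2).sum()

# Seed autograd with a ones-like "fake gradient"; torch.jit.annotate
# fixes the element type so the call also compiles under TorchScript.
faked_grad = torch.ones_like(energy)
lst = torch.jit.annotate(list[Optional[torch.Tensor]], [faked_grad])
(force,) = torch.autograd.grad(
    [energy], [coord], grad_outputs=lst, create_graph=True
)
print(force.shape)  # torch.Size([4, 3])
```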
diff --git a/deepmd/pt/model/network/layernorm.py b/deepmd/pt/model/network/layernorm.py index c1c2c29c87..76ce90b627 100644 --- a/deepmd/pt/model/network/layernorm.py +++ b/deepmd/pt/model/network/layernorm.py @@ -1,6 +1,5 @@ # SPDX-License-Identifier: LGPL-3.0-or-later from typing import ( - List, Optional, Union, ) @@ -45,7 +44,7 @@ def __init__( stddev: float = 1.0, precision: str = DEFAULT_PRECISION, trainable: bool = True, - seed: Optional[Union[int, List[int]]] = None, + seed: Optional[Union[int, list[int]]] = None, ): super().__init__() self.eps = eps diff --git a/deepmd/pt/model/network/mlp.py b/deepmd/pt/model/network/mlp.py index 090d64fbcf..f2137bd004 100644 --- a/deepmd/pt/model/network/mlp.py +++ b/deepmd/pt/model/network/mlp.py @@ -1,8 +1,6 @@ # SPDX-License-Identifier: LGPL-3.0-or-later from typing import ( ClassVar, - Dict, - List, Optional, Union, ) @@ -83,7 +81,7 @@ def __init__( stddev: float = 1.0, precision: str = DEFAULT_PRECISION, init: str = "default", - seed: Optional[Union[int, List[int]]] = None, + seed: Optional[Union[int, list[int]]] = None, ): super().__init__() # only use_timestep when skip connection is established. @@ -297,7 +295,7 @@ def __init__(self, *args, **kwargs): class NetworkCollection(DPNetworkCollection, nn.Module): """PyTorch implementation of NetworkCollection.""" - NETWORK_TYPE_MAP: ClassVar[Dict[str, type]] = { + NETWORK_TYPE_MAP: ClassVar[dict[str, type]] = { "network": MLP, "embedding_network": EmbeddingNet, "fitting_network": FittingNet, diff --git a/deepmd/pt/model/network/network.py b/deepmd/pt/model/network/network.py index 0c21a9814b..ef50274b03 100644 --- a/deepmd/pt/model/network/network.py +++ b/deepmd/pt/model/network/network.py @@ -1,6 +1,5 @@ # SPDX-License-Identifier: LGPL-3.0-or-later from typing import ( - List, Optional, Union, ) @@ -571,7 +570,7 @@ def __init__( bavg=0.0, stddev=1.0, precision="default", - seed: Optional[Union[int, List[int]]] = None, + seed: Optional[Union[int, list[int]]] = None, use_econf_tebd=False, use_tebd_bias: bool = False, type_map=None, @@ -627,7 +626,7 @@ def share_params(self, base_class, shared_level, resume=False): raise NotImplementedError def change_type_map( - self, type_map: List[str], model_with_new_type_stat=None + self, type_map: list[str], model_with_new_type_stat=None ) -> None: """Change the type related params to new ones, according to `type_map` and the original one in the model. If there are new types in `type_map`, statistics will be updated accordingly to `model_with_new_type_stat` for these new types. @@ -660,7 +659,7 @@ class TypeEmbedNetConsistent(nn.Module): Whether to use electronic configuration type embedding. use_tebd_bias : bool, Optional Whether to use bias in the type embedding layer. - type_map: List[str], Optional + type_map: list[str], Optional A list of strings. Give the name to each type of atoms. 
""" @@ -668,16 +667,16 @@ def __init__( self, *, ntypes: int, - neuron: List[int], + neuron: list[int], resnet_dt: bool = False, activation_function: str = "tanh", precision: str = "default", trainable: bool = True, - seed: Optional[Union[int, List[int]]] = None, + seed: Optional[Union[int, list[int]]] = None, padding: bool = False, use_econf_tebd: bool = False, use_tebd_bias: bool = False, - type_map: Optional[List[str]] = None, + type_map: Optional[list[str]] = None, ): """Construct a type embedding net.""" super().__init__() @@ -734,7 +733,7 @@ def forward(self, device: torch.device): return embed def change_type_map( - self, type_map: List[str], model_with_new_type_stat=None + self, type_map: list[str], model_with_new_type_stat=None ) -> None: """Change the type related params to new ones, according to `type_map` and the original one in the model. If there are new types in `type_map`, statistics will be updated accordingly to `model_with_new_type_stat` for these new types. diff --git a/deepmd/pt/model/task/dipole.py b/deepmd/pt/model/task/dipole.py index 30c5a341a7..56b14677b9 100644 --- a/deepmd/pt/model/task/dipole.py +++ b/deepmd/pt/model/task/dipole.py @@ -3,7 +3,6 @@ import logging from typing import ( Callable, - List, Optional, Union, ) @@ -45,7 +44,7 @@ class DipoleFittingNet(GeneralFitting): Embedding width per atom. embedding_width : int The dimension of rotation matrix, m1. - neuron : List[int] + neuron : list[int] Number of neurons in each hidden layers of the fitting net. resnet_dt : bool Using time-step in the ResNet construction. @@ -70,7 +69,7 @@ class DipoleFittingNet(GeneralFitting): c_differentiable If the variable is differentiated with respect to the cell tensor (pbc case). Only reducible variable are differentiable. - type_map: List[str], Optional + type_map: list[str], Optional A list of strings. Give the name to each type of atoms. """ @@ -79,7 +78,7 @@ def __init__( ntypes: int, dim_descrpt: int, embedding_width: int, - neuron: List[int] = [128, 128, 128], + neuron: list[int] = [128, 128, 128], resnet_dt: bool = True, numb_fparam: int = 0, numb_aparam: int = 0, @@ -87,11 +86,11 @@ def __init__( precision: str = DEFAULT_PRECISION, mixed_types: bool = True, rcond: Optional[float] = None, - seed: Optional[Union[int, List[int]]] = None, - exclude_types: List[int] = [], + seed: Optional[Union[int, list[int]]] = None, + exclude_types: list[int] = [], r_differentiable: bool = True, c_differentiable: bool = True, - type_map: Optional[List[str]] = None, + type_map: Optional[list[str]] = None, **kwargs, ): self.embedding_width = embedding_width @@ -151,7 +150,7 @@ def output_def(self) -> FittingOutputDef: def compute_output_stats( self, - merged: Union[Callable[[], List[dict]], List[dict]], + merged: Union[Callable[[], list[dict]], list[dict]], stat_file_path: Optional[DPPath] = None, ): """ @@ -159,11 +158,11 @@ def compute_output_stats( Parameters ---------- - merged : Union[Callable[[], List[dict]], List[dict]] - - List[dict]: A list of data samples from various data systems. + merged : Union[Callable[[], list[dict]], list[dict]] + - list[dict]: A list of data samples from various data systems. Each element, `merged[i]`, is a data dictionary containing `keys`: `torch.Tensor` originating from the `i`-th data system. - - Callable[[], List[dict]]: A lazy function that returns data samples in the above format + - Callable[[], list[dict]]: A lazy function that returns data samples in the above format only when needed. 
Since the sampling process can be slow and memory-intensive, the lazy function helps by only sampling once. stat_file_path : Optional[DPPath] @@ -197,4 +196,4 @@ def forward( return {self.var_name: out.to(env.GLOBAL_PT_FLOAT_PRECISION)} # make jit happy with torch 2.0.0 - exclude_types: List[int] + exclude_types: list[int] diff --git a/deepmd/pt/model/task/dos.py b/deepmd/pt/model/task/dos.py index c27e287728..4f69094b0d 100644 --- a/deepmd/pt/model/task/dos.py +++ b/deepmd/pt/model/task/dos.py @@ -2,7 +2,6 @@ import copy import logging from typing import ( - List, Optional, Union, ) @@ -45,19 +44,19 @@ def __init__( ntypes: int, dim_descrpt: int, numb_dos: int = 300, - neuron: List[int] = [128, 128, 128], + neuron: list[int] = [128, 128, 128], resnet_dt: bool = True, numb_fparam: int = 0, numb_aparam: int = 0, rcond: Optional[float] = None, bias_dos: Optional[torch.Tensor] = None, - trainable: Union[bool, List[bool]] = True, - seed: Optional[Union[int, List[int]]] = None, + trainable: Union[bool, list[bool]] = True, + seed: Optional[Union[int, list[int]]] = None, activation_function: str = "tanh", precision: str = DEFAULT_PRECISION, - exclude_types: List[int] = [], + exclude_types: list[int] = [], mixed_types: bool = True, - type_map: Optional[List[str]] = None, + type_map: Optional[list[str]] = None, ): if bias_dos is not None: self.bias_dos = bias_dos @@ -127,4 +126,4 @@ def serialize(self) -> dict: return dd # make jit happy with torch 2.0.0 - exclude_types: List[int] + exclude_types: list[int] diff --git a/deepmd/pt/model/task/ener.py b/deepmd/pt/model/task/ener.py index 1737e401fb..2048c05ba9 100644 --- a/deepmd/pt/model/task/ener.py +++ b/deepmd/pt/model/task/ener.py @@ -2,9 +2,7 @@ import copy import logging from typing import ( - List, Optional, - Tuple, Union, ) @@ -48,7 +46,7 @@ def __init__( self, ntypes: int, dim_descrpt: int, - neuron: List[int] = [128, 128, 128], + neuron: list[int] = [128, 128, 128], bias_atom_e: Optional[torch.Tensor] = None, resnet_dt: bool = True, numb_fparam: int = 0, @@ -56,8 +54,8 @@ def __init__( activation_function: str = "tanh", precision: str = DEFAULT_PRECISION, mixed_types: bool = True, - seed: Optional[Union[int, List[int]]] = None, - type_map: Optional[List[str]] = None, + seed: Optional[Union[int, list[int]]] = None, + type_map: Optional[list[str]] = None, **kwargs, ): super().__init__( @@ -94,7 +92,7 @@ def serialize(self) -> dict: } # make jit happy with torch 2.0.0 - exclude_types: List[int] + exclude_types: list[int] @Fitting.register("direct_force") @@ -185,11 +183,11 @@ def deserialize(self) -> "EnergyFittingNetDirect": raise NotImplementedError def change_type_map( - self, type_map: List[str], model_with_new_type_stat=None + self, type_map: list[str], model_with_new_type_stat=None ) -> None: raise NotImplementedError - def get_type_map(self) -> List[str]: + def get_type_map(self) -> list[str]: raise NotImplementedError def forward( @@ -201,7 +199,7 @@ def forward( h2: Optional[torch.Tensor] = None, fparam: Optional[torch.Tensor] = None, aparam: Optional[torch.Tensor] = None, - ) -> Tuple[torch.Tensor, None]: + ) -> tuple[torch.Tensor, None]: """Based on embedding net output, alculate total energy. 
Args: diff --git a/deepmd/pt/model/task/fitting.py b/deepmd/pt/model/task/fitting.py index 95242eb67c..1827569a17 100644 --- a/deepmd/pt/model/task/fitting.py +++ b/deepmd/pt/model/task/fitting.py @@ -5,7 +5,6 @@ abstractmethod, ) from typing import ( - List, Optional, Union, ) @@ -97,7 +96,7 @@ class GeneralFitting(Fitting): Embedding width per atom. dim_out : int The output dimension of the fitting net. - neuron : List[int] + neuron : list[int] Number of neurons in each hidden layers of the fitting net. bias_atom_e : torch.Tensor, optional Average enery per atom for each element. @@ -118,17 +117,17 @@ class GeneralFitting(Fitting): The condition number for the regression of atomic energy. seed : int, optional Random seed. - exclude_types: List[int] + exclude_types: list[int] Atomic contributions of the excluded atom types are set zero. - trainable : Union[List[bool], bool] + trainable : Union[list[bool], bool] If the parameters in the fitting net are trainable. Now this only supports setting all the parameters in the fitting net at one state. - When in List[bool], the trainable will be True only if all the boolean parameters are True. - remove_vaccum_contribution: List[bool], optional + When in list[bool], the trainable will be True only if all the boolean parameters are True. + remove_vaccum_contribution: list[bool], optional Remove vaccum contribution before the bias is added. The list assigned each type. For `mixed_types` provide `[True]`, otherwise it should be a list of the same length as `ntypes` signaling if or not removing the vaccum contribution for the atom types in the list. - type_map: List[str], Optional + type_map: list[str], Optional A list of strings. Give the name to each type of atoms. """ @@ -137,7 +136,7 @@ def __init__( var_name: str, ntypes: int, dim_descrpt: int, - neuron: List[int] = [128, 128, 128], + neuron: list[int] = [128, 128, 128], bias_atom_e: Optional[torch.Tensor] = None, resnet_dt: bool = True, numb_fparam: int = 0, @@ -146,11 +145,11 @@ def __init__( precision: str = DEFAULT_PRECISION, mixed_types: bool = True, rcond: Optional[float] = None, - seed: Optional[Union[int, List[int]]] = None, - exclude_types: List[int] = [], - trainable: Union[bool, List[bool]] = True, - remove_vaccum_contribution: Optional[List[bool]] = None, - type_map: Optional[List[str]] = None, + seed: Optional[Union[int, list[int]]] = None, + exclude_types: list[int] = [], + trainable: Union[bool, list[bool]] = True, + remove_vaccum_contribution: Optional[list[bool]] = None, + type_map: Optional[list[str]] = None, **kwargs, ): super().__init__() @@ -253,13 +252,13 @@ def __init__( def reinit_exclude( self, - exclude_types: List[int] = [], + exclude_types: list[int] = [], ): self.exclude_types = exclude_types self.emask = AtomExcludeMask(self.ntypes, self.exclude_types) def change_type_map( - self, type_map: List[str], model_with_new_type_stat=None + self, type_map: list[str], model_with_new_type_stat=None ) -> None: """Change the type related params to new ones, according to `type_map` and the original one in the model. If there are new types in `type_map`, statistics will be updated accordingly to `model_with_new_type_stat` for these new types. @@ -342,9 +341,9 @@ def get_dim_aparam(self) -> int: return self.numb_aparam # make jit happy - exclude_types: List[int] + exclude_types: list[int] - def get_sel_type(self) -> List[int]: + def get_sel_type(self) -> list[int]: """Get the selected atom types of this model. 
Only atoms with selected atom types have atomic contribution @@ -352,13 +351,13 @@ def get_sel_type(self) -> List[int]: If returning an empty list, all atom types are selected. """ # make jit happy - sel_type: List[int] = [] + sel_type: list[int] = [] for ii in range(self.ntypes): if ii not in self.exclude_types: sel_type.append(ii) return sel_type - def get_type_map(self) -> List[str]: + def get_type_map(self) -> list[str]: """Get the name to each type of atoms.""" return self.type_map diff --git a/deepmd/pt/model/task/invar_fitting.py b/deepmd/pt/model/task/invar_fitting.py index 36c416d6e5..230046b74b 100644 --- a/deepmd/pt/model/task/invar_fitting.py +++ b/deepmd/pt/model/task/invar_fitting.py @@ -2,7 +2,6 @@ import copy import logging from typing import ( - List, Optional, Union, ) @@ -48,7 +47,7 @@ class InvarFitting(GeneralFitting): Embedding width per atom. dim_out : int The output dimension of the fitting net. - neuron : List[int] + neuron : list[int] Number of neurons in each hidden layers of the fitting net. bias_atom_e : torch.Tensor, optional Average enery per atom for each element. @@ -69,14 +68,14 @@ class InvarFitting(GeneralFitting): The condition number for the regression of atomic energy. seed : int, optional Random seed. - exclude_types: List[int] + exclude_types: list[int] Atomic contributions of the excluded atom types are set zero. - atom_ener: List[Optional[torch.Tensor]], optional + atom_ener: list[Optional[torch.Tensor]], optional Specifying atomic energy contribution in vacuum. The value is a list specifying the bias. the elements can be None or np.array of output shape. For example: [None, [2.]] means type 0 is not set, type 1 is set to [2.] The `set_davg_zero` key in the descrptor should be set. - type_map: List[str], Optional + type_map: list[str], Optional A list of strings. Give the name to each type of atoms. """ @@ -87,7 +86,7 @@ def __init__( ntypes: int, dim_descrpt: int, dim_out: int, - neuron: List[int] = [128, 128, 128], + neuron: list[int] = [128, 128, 128], bias_atom_e: Optional[torch.Tensor] = None, resnet_dt: bool = True, numb_fparam: int = 0, @@ -96,10 +95,10 @@ def __init__( precision: str = DEFAULT_PRECISION, mixed_types: bool = True, rcond: Optional[float] = None, - seed: Optional[Union[int, List[int]]] = None, - exclude_types: List[int] = [], - atom_ener: Optional[List[Optional[torch.Tensor]]] = None, - type_map: Optional[List[str]] = None, + seed: Optional[Union[int, list[int]]] = None, + exclude_types: list[int] = [], + atom_ener: Optional[list[Optional[torch.Tensor]]] = None, + type_map: Optional[list[str]] = None, **kwargs, ): self.dim_out = dim_out @@ -179,4 +178,4 @@ def forward( return self._forward_common(descriptor, atype, gr, g2, h2, fparam, aparam) # make jit happy with torch 2.0.0 - exclude_types: List[int] + exclude_types: list[int] diff --git a/deepmd/pt/model/task/polarizability.py b/deepmd/pt/model/task/polarizability.py index 7345fa296c..a16ab886d4 100644 --- a/deepmd/pt/model/task/polarizability.py +++ b/deepmd/pt/model/task/polarizability.py @@ -2,7 +2,6 @@ import copy import logging from typing import ( - List, Optional, Union, ) @@ -47,7 +46,7 @@ class PolarFittingNet(GeneralFitting): Embedding width per atom. embedding_width : int The dimension of rotation matrix, m1. - neuron : List[int] + neuron : list[int] Number of neurons in each hidden layers of the fitting net. resnet_dt : bool Using time-step in the ResNet construction. 
@@ -69,11 +68,11 @@ class PolarFittingNet(GeneralFitting): fit_diag : bool Fit the diagonal part of the rotational invariant polarizability matrix, which will be converted to normal polarizability matrix by contracting with the rotation matrix. - scale : List[float] + scale : list[float] The output of the fitting net (polarizability matrix) for type i atom will be scaled by scale[i] shift_diag : bool Whether to shift the diagonal part of the polarizability matrix. The shift operation is carried out after scale. - type_map: List[str], Optional + type_map: list[str], Optional A list of strings. Give the name to each type of atoms. """ @@ -83,7 +82,7 @@ def __init__( ntypes: int, dim_descrpt: int, embedding_width: int, - neuron: List[int] = [128, 128, 128], + neuron: list[int] = [128, 128, 128], resnet_dt: bool = True, numb_fparam: int = 0, numb_aparam: int = 0, @@ -91,12 +90,12 @@ def __init__( precision: str = DEFAULT_PRECISION, mixed_types: bool = True, rcond: Optional[float] = None, - seed: Optional[Union[int, List[int]]] = None, - exclude_types: List[int] = [], + seed: Optional[Union[int, list[int]]] = None, + exclude_types: list[int] = [], fit_diag: bool = True, - scale: Optional[Union[List[float], float]] = None, + scale: Optional[Union[list[float], float]] = None, shift_diag: bool = True, - type_map: Optional[List[str]] = None, + type_map: Optional[list[str]] = None, **kwargs, ): self.embedding_width = embedding_width @@ -162,7 +161,7 @@ def __getitem__(self, key): return super().__getitem__(key) def change_type_map( - self, type_map: List[str], model_with_new_type_stat=None + self, type_map: list[str], model_with_new_type_stat=None ) -> None: """Change the type related params to new ones, according to `type_map` and the original one in the model. If there are new types in `type_map`, statistics will be updated accordingly to `model_with_new_type_stat` for these new types. @@ -258,4 +257,4 @@ def forward( return {"polarizability": out.to(env.GLOBAL_PT_FLOAT_PRECISION)} # make jit happy with torch 2.0.0 - exclude_types: List[int] + exclude_types: list[int] diff --git a/deepmd/pt/model/task/property.py b/deepmd/pt/model/task/property.py index 804383c57f..cc6a4e8745 100644 --- a/deepmd/pt/model/task/property.py +++ b/deepmd/pt/model/task/property.py @@ -2,7 +2,6 @@ import copy import logging from typing import ( - List, Optional, ) @@ -46,7 +45,7 @@ class PropertyFittingNet(InvarFitting): Embedding width per atom. task_dim : int The dimension of outputs of fitting net. - neuron : List[int] + neuron : list[int] Number of neurons in each hidden layers of the fitting net. bias_atom_p : torch.Tensor, optional Average property per atom for each element. 
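A recurring detail in these fitting-net diffs is the trailing `exclude_types: list[int]` class-level annotation marked `# make jit happy with torch 2.0.0`: TorchScript wants the attribute's type declared on the class. A toy module showing the idea; `TinyFitting` is hypothetical, not DeePMD-kit code:

```python
import torch


class TinyFitting(torch.nn.Module):
    # Class-level annotation so the TorchScript compiler knows the
    # attribute's type up front, mirroring the fitting nets above.
    exclude_types: list[int]

    def __init__(self, exclude_types: list[int]):
        super().__init__()
        self.exclude_types = exclude_types

    def forward(self, atype: torch.Tensor) -> torch.Tensor:
        mask = torch.ones_like(atype, dtype=torch.bool)
        for t in self.exclude_types:
            mask = mask & (atype != t)
        return mask


scripted = torch.jit.script(TinyFitting([1]))
print(scripted(torch.tensor([0, 1, 2])))  # tensor([ True, False,  True])
```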
@@ -78,7 +77,7 @@ def __init__( ntypes: int, dim_descrpt: int, task_dim: int = 1, - neuron: List[int] = [128, 128, 128], + neuron: list[int] = [128, 128, 128], bias_atom_p: Optional[torch.Tensor] = None, intensive: bool = False, bias_method: str = "normal", @@ -149,4 +148,4 @@ def serialize(self) -> dict: return dd # make jit happy with torch 2.0.0 - exclude_types: List[int] + exclude_types: list[int] diff --git a/deepmd/pt/train/training.py b/deepmd/pt/train/training.py index 9bdc80195f..4d746e84c0 100644 --- a/deepmd/pt/train/training.py +++ b/deepmd/pt/train/training.py @@ -10,7 +10,6 @@ ) from typing import ( Any, - Dict, ) import numpy as np @@ -88,7 +87,7 @@ class Trainer: def __init__( self, - config: Dict[str, Any], + config: dict[str, Any], training_data, stat_file_path=None, validation_data=None, @@ -890,8 +889,9 @@ def log_loss_valid(_task_key="Default"): ) # the first training time is not accurate if ( - _step_id + 1 - ) > self.disp_freq or self.num_steps < 2 * self.disp_freq: + (_step_id + 1 - self.start_step) > self.disp_freq + or self.num_steps - self.start_step < 2 * self.disp_freq + ): self.total_train_time += train_time if fout: @@ -982,13 +982,14 @@ def log_loss_valid(_task_key="Default"): with open("checkpoint", "w") as f: f.write(str(self.latest_model)) - if self.timing_in_training and self.num_steps // self.disp_freq > 0: - if self.num_steps >= 2 * self.disp_freq: + elapsed_batch = self.num_steps - self.start_step + if self.timing_in_training and elapsed_batch // self.disp_freq > 0: + if elapsed_batch >= 2 * self.disp_freq: log.info( "average training time: %.4f s/batch (exclude first %d batches)", self.total_train_time / ( - self.num_steps // self.disp_freq * self.disp_freq + elapsed_batch // self.disp_freq * self.disp_freq - self.disp_freq ), self.disp_freq, @@ -997,7 +998,7 @@ def log_loss_valid(_task_key="Default"): log.info( "average training time: %.4f s/batch", self.total_train_time - / (self.num_steps // self.disp_freq * self.disp_freq), + / (elapsed_batch // self.disp_freq * self.disp_freq), ) if JIT: diff --git a/deepmd/pt/train/wrapper.py b/deepmd/pt/train/wrapper.py index 6bc7cdc87a..922ac296ea 100644 --- a/deepmd/pt/train/wrapper.py +++ b/deepmd/pt/train/wrapper.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: LGPL-3.0-or-later import logging from typing import ( - Dict, Optional, Union, ) @@ -18,8 +17,8 @@ class ModelWrapper(torch.nn.Module): def __init__( self, - model: Union[torch.nn.Module, Dict], - loss: Union[torch.nn.Module, Dict] = None, + model: Union[torch.nn.Module, dict], + loss: Union[torch.nn.Module, dict] = None, model_params=None, shared_links=None, ): @@ -183,12 +182,12 @@ def forward( ) return model_pred, loss, more_loss - def set_extra_state(self, state: Dict): + def set_extra_state(self, state: dict): self.model_params = state["model_params"] self.train_infos = state["train_infos"] return None - def get_extra_state(self) -> Dict: + def get_extra_state(self) -> dict: state = { "model_params": self.model_params, "train_infos": self.train_infos, diff --git a/deepmd/pt/utils/dataloader.py b/deepmd/pt/utils/dataloader.py index 6a37a4a843..c7f44cfb70 100644 --- a/deepmd/pt/utils/dataloader.py +++ b/deepmd/pt/utils/dataloader.py @@ -9,9 +9,6 @@ from threading import ( Thread, ) -from typing import ( - List, -) import h5py import numpy as np @@ -86,7 +83,7 @@ def __init__( with h5py.File(systems) as file: systems = [os.path.join(systems, item) for item in file.keys()] - self.systems: List[DeepmdDataSetForLoader] = [] + self.systems:
list[DeepmdDataSetForLoader] = [] if len(systems) >= 100: log.info(f"Constructing DataLoaders from {len(systems)} systems") @@ -106,7 +103,7 @@ def construct_dataset(system): ) as pool: self.systems = pool.map(construct_dataset, systems) - self.sampler_list: List[DistributedSampler] = [] + self.sampler_list: list[DistributedSampler] = [] self.index = [] self.total_batch = 0 @@ -178,7 +175,7 @@ def __getitem__(self, idx): batch["sid"] = idx return batch - def add_data_requirement(self, data_requirement: List[DataRequirementItem]): + def add_data_requirement(self, data_requirement: list[DataRequirementItem]): """Add data requirement for each system in multiple systems.""" for system in self.systems: system.add_data_requirement(data_requirement) @@ -186,7 +183,7 @@ def add_data_requirement(self, data_requirement: List[DataRequirementItem]): def print_summary( self, name: str, - prob: List[float], + prob: list[float], ): print_summary( name, diff --git a/deepmd/pt/utils/dataset.py b/deepmd/pt/utils/dataset.py index dbe4d92a0f..4a29f3f045 100644 --- a/deepmd/pt/utils/dataset.py +++ b/deepmd/pt/utils/dataset.py @@ -2,7 +2,6 @@ from typing import ( - List, Optional, ) @@ -17,7 +16,7 @@ class DeepmdDataSetForLoader(Dataset): - def __init__(self, system: str, type_map: Optional[List[str]] = None): + def __init__(self, system: str, type_map: Optional[list[str]] = None): """Construct DeePMD-style dataset containing frames cross different systems. Args: @@ -41,7 +40,7 @@ def __getitem__(self, index): b_data["natoms"] = self._natoms_vec return b_data - def add_data_requirement(self, data_requirement: List[DataRequirementItem]): + def add_data_requirement(self, data_requirement: list[DataRequirementItem]): """Add data requirement for this data system.""" for data_item in data_requirement: self._data_system.add( diff --git a/deepmd/pt/utils/env_mat_stat.py b/deepmd/pt/utils/env_mat_stat.py index 9eaea16c3e..cc30bd5155 100644 --- a/deepmd/pt/utils/env_mat_stat.py +++ b/deepmd/pt/utils/env_mat_stat.py @@ -1,10 +1,9 @@ # SPDX-License-Identifier: LGPL-3.0-or-later +from collections.abc import ( + Iterator, +) from typing import ( TYPE_CHECKING, - Dict, - Iterator, - List, - Tuple, Union, ) @@ -38,7 +37,7 @@ class EnvMatStat(BaseEnvMatStat): - def compute_stat(self, env_mat: Dict[str, torch.Tensor]) -> Dict[str, StatItem]: + def compute_stat(self, env_mat: dict[str, torch.Tensor]) -> dict[str, StatItem]: """Compute the statistics of the environment matrix for a single system. Parameters @@ -48,7 +47,7 @@ def compute_stat(self, env_mat: Dict[str, torch.Tensor]) -> Dict[str, StatItem]: Returns ------- - Dict[str, StatItem] + dict[str, StatItem] The statistics of the environment matrix. """ stats = {} @@ -78,18 +77,18 @@ def __init__(self, descriptor: "DescriptorBlock"): ) # se_r=1, se_a=4 def iter( - self, data: List[Dict[str, Union[torch.Tensor, List[Tuple[int, int]]]]] - ) -> Iterator[Dict[str, StatItem]]: + self, data: list[dict[str, Union[torch.Tensor, list[tuple[int, int]]]]] + ) -> Iterator[dict[str, StatItem]]: """Get the iterator of the environment matrix. Parameters ---------- - data : List[Dict[str, Union[torch.Tensor, List[Tuple[int, int]]]]] + data : list[dict[str, Union[torch.Tensor, list[tuple[int, int]]]]] The data. Yields ------ - Dict[str, StatItem] + dict[str, StatItem] The statistics of the environment matrix. 
""" zero_mean = torch.zeros( diff --git a/deepmd/pt/utils/exclude_mask.py b/deepmd/pt/utils/exclude_mask.py index c3f3f8eb2f..a5de969c07 100644 --- a/deepmd/pt/utils/exclude_mask.py +++ b/deepmd/pt/utils/exclude_mask.py @@ -1,9 +1,4 @@ # SPDX-License-Identifier: LGPL-3.0-or-later -from typing import ( - List, - Set, - Tuple, -) import numpy as np import torch @@ -19,7 +14,7 @@ class AtomExcludeMask(torch.nn.Module): def __init__( self, ntypes: int, - exclude_types: List[int] = [], + exclude_types: list[int] = [], ): super().__init__() self.reinit(ntypes, exclude_types) @@ -27,7 +22,7 @@ def __init__( def reinit( self, ntypes: int, - exclude_types: List[int] = [], + exclude_types: list[int] = [], ): self.ntypes = ntypes self.exclude_types = exclude_types @@ -72,7 +67,7 @@ class PairExcludeMask(torch.nn.Module): def __init__( self, ntypes: int, - exclude_types: List[Tuple[int, int]] = [], + exclude_types: list[tuple[int, int]] = [], ): super().__init__() self.reinit(ntypes, exclude_types) @@ -80,10 +75,10 @@ def __init__( def reinit( self, ntypes: int, - exclude_types: List[Tuple[int, int]] = [], + exclude_types: list[tuple[int, int]] = [], ): self.ntypes = ntypes - self._exclude_types: Set[Tuple[int, int]] = set() + self._exclude_types: set[tuple[int, int]] = set() for tt in exclude_types: assert len(tt) == 2 self._exclude_types.add((tt[0], tt[1])) diff --git a/deepmd/pt/utils/neighbor_stat.py b/deepmd/pt/utils/neighbor_stat.py index d5b5c74bdc..d427dc758a 100644 --- a/deepmd/pt/utils/neighbor_stat.py +++ b/deepmd/pt/utils/neighbor_stat.py @@ -1,8 +1,9 @@ # SPDX-License-Identifier: LGPL-3.0-or-later -from typing import ( +from collections.abc import ( Iterator, +) +from typing import ( Optional, - Tuple, ) import numpy as np @@ -52,7 +53,7 @@ def forward( coord: torch.Tensor, atype: torch.Tensor, cell: Optional[torch.Tensor], - ) -> Tuple[torch.Tensor, torch.Tensor]: + ) -> tuple[torch.Tensor, torch.Tensor]: """Calculate the neareest neighbor distance between atoms, maximum nbor size of atoms and the output data range of the environment matrix. @@ -139,7 +140,7 @@ def __init__( def iterator( self, data: DeepmdDataSystem - ) -> Iterator[Tuple[np.ndarray, float, str]]: + ) -> Iterator[tuple[np.ndarray, float, str]]: """Abstract method for producing data. Yields diff --git a/deepmd/pt/utils/nlist.py b/deepmd/pt/utils/nlist.py index b34c43378c..a4f81a23a5 100644 --- a/deepmd/pt/utils/nlist.py +++ b/deepmd/pt/utils/nlist.py @@ -1,7 +1,5 @@ # SPDX-License-Identifier: LGPL-3.0-or-later from typing import ( - Dict, - List, Optional, Union, ) @@ -21,7 +19,7 @@ def extend_input_and_build_neighbor_list( coord, atype, rcut: float, - sel: List[int], + sel: list[int], mixed_types: bool = False, box: Optional[torch.Tensor] = None, ): @@ -55,7 +53,7 @@ def build_neighbor_list( atype: torch.Tensor, nloc: int, rcut: float, - sel: Union[int, List[int]], + sel: Union[int, list[int]], distinguish_types: bool = True, ) -> torch.Tensor: """Build neightbor list for a single frame. keeps nsel neighbors. @@ -71,7 +69,7 @@ def build_neighbor_list( number of local atoms. rcut : float cut-off radius - sel : int or List[int] + sel : int or list[int] maximal number of neighbors (of each type). 
if distinguish_types==True, nsel should be list and the length of nsel should be equal to number of @@ -137,7 +135,7 @@ def _trim_mask_distinguish_nlist( rr: torch.Tensor, nlist: torch.Tensor, rcut: float, - sel: List[int], + sel: list[int], distinguish_types: bool, ) -> torch.Tensor: """Trim the size of nlist, mask if any central atom is virtual, distinguish types if necessary.""" @@ -178,7 +176,7 @@ def build_directional_neighbor_list( coord_neig: torch.Tensor, atype_neig: torch.Tensor, rcut: float, - sel: Union[int, List[int]], + sel: Union[int, list[int]], distinguish_types: bool = True, ) -> torch.Tensor: """Build directional neighbor list. @@ -205,7 +203,7 @@ def build_directional_neighbor_list( if type < 0 the atom is treated as virtual atoms. rcut : float cut-off radius - sel : int or List[int] + sel : int or list[int] maximal number of neighbors (of each type). if distinguish_types==True, nsel should be list and the length of nsel should be equal to number of @@ -277,7 +275,7 @@ def build_directional_neighbor_list( def nlist_distinguish_types( nlist: torch.Tensor, atype: torch.Tensor, - sel: List[int], + sel: list[int], ): """Given a nlist that does not distinguish atom types, return a nlist that distinguish atom types. @@ -327,9 +325,9 @@ def get_multiple_nlist_key( def build_multiple_neighbor_list( coord: torch.Tensor, nlist: torch.Tensor, - rcuts: List[float], - nsels: List[int], -) -> Dict[str, torch.Tensor]: + rcuts: list[float], + nsels: list[int], +) -> dict[str, torch.Tensor]: """Input one neighbor list, and produce multiple neighbor lists with different cutoff radius and numbers of selection out of it. The required rcuts and nsels should be smaller or equal to the input nlist. @@ -341,14 +339,14 @@ def build_multiple_neighbor_list( nlist : torch.Tensor Neighbor list of shape [batch_size, nloc, nsel], the neighbors should be stored in an ascending order. - rcuts : List[float] + rcuts : list[float] list of cut-off radius in ascending order. - nsels : List[int] + nsels : list[int] maximal number of neighbors in ascending order. Returns ------- - nlist_dict : Dict[str, torch.Tensor] + nlist_dict : dict[str, torch.Tensor] A dict of nlists, key given by get_multiple_nlist_key(rc, nsel) value being the corresponding nlist. diff --git a/deepmd/pt/utils/stat.py b/deepmd/pt/utils/stat.py index 58e02f436d..23fb12f2a4 100644 --- a/deepmd/pt/utils/stat.py +++ b/deepmd/pt/utils/stat.py @@ -5,8 +5,6 @@ ) from typing import ( Callable, - Dict, - List, Optional, Union, ) @@ -89,7 +87,7 @@ def make_stat_input(datasets, dataloaders, nbatches): def _restore_from_file( stat_file_path: DPPath, - keys: List[str] = ["energy"], + keys: list[str] = ["energy"], ) -> Optional[dict]: if stat_file_path is None: return None, None @@ -147,8 +145,8 @@ def _post_process_stat( def _compute_model_predict( - sampled: Union[Callable[[], List[dict]], List[dict]], - keys: List[str], + sampled: Union[Callable[[], list[dict]], list[dict]], + keys: list[str], model_forward: Callable[..., torch.Tensor], ): auto_batch_size = AutoBatchSize() @@ -187,7 +185,7 @@ def model_forward_auto_batch_size(*args, **kwargs): def _make_preset_out_bias( ntypes: int, - ibias: List[Optional[np.ndarray]], + ibias: list[Optional[np.ndarray]], ) -> Optional[np.ndarray]: """Make preset out bias. 
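The `preset_bias` layout handled here (and documented in the `compute_output_stats` docstring that follows) keeps one entry per atom type, with `None` meaning the type is left to the regression. A sketch under that assumption; `assemble_preset` is an illustrative helper, not the library's `_make_preset_out_bias`:

```python
from typing import Optional

import numpy as np

# One entry per atom type; None means "fit this type from data".
preset_bias: dict[str, list[Optional[np.ndarray]]] = {
    "energy": [None, np.array([2.0])],  # type 0 free, type 1 pinned at 2.0
}


def assemble_preset(ntypes: int, ibias: list[Optional[np.ndarray]]) -> np.ndarray:
    # Hypothetical helper: pack per-type entries into an (ntypes, odim)
    # array, NaN marking the types left unset.
    odim = next(b.shape[0] for b in ibias if b is not None)
    out = np.full((ntypes, odim), np.nan)
    for i, b in enumerate(ibias):
        if b is not None:
            out[i] = b
    return out


print(assemble_preset(2, preset_bias["energy"]))  # [[nan], [2.]]
```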
@@ -237,12 +235,12 @@ def _fill_stat_with_global( def compute_output_stats( - merged: Union[Callable[[], List[dict]], List[dict]], + merged: Union[Callable[[], list[dict]], list[dict]], ntypes: int, - keys: Union[str, List[str]] = ["energy"], + keys: Union[str, list[str]] = ["energy"], stat_file_path: Optional[DPPath] = None, rcond: Optional[float] = None, - preset_bias: Optional[Dict[str, List[Optional[np.ndarray]]]] = None, + preset_bias: Optional[dict[str, list[Optional[np.ndarray]]]] = None, model_forward: Optional[Callable[..., torch.Tensor]] = None, atomic_output: Optional[FittingOutputDef] = None, ): @@ -251,11 +249,11 @@ def compute_output_stats( Parameters ---------- - merged : Union[Callable[[], List[dict]], List[dict]] - - List[dict]: A list of data samples from various data systems. + merged : Union[Callable[[], list[dict]], list[dict]] + - list[dict]: A list of data samples from various data systems. Each element, `merged[i]`, is a data dictionary containing `keys`: `torch.Tensor` originating from the `i`-th data system. - - Callable[[], List[dict]]: A lazy function that returns data samples in the above format + - Callable[[], list[dict]]: A lazy function that returns data samples in the above format only when needed. Since the sampling process can be slow and memory-intensive, the lazy function helps by only sampling once. ntypes : int @@ -264,7 +262,7 @@ def compute_output_stats( The path to the stat file. rcond : float, optional The condition number for the regression of atomic energy. - preset_bias : Dict[str, List[Optional[np.ndarray]]], optional + preset_bias : dict[str, list[Optional[np.ndarray]]], optional Specifying atomic energy contribution in vacuum. Given by key:value pairs. The value is a list specifying the bias. the elements can be None or np.ndarray of output shape. For example: [None, [2.]] means type 0 is not set, type 1 is set to [2.] @@ -401,12 +399,12 @@ def compute_output_stats( def compute_output_stats_global( - sampled: List[dict], + sampled: list[dict], ntypes: int, - keys: List[str], + keys: list[str], rcond: Optional[float] = None, - preset_bias: Optional[Dict[str, List[Optional[np.ndarray]]]] = None, - model_pred: Optional[Dict[str, np.ndarray]] = None, + preset_bias: Optional[dict[str, list[Optional[np.ndarray]]]] = None, + model_pred: Optional[dict[str, np.ndarray]] = None, atomic_output: Optional[FittingOutputDef] = None, ): """This function only handle stat computation from reduced global labels.""" @@ -526,10 +524,10 @@ def rmse(x): def compute_output_stats_atomic( - sampled: List[dict], + sampled: list[dict], ntypes: int, - keys: List[str], - model_pred: Optional[Dict[str, np.ndarray]] = None, + keys: list[str], + model_pred: Optional[dict[str, np.ndarray]] = None, ): # get label dict from sample; for each key, only picking the system with atomic labels. 
outputs = { diff --git a/deepmd/pt/utils/update_sel.py b/deepmd/pt/utils/update_sel.py index 7f42a9f91c..e8c40e2626 100644 --- a/deepmd/pt/utils/update_sel.py +++ b/deepmd/pt/utils/update_sel.py @@ -1,7 +1,4 @@ # SPDX-License-Identifier: LGPL-3.0-or-later -from typing import ( - Type, -) from deepmd.pt.utils.neighbor_stat import ( NeighborStat, @@ -13,5 +10,5 @@ class UpdateSel(BaseUpdateSel): @property - def neighbor_stat(self) -> Type[NeighborStat]: + def neighbor_stat(self) -> type[NeighborStat]: return NeighborStat diff --git a/deepmd/pt/utils/utils.py b/deepmd/pt/utils/utils.py index 9ccdbfdb5d..43b82efcc1 100644 --- a/deepmd/pt/utils/utils.py +++ b/deepmd/pt/utils/utils.py @@ -1,6 +1,5 @@ # SPDX-License-Identifier: LGPL-3.0-or-later from typing import ( - List, Optional, Union, overload, @@ -123,7 +122,7 @@ def dict_to_device(sample_dict): XSHIFT = 16 -def hashmix(value: int, hash_const: List[int]): +def hashmix(value: int, hash_const: list[int]): value ^= INIT_A hash_const[0] *= MULT_A value *= INIT_A @@ -142,7 +141,7 @@ def mix(x: int, y: int): return result -def mix_entropy(entropy_array: List[int]) -> int: +def mix_entropy(entropy_array: list[int]) -> int: # https://github.com/numpy/numpy/blob/a4cddb60489f821a1a4dffc16cd5c69755d43bdb/numpy/random/bit_generator.pyx#L341-L374 hash_const = [INIT_A] mixer = hashmix(entropy_array[0], hash_const) @@ -152,7 +151,7 @@ def mix_entropy(entropy_array: List[int]) -> int: def get_generator( - seed: Optional[Union[int, List[int]]] = None, + seed: Optional[Union[int, list[int]]] = None, ) -> Optional[torch.Generator]: if seed is not None: if isinstance(seed, list): diff --git a/deepmd/tf/cluster/__init__.py b/deepmd/tf/cluster/__init__.py index 6735ce92f4..0f8916038d 100644 --- a/deepmd/tf/cluster/__init__.py +++ b/deepmd/tf/cluster/__init__.py @@ -2,9 +2,7 @@ """Module that reads node resources, auto detects if running local or on SLURM.""" from typing import ( - List, Optional, - Tuple, ) from .local import get_resource as get_local_res @@ -12,12 +10,12 @@ __all__ = ["get_resource"] -def get_resource() -> Tuple[str, List[str], Optional[List[int]]]: +def get_resource() -> tuple[str, list[str], Optional[list[int]]]: """Get local or slurm resources: nodename, nodelist, and gpus. Returns ------- - Tuple[str, List[str], Optional[List[int]]] + tuple[str, list[str], Optional[list[int]]] nodename, nodelist, and gpus """ return get_local_res() diff --git a/deepmd/tf/cluster/local.py b/deepmd/tf/cluster/local.py index 009a182e55..a9392bd326 100644 --- a/deepmd/tf/cluster/local.py +++ b/deepmd/tf/cluster/local.py @@ -4,9 +4,7 @@ import subprocess as sp import sys from typing import ( - List, Optional, - Tuple, ) from deepmd.tf.env import ( @@ -25,7 +23,7 @@ def get_gpus(): Returns ------- - Optional[List[int]] + Optional[list[int]] List of available GPU IDs. Otherwise, None. """ if not tf.test.is_built_with_cuda() and not ( @@ -51,12 +49,12 @@ def get_gpus(): return list(range(num_gpus)) if num_gpus > 0 else None -def get_resource() -> Tuple[str, List[str], Optional[List[int]]]: +def get_resource() -> tuple[str, list[str], Optional[list[int]]]: """Get local resources: nodename, nodelist, and gpus. 
Returns ------- - Tuple[str, List[str], Optional[List[int]]] + tuple[str, list[str], Optional[list[int]]] nodename, nodelist, and gpus """ nodename, nodelist = get_host_names() diff --git a/deepmd/tf/descriptor/descriptor.py b/deepmd/tf/descriptor/descriptor.py index 2bef63fa5e..ba54ca1309 100644 --- a/deepmd/tf/descriptor/descriptor.py +++ b/deepmd/tf/descriptor/descriptor.py @@ -4,11 +4,7 @@ ) from typing import ( Any, - Dict, - List, Optional, - Set, - Tuple, ) import numpy as np @@ -111,7 +107,7 @@ def get_dim_rot_mat_1(self) -> int: """ raise NotImplementedError - def get_nlist(self) -> Tuple[tf.Tensor, tf.Tensor, List[int], List[int]]: + def get_nlist(self) -> tuple[tf.Tensor, tf.Tensor, list[int], list[int]]: """Returns neighbor information. Returns @@ -130,12 +126,12 @@ def get_nlist(self) -> Tuple[tf.Tensor, tf.Tensor, List[int], List[int]]: @abstractmethod def compute_input_stats( self, - data_coord: List[np.ndarray], - data_box: List[np.ndarray], - data_atype: List[np.ndarray], - natoms_vec: List[np.ndarray], - mesh: List[np.ndarray], - input_dict: Dict[str, List[np.ndarray]], + data_coord: list[np.ndarray], + data_box: list[np.ndarray], + data_atype: list[np.ndarray], + natoms_vec: list[np.ndarray], + mesh: list[np.ndarray], + input_dict: dict[str, list[np.ndarray]], **kwargs, ) -> None: """Compute the statisitcs (avg and std) of the training data. The input will be @@ -175,7 +171,7 @@ def build( natoms: tf.Tensor, box_: tf.Tensor, mesh: tf.Tensor, - input_dict: Dict[str, Any], + input_dict: dict[str, Any], reuse: Optional[bool] = None, suffix: str = "", ) -> tf.Tensor: @@ -275,7 +271,7 @@ def enable_mixed_precision(self, mixed_prec: Optional[dict] = None) -> None: @abstractmethod def prod_force_virial( self, atom_ener: tf.Tensor, natoms: tf.Tensor - ) -> Tuple[tf.Tensor, tf.Tensor, tf.Tensor]: + ) -> tuple[tf.Tensor, tf.Tensor, tf.Tensor]: """Compute force and virial. Parameters @@ -323,7 +319,7 @@ def init_variables( f"Descriptor {type(self).__name__} doesn't support initialization from the given variables!" ) - def get_tensor_names(self, suffix: str = "") -> Tuple[str]: + def get_tensor_names(self, suffix: str = "") -> tuple[str]: """Get names of tensors. Parameters @@ -333,7 +329,7 @@ def get_tensor_names(self, suffix: str = "") -> Tuple[str]: Returns ------- - Tuple[str] + tuple[str] Names of tensors """ raise NotImplementedError( @@ -362,9 +358,9 @@ def pass_tensors_from_frz_model( def build_type_exclude_mask( self, - exclude_types: Set[Tuple[int, int]], + exclude_types: set[tuple[int, int]], ntypes: int, - sel: List[int], + sel: list[int], ndescrpt: int, atype: tf.Tensor, shape0: tf.Tensor, @@ -391,12 +387,12 @@ def build_type_exclude_mask( Parameters ---------- - exclude_types : List[Tuple[int, int]] + exclude_types : list[tuple[int, int]] The list of excluded types, e.g. [(0, 1), (1, 0)] means the interaction between type 0 and type 1 is excluded. ntypes : int The number of types. - sel : List[int] + sel : list[int] The list of the number of selected neighbors for each type. ndescrpt : int The number of descriptors for each atom. @@ -469,9 +465,9 @@ def explicit_ntypes(self) -> bool: def update_sel( cls, train_data: DeepmdDataSystem, - type_map: Optional[List[str]], + type_map: Optional[list[str]], local_jdata: dict, - ) -> Tuple[dict, Optional[float]]: + ) -> tuple[dict, Optional[float]]: """Update the selection and perform neighbor statistics. 
Parameters @@ -535,6 +531,6 @@ def serialize(self, suffix: str = "") -> dict: raise NotImplementedError(f"Not implemented in class {self.__name__}") @property - def input_requirement(self) -> List[DataRequirementItem]: + def input_requirement(self) -> list[DataRequirementItem]: """Return data requirements needed for the model input.""" return [] diff --git a/deepmd/tf/descriptor/hybrid.py b/deepmd/tf/descriptor/hybrid.py index fe4fc2ae6a..e4458476c8 100644 --- a/deepmd/tf/descriptor/hybrid.py +++ b/deepmd/tf/descriptor/hybrid.py @@ -1,10 +1,7 @@ # SPDX-License-Identifier: LGPL-3.0-or-later from typing import ( Any, - Dict, - List, Optional, - Tuple, Union, ) @@ -41,14 +38,14 @@ class DescrptHybrid(Descriptor): Parameters ---------- - list : list : List[Union[Descriptor, Dict[str, Any]]] + list : list : list[Union[Descriptor, dict[str, Any]]] Build a descriptor from the concatenation of the list of descriptors. The descriptor can be either an object or a dictionary. """ def __init__( self, - list: List[Union[Descriptor, Dict[str, Any]]], + list: list[Union[Descriptor, dict[str, Any]]], ntypes: Optional[int] = None, spin: Optional[Spin] = None, **kwargs, @@ -93,7 +90,7 @@ def get_dim_out(self) -> int: def get_nlist( self, - ) -> Tuple[tf.Tensor, tf.Tensor, List[int], List[int]]: + ) -> tuple[tf.Tensor, tf.Tensor, list[int], list[int]]: """Get the neighbor information of the descriptor, returns the nlist of the descriptor with the largest cut-off radius. @@ -111,7 +108,7 @@ def get_nlist( maxr_idx = np.argmax([ii.get_rcut() for ii in self.descrpt_list]) return self.get_nlist_i(maxr_idx) - def get_nlist_i(self, ii: int) -> Tuple[tf.Tensor, tf.Tensor, List[int], List[int]]: + def get_nlist_i(self, ii: int) -> tuple[tf.Tensor, tf.Tensor, list[int], list[int]]: """Get the neighbor information of the ii-th descriptor. Parameters @@ -275,7 +272,7 @@ def build( def prod_force_virial( self, atom_ener: tf.Tensor, natoms: tf.Tensor - ) -> Tuple[tf.Tensor, tf.Tensor, tf.Tensor]: + ) -> tuple[tf.Tensor, tf.Tensor, tf.Tensor]: """Compute force and virial. Parameters @@ -385,7 +382,7 @@ def init_variables( for idx, ii in enumerate(self.descrpt_list): ii.init_variables(graph, graph_def, suffix=f"{suffix}_{idx}") - def get_tensor_names(self, suffix: str = "") -> Tuple[str]: + def get_tensor_names(self, suffix: str = "") -> tuple[str]: """Get names of tensors. Parameters @@ -395,7 +392,7 @@ def get_tensor_names(self, suffix: str = "") -> Tuple[str]: Returns ------- - Tuple[str] + tuple[str] Names of tensors """ tensor_names = [] @@ -429,9 +426,9 @@ def explicit_ntypes(self) -> bool: def update_sel( cls, train_data: DeepmdDataSystem, - type_map: Optional[List[str]], + type_map: Optional[list[str]], local_jdata: dict, - ) -> Tuple[dict, Optional[float]]: + ) -> tuple[dict, Optional[float]]: """Update the selection and perform neighbor statistics. 
Parameters diff --git a/deepmd/tf/descriptor/loc_frame.py b/deepmd/tf/descriptor/loc_frame.py index 4891c5a55f..74ba755b4c 100644 --- a/deepmd/tf/descriptor/loc_frame.py +++ b/deepmd/tf/descriptor/loc_frame.py @@ -1,8 +1,6 @@ # SPDX-License-Identifier: LGPL-3.0-or-later from typing import ( - List, Optional, - Tuple, ) import numpy as np @@ -60,9 +58,9 @@ class DescrptLocFrame(Descriptor): def __init__( self, rcut: float, - sel_a: List[int], - sel_r: List[int], - axis_rule: List[int], + sel_a: list[int], + sel_r: list[int], + axis_rule: list[int], **kwargs, ) -> None: """Constructor.""" @@ -142,7 +140,7 @@ def get_dim_out(self) -> int: """Returns the output dimension of this descriptor.""" return self.ndescrpt - def get_nlist(self) -> Tuple[tf.Tensor, tf.Tensor, List[int], List[int]]: + def get_nlist(self) -> tuple[tf.Tensor, tf.Tensor, list[int], list[int]]: """Returns ------- nlist @@ -320,7 +318,7 @@ def get_rot_mat(self) -> tf.Tensor: def prod_force_virial( self, atom_ener: tf.Tensor, natoms: tf.Tensor - ) -> Tuple[tf.Tensor, tf.Tensor, tf.Tensor]: + ) -> tuple[tf.Tensor, tf.Tensor, tf.Tensor]: """Compute force and virial. Parameters @@ -437,9 +435,9 @@ def init_variables( def update_sel( cls, train_data: DeepmdDataSystem, - type_map: Optional[List[str]], + type_map: Optional[list[str]], local_jdata: dict, - ) -> Tuple[dict, Optional[float]]: + ) -> tuple[dict, Optional[float]]: """Update the selection and perform neighbor statistics. Parameters diff --git a/deepmd/tf/descriptor/se.py b/deepmd/tf/descriptor/se.py index f5f54550f2..319a65f6da 100644 --- a/deepmd/tf/descriptor/se.py +++ b/deepmd/tf/descriptor/se.py @@ -1,10 +1,7 @@ # SPDX-License-Identifier: LGPL-3.0-or-later import re from typing import ( - List, Optional, - Set, - Tuple, ) from deepmd.dpmodel.utils.network import ( @@ -80,7 +77,7 @@ def _identity_tensors(self, suffix: str = "") -> None: self.rij = tf.identity(self.rij, name="o_rij" + suffix) self.nlist = tf.identity(self.nlist, name="o_nlist" + suffix) - def get_tensor_names(self, suffix: str = "") -> Tuple[str]: + def get_tensor_names(self, suffix: str = "") -> tuple[str]: """Get names of tensors. Parameters @@ -90,7 +87,7 @@ def get_tensor_names(self, suffix: str = "") -> Tuple[str]: Returns ------- - Tuple[str] + tuple[str] Names of tensors """ return ( @@ -157,9 +154,9 @@ def precision(self) -> tf.DType: def update_sel( cls, train_data: DeepmdDataSystem, - type_map: Optional[List[str]], + type_map: Optional[list[str]], local_jdata: dict, - ) -> Tuple[dict, Optional[float]]: + ) -> tuple[dict, Optional[float]]: """Update the selection and perform neighbor statistics. Parameters @@ -190,11 +187,11 @@ def serialize_network( ntypes: int, ndim: int, in_dim: int, - neuron: List[int], + neuron: list[int], activation_function: str, resnet_dt: bool, variables: dict, - excluded_types: Set[Tuple[int, int]] = set(), + excluded_types: set[tuple[int, int]] = set(), suffix: str = "", ) -> dict: """Serialize network. 
@@ -207,7 +204,7 @@ def serialize_network( The dimension of elements in_dim : int The input dimension - neuron : List[int] + neuron : list[int] The neuron list activation_function : str The activation function @@ -215,7 +212,7 @@ def serialize_network( Whether to use resnet variables : dict The input variables - excluded_types : Set[Tuple[int, int]], optional + excluded_types : set[tuple[int, int]], optional The excluded types suffix : str, optional The suffix of the scope diff --git a/deepmd/tf/descriptor/se_a.py b/deepmd/tf/descriptor/se_a.py index 721e8e71d1..d5a8ed6815 100644 --- a/deepmd/tf/descriptor/se_a.py +++ b/deepmd/tf/descriptor/se_a.py @@ -1,8 +1,6 @@ # SPDX-License-Identifier: LGPL-3.0-or-later from typing import ( - List, Optional, - Tuple, ) import numpy as np @@ -141,7 +139,7 @@ class DescrptSeA(DescrptSe): Random seed for initializing the network parameters. type_one_side Try to build N_types embedding nets. Otherwise, building N_types^2 embedding nets - exclude_types : List[List[int]] + exclude_types : list[list[int]] The excluded pairs of types which have no interaction with each other. For example, `[[0, 1]]` means no interaction between type 0 and type 1. set_davg_zero @@ -154,7 +152,7 @@ class DescrptSeA(DescrptSe): Only for the purpose of backward compatibility, retrieves the old behavior of using the random seed env_protection: float Protection parameter to prevent division by zero errors during environment matrix calculations. - type_map: List[str], Optional + type_map: list[str], Optional A list of strings. Give the name to each type of atoms. References @@ -169,21 +167,21 @@ def __init__( self, rcut: float, rcut_smth: float, - sel: List[int], - neuron: List[int] = [24, 48, 96], + sel: list[int], + neuron: list[int] = [24, 48, 96], axis_neuron: int = 8, resnet_dt: bool = False, trainable: bool = True, seed: Optional[int] = None, type_one_side: bool = True, - exclude_types: List[List[int]] = [], + exclude_types: list[list[int]] = [], set_davg_zero: bool = False, activation_function: str = "tanh", precision: str = "default", uniform_seed: bool = False, spin: Optional[Spin] = None, tebd_input_mode: str = "concat", - type_map: Optional[List[str]] = None, # to be compat with input + type_map: Optional[list[str]] = None, # to be compat with input env_protection: float = 0.0, # not implement!! **kwargs, ) -> None: @@ -327,7 +325,7 @@ def get_dim_rot_mat_1(self) -> int: """Returns the first dimension of the rotation matrix. The rotation is of shape dim_1 x 3.""" return self.filter_neuron[-1] - def get_nlist(self) -> Tuple[tf.Tensor, tf.Tensor, List[int], List[int]]: + def get_nlist(self) -> tuple[tf.Tensor, tf.Tensor, list[int], list[int]]: """Returns neighbor information. Returns @@ -696,7 +694,7 @@ def get_rot_mat(self) -> tf.Tensor: def prod_force_virial( self, atom_ener: tf.Tensor, natoms: tf.Tensor - ) -> Tuple[tf.Tensor, tf.Tensor, tf.Tensor]: + ) -> tuple[tf.Tensor, tf.Tensor, tf.Tensor]: """Compute force and virial. Parameters diff --git a/deepmd/tf/descriptor/se_a_ebd.py b/deepmd/tf/descriptor/se_a_ebd.py index c558cd285e..ae76308e69 100644 --- a/deepmd/tf/descriptor/se_a_ebd.py +++ b/deepmd/tf/descriptor/se_a_ebd.py @@ -1,6 +1,5 @@ # SPDX-License-Identifier: LGPL-3.0-or-later from typing import ( - List, Optional, ) @@ -65,7 +64,7 @@ class DescrptSeAEbd(DescrptSeA): The activation function in the embedding net. Supported options are {0} precision The precision of the embedding net parameters. 
Supported options are {1} - exclude_types : List[List[int]] + exclude_types : list[list[int]] The excluded pairs of types which have no interaction with each other. For example, `[[0, 1]]` means no interaction between type 0 and type 1. """ @@ -74,8 +73,8 @@ def __init__( self, rcut: float, rcut_smth: float, - sel: List[int], - neuron: List[int] = [24, 48, 96], + sel: list[int], + neuron: list[int] = [24, 48, 96], axis_neuron: int = 8, resnet_dt: bool = False, trainable: bool = True, @@ -87,7 +86,7 @@ def __init__( set_davg_zero: bool = False, activation_function: str = "tanh", precision: str = "default", - exclude_types: List[List[int]] = [], + exclude_types: list[list[int]] = [], **kwargs, ) -> None: """Constructor.""" @@ -600,7 +599,7 @@ def _ebd_filter( return result, qmat @property - def input_requirement(self) -> List[DataRequirementItem]: + def input_requirement(self) -> list[DataRequirementItem]: """Return data requirements needed for the model input.""" data_requirement = super().input_requirement if self.numb_aparam > 0: diff --git a/deepmd/tf/descriptor/se_a_ebd_v2.py b/deepmd/tf/descriptor/se_a_ebd_v2.py index 9afa6598d1..af43eedbbc 100644 --- a/deepmd/tf/descriptor/se_a_ebd_v2.py +++ b/deepmd/tf/descriptor/se_a_ebd_v2.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: LGPL-3.0-or-later import logging from typing import ( - List, Optional, ) @@ -31,14 +30,14 @@ def __init__( self, rcut: float, rcut_smth: float, - sel: List[int], - neuron: List[int] = [24, 48, 96], + sel: list[int], + neuron: list[int] = [24, 48, 96], axis_neuron: int = 8, resnet_dt: bool = False, trainable: bool = True, seed: Optional[int] = None, type_one_side: bool = True, - exclude_types: List[List[int]] = [], + exclude_types: list[list[int]] = [], set_davg_zero: bool = False, activation_function: str = "tanh", precision: str = "default", diff --git a/deepmd/tf/descriptor/se_a_ef.py b/deepmd/tf/descriptor/se_a_ef.py index 81f4c8955a..9f70464c56 100644 --- a/deepmd/tf/descriptor/se_a_ef.py +++ b/deepmd/tf/descriptor/se_a_ef.py @@ -1,8 +1,6 @@ # SPDX-License-Identifier: LGPL-3.0-or-later from typing import ( - List, Optional, - Tuple, ) import numpy as np @@ -57,7 +55,7 @@ class DescrptSeAEf(DescrptSe): Random seed for initializing the network parameters. type_one_side Try to build N_types embedding nets. Otherwise, building N_types^2 embedding nets - exclude_types : List[List[int]] + exclude_types : list[list[int]] The excluded pairs of types which have no interaction with each other. For example, `[[0, 1]]` means no interaction between type 0 and type 1. set_davg_zero @@ -74,14 +72,14 @@ def __init__( self, rcut: float, rcut_smth: float, - sel: List[int], - neuron: List[int] = [24, 48, 96], + sel: list[int], + neuron: list[int] = [24, 48, 96], axis_neuron: int = 8, resnet_dt: bool = False, trainable: bool = True, seed: Optional[int] = None, type_one_side: bool = True, - exclude_types: List[List[int]] = [], + exclude_types: list[list[int]] = [], set_davg_zero: bool = False, activation_function: str = "tanh", precision: str = "default", @@ -144,7 +142,7 @@ def get_rot_mat(self) -> tf.Tensor: """Get rotational matrix.""" return self.qmat - def get_nlist(self) -> Tuple[tf.Tensor, tf.Tensor, List[int], List[int]]: + def get_nlist(self) -> tuple[tf.Tensor, tf.Tensor, list[int], list[int]]: """Returns neighbor information. 
Returns @@ -267,7 +265,7 @@ def build( def prod_force_virial( self, atom_ener: tf.Tensor, natoms: tf.Tensor - ) -> Tuple[tf.Tensor, tf.Tensor, tf.Tensor]: + ) -> tuple[tf.Tensor, tf.Tensor, tf.Tensor]: """Compute force and virial. Parameters @@ -305,14 +303,14 @@ def __init__( op, rcut: float, rcut_smth: float, - sel: List[int], - neuron: List[int] = [24, 48, 96], + sel: list[int], + neuron: list[int] = [24, 48, 96], axis_neuron: int = 8, resnet_dt: bool = False, trainable: bool = True, seed: Optional[int] = None, type_one_side: bool = True, - exclude_types: List[List[int]] = [], + exclude_types: list[list[int]] = [], set_davg_zero: bool = False, activation_function: str = "tanh", precision: str = "default", @@ -586,7 +584,7 @@ def _compute_dstats_sys_smth( return sysr, sysr2, sysa, sysa2, sysn @property - def input_requirement(self) -> List[DataRequirementItem]: + def input_requirement(self) -> list[DataRequirementItem]: """Return data requirements needed for the model input.""" data_requirement = super().input_requirement data_requirement.append( diff --git a/deepmd/tf/descriptor/se_a_mask.py b/deepmd/tf/descriptor/se_a_mask.py index 316a909be1..e12f6a0fff 100644 --- a/deepmd/tf/descriptor/se_a_mask.py +++ b/deepmd/tf/descriptor/se_a_mask.py @@ -2,10 +2,7 @@ import warnings from typing import ( Any, - Dict, - List, Optional, - Tuple, ) import numpy as np @@ -100,7 +97,7 @@ class DescrptSeAMask(DescrptSeA): Random seed for initializing the network parameters. type_one_side Try to build N_types embedding nets. Otherwise, building N_types^2 embedding nets - exclude_types : List[List[int]] + exclude_types : list[list[int]] The excluded pairs of types which have no interaction with each other. For example, `[[0, 1]]` means no interaction between type 0 and type 1. activation_function @@ -120,13 +117,13 @@ class DescrptSeAMask(DescrptSeA): def __init__( self, - sel: List[int], - neuron: List[int] = [24, 48, 96], + sel: list[int], + neuron: list[int] = [24, 48, 96], axis_neuron: int = 8, resnet_dt: bool = False, trainable: bool = True, type_one_side: bool = False, - exclude_types: List[List[int]] = [], + exclude_types: list[list[int]] = [], seed: Optional[int] = None, activation_function: str = "tanh", precision: str = "default", @@ -271,7 +268,7 @@ def build( natoms: tf.Tensor, box_: tf.Tensor, mesh: tf.Tensor, - input_dict: Dict[str, Any], + input_dict: dict[str, Any], reuse: Optional[bool] = None, suffix: str = "", ) -> tf.Tensor: @@ -384,7 +381,7 @@ def prod_force_virial( self, atom_ener: tf.Tensor, natoms: tf.Tensor, - ) -> Tuple[tf.Tensor, tf.Tensor, tf.Tensor]: + ) -> tuple[tf.Tensor, tf.Tensor, tf.Tensor]: """Compute force and virial. Parameters @@ -430,9 +427,9 @@ def prod_force_virial( def update_sel( cls, train_data: DeepmdDataSystem, - type_map: Optional[List[str]], + type_map: Optional[list[str]], local_jdata: dict, - ) -> Tuple[dict, Optional[float]]: + ) -> tuple[dict, Optional[float]]: """Update the selection and perform neighbor statistics. Parameters diff --git a/deepmd/tf/descriptor/se_atten.py b/deepmd/tf/descriptor/se_atten.py index 37bcd7eea0..963e81ecf0 100644 --- a/deepmd/tf/descriptor/se_atten.py +++ b/deepmd/tf/descriptor/se_atten.py @@ -4,10 +4,7 @@ import warnings from typing import ( Any, - List, Optional, - Set, - Tuple, Union, ) @@ -125,7 +122,7 @@ class DescrptSeAtten(DescrptSeA): If 'False', type embeddings of both neighbor and central atoms are considered. If 'True', only type embeddings of neighbor atoms are considered. Default is 'False'. 
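Nearly every hunk in this patch makes the same mechanical substitution: the deprecated `typing` aliases (`List`, `Dict`, `Tuple`, `Set`) are dropped in favor of the builtin generics standardized by PEP 585, which are valid in annotations from Python 3.9 onward. `Optional` and `Union` are kept, because the `X | Y` union syntax only arrives in Python 3.10. A minimal sketch of the before/after (the function and names here are illustrative, not taken from the patch):

```python
from typing import Optional  # still required below Python 3.10

# Before (Python 3.8 era):
#   from typing import Dict, List, Tuple
#   def bucket(sel: List[int]) -> Tuple[Dict[str, int], Optional[List[int]]]: ...

# After (Python 3.9+, PEP 585 builtin generics):
def bucket(sel: list[int]) -> tuple[dict[str, int], Optional[list[int]]]:
    """Toy example: count selected neighbors by parity."""
    counts: dict[str, int] = {"even": 0, "odd": 0}
    for n in sel:
        counts["even" if n % 2 == 0 else "odd"] += 1
    return counts, None
```

The docstring types (`neuron : list[int]` and friends) are updated in the same pass so the numpydoc blocks stay consistent with the annotations.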
- exclude_types : List[List[int]] + exclude_types : list[list[int]] The excluded pairs of types which have no interaction with each other. For example, `[[0, 1]]` means no interaction between type 0 and type 1. set_davg_zero: bool @@ -162,7 +159,7 @@ class DescrptSeAtten(DescrptSeA): Setting this parameter to `True` is equivalent to setting `tebd_input_mode` to 'strip'. Setting it to `False` is equivalent to setting `tebd_input_mode` to 'concat'. The default value is `None`, which means the `tebd_input_mode` setting will be used instead. - type_map: List[str], Optional + type_map: list[str], Optional A list of strings. Give the name to each type of atoms. Raises @@ -175,16 +172,16 @@ def __init__( self, rcut: float, rcut_smth: float, - sel: Union[List[int], int], + sel: Union[list[int], int], ntypes: int, - neuron: List[int] = [25, 50, 100], + neuron: list[int] = [25, 50, 100], axis_neuron: int = 8, resnet_dt: bool = False, trainable: bool = True, seed: Optional[int] = None, type_one_side: bool = True, set_davg_zero: bool = True, - exclude_types: List[List[int]] = [], + exclude_types: list[list[int]] = [], activation_function: str = "tanh", precision: str = "default", uniform_seed: bool = False, @@ -203,7 +200,7 @@ def __init__( concat_output_tebd: bool = True, env_protection: float = 0.0, # not implement!! stripped_type_embedding: Optional[bool] = None, - type_map: Optional[List[str]] = None, # to be compat with input + type_map: Optional[list[str]] = None, # to be compat with input **kwargs, ) -> None: # Ensure compatibility with the deprecated stripped_type_embedding option. @@ -1420,9 +1417,9 @@ def compat_ln_pattern(old_key): def build_type_exclude_mask_mixed( self, - exclude_types: Set[Tuple[int, int]], + exclude_types: set[tuple[int, int]], ntypes: int, - sel: List[int], + sel: list[int], ndescrpt: int, atype: tf.Tensor, shape0: tf.Tensor, @@ -1441,12 +1438,12 @@ def build_type_exclude_mask_mixed( Parameters ---------- - exclude_types : List[Tuple[int, int]] + exclude_types : list[tuple[int, int]] The list of excluded types, e.g. [(0, 1), (1, 0)] means the interaction between type 0 and type 1 is excluded. ntypes : int The number of types. - sel : List[int] + sel : list[int] The list of the number of selected neighbors for each type. ndescrpt : int The number of descriptors for each atom. @@ -1511,9 +1508,9 @@ def explicit_ntypes(self) -> bool: def update_sel( cls, train_data: DeepmdDataSystem, - type_map: Optional[List[str]], + type_map: Optional[list[str]], local_jdata: dict, - ) -> Tuple[dict, Optional[float]]: + ) -> tuple[dict, Optional[float]]: """Update the selection and perform neighbor statistics. Parameters @@ -1646,7 +1643,7 @@ def serialize_network_strip( ntypes: int, ndim: int, in_dim: int, - neuron: List[int], + neuron: list[int], activation_function: str, resnet_dt: bool, variables: dict, @@ -1663,7 +1660,7 @@ def serialize_network_strip( The dimension of elements in_dim : int The input dimension - neuron : List[int] + neuron : list[int] The neuron list activation_function : str The activation function @@ -2055,7 +2052,7 @@ class DescrptDPA1Compat(DescrptSeAtten): attn_mask: bool (Only support False to keep consistent with other backend references.) If mask the diagonal of attention weights - exclude_types : List[List[int]] + exclude_types : list[list[int]] The excluded pairs of types which have no interaction with each other. For example, `[[0, 1]]` means no interaction between type 0 and type 1. 
env_protection: float @@ -2088,7 +2085,7 @@ class DescrptDPA1Compat(DescrptSeAtten): Whether to use electronic configuration type embedding. use_tebd_bias : bool, Optional Whether to use bias in the type embedding layer. - type_map: List[str], Optional + type_map: list[str], Optional A list of strings. Give the name to each type of atoms. spin (Only support None to keep consistent with old implementation.) @@ -2099,9 +2096,9 @@ def __init__( self, rcut: float, rcut_smth: float, - sel: Union[List[int], int], + sel: Union[list[int], int], ntypes: int, - neuron: List[int] = [25, 50, 100], + neuron: list[int] = [25, 50, 100], axis_neuron: int = 8, tebd_dim: int = 8, tebd_input_mode: str = "concat", @@ -2112,7 +2109,7 @@ def __init__( attn_layer: int = 2, attn_dotr: bool = True, attn_mask: bool = False, - exclude_types: List[List[int]] = [], + exclude_types: list[list[int]] = [], env_protection: float = 0.0, set_davg_zero: bool = False, activation_function: str = "tanh", @@ -2126,7 +2123,7 @@ def __init__( concat_output_tebd: bool = True, use_econf_tebd: bool = False, use_tebd_bias: bool = False, - type_map: Optional[List[str]] = None, + type_map: Optional[list[str]] = None, spin: Optional[Any] = None, # consistent with argcheck, not used though seed: Optional[int] = None, diff --git a/deepmd/tf/descriptor/se_atten_v2.py b/deepmd/tf/descriptor/se_atten_v2.py index a4fdf24a55..dc71f87523 100644 --- a/deepmd/tf/descriptor/se_atten_v2.py +++ b/deepmd/tf/descriptor/se_atten_v2.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: LGPL-3.0-or-later import logging from typing import ( - List, Optional, ) @@ -44,7 +43,7 @@ class DescrptSeAttenV2(DescrptSeAtten): Random seed for initializing the network parameters. type_one_side Try to build N_types embedding nets. Otherwise, building N_types^2 embedding nets - exclude_types : List[List[int]] + exclude_types : list[list[int]] The excluded pairs of types which have no interaction with each other. For example, `[[0, 1]]` means no interaction between type 0 and type 1. set_davg_zero @@ -71,14 +70,14 @@ def __init__( rcut_smth: float, sel: int, ntypes: int, - neuron: List[int] = [24, 48, 96], + neuron: list[int] = [24, 48, 96], axis_neuron: int = 8, resnet_dt: bool = False, trainable: bool = True, seed: Optional[int] = None, type_one_side: bool = True, set_davg_zero: bool = False, - exclude_types: List[List[int]] = [], + exclude_types: list[list[int]] = [], activation_function: str = "tanh", precision: str = "default", uniform_seed: bool = False, diff --git a/deepmd/tf/descriptor/se_r.py b/deepmd/tf/descriptor/se_r.py index cd99651314..8096ef7c96 100644 --- a/deepmd/tf/descriptor/se_r.py +++ b/deepmd/tf/descriptor/se_r.py @@ -1,8 +1,6 @@ # SPDX-License-Identifier: LGPL-3.0-or-later from typing import ( - List, Optional, - Tuple, ) import numpy as np @@ -76,7 +74,7 @@ class DescrptSeR(DescrptSe): Random seed for initializing the network parameters. type_one_side Try to build N_types embedding nets. Otherwise, building N_types^2 embedding nets - exclude_types : List[List[int]] + exclude_types : list[list[int]] The excluded pairs of types which have no interaction with each other. For example, `[[0, 1]]` means no interaction between type 0 and type 1. activation_function @@ -85,7 +83,7 @@ class DescrptSeR(DescrptSe): The precision of the embedding net parameters. 
Supported options are |PRECISION| uniform_seed Only for the purpose of backward compatibility, retrieves the old behavior of using the random seed - type_map: List[str], Optional + type_map: list[str], Optional A list of strings. Give the name to each type of atoms. """ @@ -93,19 +91,19 @@ def __init__( self, rcut: float, rcut_smth: float, - sel: List[int], - neuron: List[int] = [24, 48, 96], + sel: list[int], + neuron: list[int] = [24, 48, 96], resnet_dt: bool = False, trainable: bool = True, seed: Optional[int] = None, type_one_side: bool = True, - exclude_types: List[List[int]] = [], + exclude_types: list[list[int]] = [], set_davg_zero: bool = False, activation_function: str = "tanh", precision: str = "default", uniform_seed: bool = False, spin: Optional[Spin] = None, - type_map: Optional[List[str]] = None, # to be compat with input + type_map: Optional[list[str]] = None, # to be compat with input env_protection: float = 0.0, # not implement!! **kwargs, ) -> None: @@ -488,7 +486,7 @@ def build( def prod_force_virial( self, atom_ener: tf.Tensor, natoms: tf.Tensor - ) -> Tuple[tf.Tensor, tf.Tensor, tf.Tensor]: + ) -> tuple[tf.Tensor, tf.Tensor, tf.Tensor]: """Compute force and virial. Parameters diff --git a/deepmd/tf/descriptor/se_t.py b/deepmd/tf/descriptor/se_t.py index d5f5e2ab8a..f96b1ba778 100644 --- a/deepmd/tf/descriptor/se_t.py +++ b/deepmd/tf/descriptor/se_t.py @@ -1,10 +1,7 @@ # SPDX-License-Identifier: LGPL-3.0-or-later import re from typing import ( - List, Optional, - Set, - Tuple, ) import numpy as np @@ -90,7 +87,7 @@ class DescrptSeT(DescrptSe): Only for the purpose of backward compatibility, retrieves the old behavior of using the random seed env_protection: float Protection parameter to prevent division by zero errors during environment matrix calculations. - type_map: List[str], Optional + type_map: list[str], Optional A list of strings. Give the name to each type of atoms. """ @@ -98,17 +95,17 @@ def __init__( self, rcut: float, rcut_smth: float, - sel: List[int], - neuron: List[int] = [24, 48, 96], + sel: list[int], + neuron: list[int] = [24, 48, 96], resnet_dt: bool = False, trainable: bool = True, seed: Optional[int] = None, - exclude_types: List[List[int]] = [], + exclude_types: list[list[int]] = [], set_davg_zero: bool = False, activation_function: str = "tanh", precision: str = "default", uniform_seed: bool = False, - type_map: Optional[List[str]] = None, # to be compat with input + type_map: Optional[list[str]] = None, # to be compat with input env_protection: float = 0.0, # not implement!! **kwargs, ) -> None: @@ -212,7 +209,7 @@ def get_dim_out(self) -> int: """Returns the output dimension of this descriptor.""" return self.filter_neuron[-1] - def get_nlist(self) -> Tuple[tf.Tensor, tf.Tensor, List[int], List[int]]: + def get_nlist(self) -> tuple[tf.Tensor, tf.Tensor, list[int], list[int]]: """Returns neighbor information. Returns @@ -495,7 +492,7 @@ def build( def prod_force_virial( self, atom_ener: tf.Tensor, natoms: tf.Tensor - ) -> Tuple[tf.Tensor, tf.Tensor, tf.Tensor]: + ) -> tuple[tf.Tensor, tf.Tensor, tf.Tensor]: """Compute force and virial. Parameters @@ -724,11 +721,11 @@ def serialize_network( ntypes: int, ndim: int, in_dim: int, - neuron: List[int], + neuron: list[int], activation_function: str, resnet_dt: bool, variables: dict, - excluded_types: Set[Tuple[int, int]] = set(), + excluded_types: set[tuple[int, int]] = set(), suffix: str = "", ) -> dict: """Serialize network. 
@@ -741,7 +738,7 @@ def serialize_network( The dimension of elements in_dim : int The input dimension - neuron : List[int] + neuron : list[int] The neuron list activation_function : str The activation function @@ -749,7 +746,7 @@ def serialize_network( Whether to use resnet variables : dict The input variables - excluded_types : Set[Tuple[int, int]], optional + excluded_types : set[tuple[int, int]], optional The excluded types suffix : str, optional The suffix of the scope diff --git a/deepmd/tf/entrypoints/freeze.py b/deepmd/tf/entrypoints/freeze.py index 787d26e9a4..cee6615abc 100755 --- a/deepmd/tf/entrypoints/freeze.py +++ b/deepmd/tf/entrypoints/freeze.py @@ -15,7 +15,6 @@ Path, ) from typing import ( - List, Optional, Union, ) @@ -80,7 +79,7 @@ def _make_node_names( modifier_type: Optional[str] = None, out_suffix: str = "", node_names: Optional[Union[str, list]] = None, -) -> List[str]: +) -> list[str]: """Get node names based on model type. Parameters @@ -96,7 +95,7 @@ def _make_node_names( Returns ------- - List[str] + list[str] list with all node names to freeze Raises @@ -238,7 +237,7 @@ def freeze_graph( The default session. input_graph : tf.GraphDef The input graph_def stored from the checkpoint. - input_node : List[str] + input_node : list[str] The expected nodes to freeze. freeze_type : str The model type to freeze. diff --git a/deepmd/tf/entrypoints/ipi.py b/deepmd/tf/entrypoints/ipi.py index 1631a35c2e..1183375119 100644 --- a/deepmd/tf/entrypoints/ipi.py +++ b/deepmd/tf/entrypoints/ipi.py @@ -4,9 +4,6 @@ import os import subprocess import sys -from typing import ( - List, -) from deepmd.tf.lmp import ( get_op_dir, @@ -15,7 +12,7 @@ ROOT_DIR = get_op_dir() -def _program(name: str, args: List[str]): +def _program(name: str, args: list[str]): """Execute a program. Parameters diff --git a/deepmd/tf/entrypoints/main.py b/deepmd/tf/entrypoints/main.py index 493e5b7aa4..d9dff4eb4a 100644 --- a/deepmd/tf/entrypoints/main.py +++ b/deepmd/tf/entrypoints/main.py @@ -6,7 +6,6 @@ Path, ) from typing import ( - List, Optional, Union, ) @@ -39,12 +38,12 @@ __all__ = ["main", "parse_args", "get_ll", "main_parser"] -def main(args: Optional[Union[List[str], argparse.Namespace]] = None): +def main(args: Optional[Union[list[str], argparse.Namespace]] = None): """DeePMD-Kit entry point. Parameters ---------- - args : List[str] or argparse.Namespace, optional + args : list[str] or argparse.Namespace, optional list of command line arguments, used to avoid calling from the subprocess, as it is quite slow to import tensorflow; if Namespace is given, it will be used directly diff --git a/deepmd/tf/entrypoints/train.py b/deepmd/tf/entrypoints/train.py index 12a3c59d70..66622b3182 100755 --- a/deepmd/tf/entrypoints/train.py +++ b/deepmd/tf/entrypoints/train.py @@ -9,7 +9,6 @@ import time from typing import ( Any, - Dict, Optional, ) @@ -186,12 +185,12 @@ def train( _do_work(jdata, run_opt, is_compress) -def _do_work(jdata: Dict[str, Any], run_opt: RunOptions, is_compress: bool = False): +def _do_work(jdata: dict[str, Any], run_opt: RunOptions, is_compress: bool = False): """Run serial model training.
Parameters ---------- - jdata : Dict[str, Any] + jdata : dict[str, Any] arguments read from json/yaml control file run_opt : RunOptions object with run configuration diff --git a/deepmd/tf/entrypoints/transfer.py b/deepmd/tf/entrypoints/transfer.py index 7c90c77de8..b93caf3cac 100644 --- a/deepmd/tf/entrypoints/transfer.py +++ b/deepmd/tf/entrypoints/transfer.py @@ -3,10 +3,11 @@ import logging import re +from collections.abc import ( + Sequence, +) from typing import ( - Dict, Optional, - Sequence, ) import numpy as np @@ -234,7 +235,7 @@ def check_dim(raw_graph_node: tf.Tensor, old_graph_node: tf.Tensor, node_name: s ) -def load_transform_node(graph: tf.Graph) -> Dict[str, tf.Tensor]: +def load_transform_node(graph: tf.Graph) -> dict[str, tf.Tensor]: """Load nodes and their names from graph to dict. Parameters @@ -244,7 +245,7 @@ def load_transform_node(graph: tf.Graph) -> Dict[str, tf.Tensor]: Returns ------- - Dict[str, tf.Tensor] + dict[str, tf.Tensor] mapping on graph node names and corresponding tensors """ transform_node_pattern = re.compile(TRANSFER_PATTERN) diff --git a/deepmd/tf/env.py b/deepmd/tf/env.py index 03f36fb675..5a66498dba 100644 --- a/deepmd/tf/env.py +++ b/deepmd/tf/env.py @@ -2,6 +2,7 @@ """Module that sets tensorflow working environment and exports important constants.""" import ctypes +import logging import os import platform from importlib import ( @@ -75,17 +76,27 @@ def dlopen_library(module: str, filename: str): dlopen_library("nvidia.cusparse.lib", "libcusparse.so*") dlopen_library("nvidia.cudnn.lib", "libcudnn.so*") + +FILTER_MSGS = [ + "is deprecated and will be removed in a future version.", + "disable_mixed_precision_graph_rewrite() called when mixed precision is already disabled.", +] + + +class TFWarningFilter(logging.Filter): + def filter(self, record): + return not any(msg in record.getMessage().strip() for msg in FILTER_MSGS) + + # keras 3 is incompatible with tf.compat.v1 # https://keras.io/getting_started/#tensorflow--keras-2-backwards-compatibility # 2024/04/24: deepmd.tf doesn't import tf.keras any more # import tensorflow v1 compatibility -try: - import tensorflow.compat.v1 as tf - tf.disable_v2_behavior() -except ImportError: - import tensorflow as tf +import tensorflow.compat.v1 as tf +tf.get_logger().addFilter(TFWarningFilter()) +tf.disable_v2_behavior() try: import tensorflow.compat.v2 as tfv2 except ImportError: diff --git a/deepmd/tf/fit/dipole.py b/deepmd/tf/fit/dipole.py index fd37b63720..0e5b860fa2 100644 --- a/deepmd/tf/fit/dipole.py +++ b/deepmd/tf/fit/dipole.py @@ -1,6 +1,5 @@ # SPDX-License-Identifier: LGPL-3.0-or-later from typing import ( - List, Optional, ) @@ -47,12 +46,12 @@ class DipoleFittingSeA(Fitting): The dimension of the descrptor :math:`\mathcal{D}` embedding_width The rotation matrix dimension of the descrptor :math:`\mathcal{D}` - neuron : List[int] + neuron : list[int] Number of neurons in each hidden layer of the fitting net resnet_dt : bool Time-step `dt` in the resnet construction: y = x + dt * \phi (Wx + b) - sel_type : List[int] + sel_type : list[int] The atom types selected to have an atomic dipole prediction. If is None, all atoms are selected. seed : int Random seed for initializing the network parameters. @@ -65,7 +64,7 @@ class DipoleFittingSeA(Fitting): mixed_types : bool If true, use a uniform fitting net for all atom types, otherwise use different fitting nets for different atom types. - type_map: List[str], Optional + type_map: list[str], Optional A list of strings. Give the name to each type of atoms.
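The env.py hunk above is a functional change, not just a typing cleanup: it attaches a `logging.Filter` to TensorFlow's logger so that two known-noisy messages are dropped before they reach the user. A filter whose `filter()` returns `False` suppresses the record. The mechanism is plain standard-library logging, as in this self-contained sketch (logger name and messages are placeholders):

```python
import logging

NOISY_FRAGMENTS = ["is deprecated and will be removed"]

class DropNoise(logging.Filter):
    def filter(self, record: logging.LogRecord) -> bool:
        # False -> the record is discarded; True -> it is emitted as usual.
        return not any(frag in record.getMessage() for frag in NOISY_FRAGMENTS)

logging.basicConfig(level=logging.INFO)
log = logging.getLogger("demo")
log.addFilter(DropNoise())

log.warning("tf.colocate_with is deprecated and will be removed")  # suppressed
log.warning("checkpoint written")                                  # printed
```

Note also the transfer.py hunk: `Sequence` now comes from `collections.abc`, since the `typing` re-exports of the ABCs are deprecated alongside the generic aliases.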
""" @@ -74,15 +73,15 @@ def __init__( ntypes: int, dim_descrpt: int, embedding_width: int, - neuron: List[int] = [120, 120, 120], + neuron: list[int] = [120, 120, 120], resnet_dt: bool = True, - sel_type: Optional[List[int]] = None, + sel_type: Optional[list[int]] = None, seed: Optional[int] = None, activation_function: str = "tanh", precision: str = "default", uniform_seed: bool = False, mixed_types: bool = False, - type_map: Optional[List[str]] = None, # to be compat with input + type_map: Optional[list[str]] = None, # to be compat with input **kwargs, ) -> None: """Constructor.""" diff --git a/deepmd/tf/fit/dos.py b/deepmd/tf/fit/dos.py index 382d11f45e..ebc347c2fd 100644 --- a/deepmd/tf/fit/dos.py +++ b/deepmd/tf/fit/dos.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: LGPL-3.0-or-later import logging from typing import ( - List, Optional, ) @@ -100,7 +99,7 @@ class DOSFitting(Fitting): mixed_types : bool If true, use a uniform fitting net for all atom types, otherwise use different fitting nets for different atom types. - type_map: List[str], Optional + type_map: list[str], Optional A list of strings. Give the name to each type of atoms. """ @@ -108,21 +107,21 @@ def __init__( self, ntypes: int, dim_descrpt: int, - neuron: List[int] = [120, 120, 120], + neuron: list[int] = [120, 120, 120], resnet_dt: bool = True, numb_fparam: int = 0, numb_aparam: int = 0, numb_dos: int = 300, rcond: Optional[float] = None, - trainable: Optional[List[bool]] = None, + trainable: Optional[list[bool]] = None, seed: Optional[int] = None, activation_function: str = "tanh", precision: str = "default", uniform_seed: bool = False, - layer_name: Optional[List[Optional[str]]] = None, + layer_name: Optional[list[Optional[str]]] = None, use_aparam_as_mask: bool = False, mixed_types: bool = False, - type_map: Optional[List[str]] = None, # to be compat with input + type_map: Optional[list[str]] = None, # to be compat with input **kwargs, ) -> None: """Constructor.""" @@ -738,7 +737,7 @@ def serialize(self, suffix: str = "") -> dict: return data @property - def input_requirement(self) -> List[DataRequirementItem]: + def input_requirement(self) -> list[DataRequirementItem]: """Return data requirements needed for the model input.""" data_requirement = [] if self.numb_fparam > 0: diff --git a/deepmd/tf/fit/ener.py b/deepmd/tf/fit/ener.py index c2aef0610a..b01574cf87 100644 --- a/deepmd/tf/fit/ener.py +++ b/deepmd/tf/fit/ener.py @@ -2,7 +2,6 @@ import logging from typing import ( TYPE_CHECKING, - List, Optional, ) @@ -149,7 +148,7 @@ class EnerFitting(Fitting): mixed_types : bool If true, use a uniform fitting net for all atom types, otherwise use different fitting nets for different atom types. - type_map: List[str], Optional + type_map: list[str], Optional A list of strings. Give the name to each type of atoms. 
""" @@ -157,23 +156,23 @@ def __init__( self, ntypes: int, dim_descrpt: int, - neuron: List[int] = [120, 120, 120], + neuron: list[int] = [120, 120, 120], resnet_dt: bool = True, numb_fparam: int = 0, numb_aparam: int = 0, rcond: Optional[float] = None, tot_ener_zero: bool = False, - trainable: Optional[List[bool]] = None, + trainable: Optional[list[bool]] = None, seed: Optional[int] = None, - atom_ener: List[float] = [], + atom_ener: list[float] = [], activation_function: str = "tanh", precision: str = "default", uniform_seed: bool = False, - layer_name: Optional[List[Optional[str]]] = None, + layer_name: Optional[list[Optional[str]]] = None, use_aparam_as_mask: bool = False, spin: Optional[Spin] = None, mixed_types: bool = False, - type_map: Optional[List[str]] = None, # to be compat with input + type_map: Optional[list[str]] = None, # to be compat with input **kwargs, ) -> None: """Constructor.""" @@ -942,7 +941,7 @@ def serialize(self, suffix: str = "") -> dict: return data @property - def input_requirement(self) -> List[DataRequirementItem]: + def input_requirement(self) -> list[DataRequirementItem]: """Return data requirements needed for the model input.""" data_requirement = [] if self.numb_fparam > 0: @@ -963,8 +962,8 @@ def input_requirement(self) -> List[DataRequirementItem]: def change_energy_bias_lower( data: DeepmdDataSystem, dp: DeepEval, - origin_type_map: List[str], - full_type_map: List[str], + origin_type_map: list[str], + full_type_map: list[str], bias_atom_e: np.ndarray, bias_adjust_mode="change-by-statistic", ntest=10, diff --git a/deepmd/tf/fit/fitting.py b/deepmd/tf/fit/fitting.py index 9190261187..f159de1628 100644 --- a/deepmd/tf/fit/fitting.py +++ b/deepmd/tf/fit/fitting.py @@ -4,7 +4,6 @@ abstractmethod, ) from typing import ( - List, Optional, ) @@ -131,7 +130,7 @@ def serialize_network( ntypes: int, ndim: int, in_dim: int, - neuron: List[int], + neuron: list[int], activation_function: str, resnet_dt: bool, variables: dict, @@ -148,7 +147,7 @@ def serialize_network( The dimension of elements in_dim : int The input dimension - neuron : List[int] + neuron : list[int] The neuron list activation_function : str The activation function @@ -257,6 +256,6 @@ def deserialize_network(cls, data: dict, suffix: str = "") -> dict: return fitting_net_variables @property - def input_requirement(self) -> List[DataRequirementItem]: + def input_requirement(self) -> list[DataRequirementItem]: """Return data requirements needed for the model input.""" return [] diff --git a/deepmd/tf/fit/polar.py b/deepmd/tf/fit/polar.py index 14902a4d96..cc79e3402a 100644 --- a/deepmd/tf/fit/polar.py +++ b/deepmd/tf/fit/polar.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: LGPL-3.0-or-later import warnings from typing import ( - List, Optional, ) @@ -52,18 +51,18 @@ class PolarFittingSeA(Fitting): The dimension of the descrptor :math:`\mathcal{D}` embedding_width The rotation matrix dimension of the descrptor :math:`\mathcal{D}` - neuron : List[int] + neuron : list[int] Number of neurons in each hidden layer of the fitting net resnet_dt : bool Time-step `dt` in the resnet construction: y = x + dt * \phi (Wx + b) - sel_type : List[int] + sel_type : list[int] The atom types selected to have an atomic polarizability prediction. If is None, all atoms are selected. fit_diag : bool Fit the diagonal part of the rotational invariant polarizability matrix, which will be converted to normal polarizability matrix by contracting with the rotation matrix. 
- scale : List[float] + scale : list[float] The output of the fitting net (polarizability matrix) for type i atom will be scaled by scale[i] - diag_shift : List[float] + diag_shift : list[float] The diagonal part of the polarizability matrix of type i will be shifted by diag_shift[i]. The shift operation is carried out after scale. seed : int Random seed for initializing the network parameters. @@ -76,7 +75,7 @@ class PolarFittingSeA(Fitting): mixed_types : bool If true, use a uniform fitting net for all atom types, otherwise use different fitting nets for different atom types. - type_map: List[str], Optional + type_map: list[str], Optional A list of strings. Give the name to each type of atoms. """ @@ -85,19 +84,19 @@ def __init__( ntypes: int, dim_descrpt: int, embedding_width: int, - neuron: List[int] = [120, 120, 120], + neuron: list[int] = [120, 120, 120], resnet_dt: bool = True, - sel_type: Optional[List[int]] = None, + sel_type: Optional[list[int]] = None, fit_diag: bool = True, - scale: Optional[List[float]] = None, + scale: Optional[list[float]] = None, shift_diag: bool = True, # YWolfeee: will support the user to decide whether to use this function - # diag_shift : List[float] = None, YWolfeee: will not support the user to assign a shift + # diag_shift : list[float] = None, YWolfeee: will not support the user to assign a shift seed: Optional[int] = None, activation_function: str = "tanh", precision: str = "default", uniform_seed: bool = False, mixed_types: bool = False, - type_map: Optional[List[str]] = None, # to be compat with input + type_map: Optional[list[str]] = None, # to be compat with input **kwargs, ) -> None: """Constructor.""" @@ -153,7 +152,7 @@ def __init__( self.mixed_types = mixed_types self.type_map = type_map - def get_sel_type(self) -> List[int]: + def get_sel_type(self) -> list[int]: """Get selected atom types.""" return self.sel_type @@ -620,18 +619,18 @@ class GlobalPolarFittingSeA: ---------- descrpt : tf.Tensor The descrptor - neuron : List[int] + neuron : list[int] Number of neurons in each hidden layer of the fitting net resnet_dt : bool Time-step `dt` in the resnet construction: y = x + dt * \phi (Wx + b) - sel_type : List[int] + sel_type : list[int] The atom types selected to have an atomic polarizability prediction fit_diag : bool Fit the diagonal part of the rotational invariant polarizability matrix, which will be converted to normal polarizability matrix by contracting with the rotation matrix. - scale : List[float] + scale : list[float] The output of the fitting net (polarizability matrix) for type i atom will be scaled by scale[i] - diag_shift : List[float] + diag_shift : list[float] The diagonal part of the polarizability matrix of type i will be shifted by diag_shift[i]. The shift operation is carried out after scale. seed : int Random seed for initializing the network parameters. 
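Many constructors in these fitting and descriptor classes keep list-valued defaults such as `neuron: list[int] = [120, 120, 120]` and `exclude_types: list[list[int]] = []`. Python evaluates a default expression once, at function definition, so the same list object is shared by every call that omits the argument; that is harmless only as long as the default is never mutated. The usual defensive idiom is a `None` sentinel, sketched below (illustrative only, not a change made by this patch):

```python
from typing import Optional

def risky(neuron: list[int] = []) -> list[int]:
    # One shared list, created when the function is defined.
    neuron.append(120)
    return neuron

def safe(neuron: Optional[list[int]] = None) -> list[int]:
    # A fresh list per call; callers can never see each other's state.
    if neuron is None:
        neuron = [120, 120, 120]
    return list(neuron)

print(risky())  # [120]
print(risky())  # [120, 120] -- the default carried state across calls
print(safe())   # [120, 120, 120], every time
```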
@@ -644,12 +643,12 @@ class GlobalPolarFittingSeA: def __init__( self, descrpt: tf.Tensor, - neuron: List[int] = [120, 120, 120], + neuron: list[int] = [120, 120, 120], resnet_dt: bool = True, - sel_type: Optional[List[int]] = None, + sel_type: Optional[list[int]] = None, fit_diag: bool = True, - scale: Optional[List[float]] = None, - diag_shift: Optional[List[float]] = None, + scale: Optional[list[float]] = None, + diag_shift: Optional[list[float]] = None, seed: Optional[int] = None, activation_function: str = "tanh", precision: str = "default", diff --git a/deepmd/tf/infer/data_modifier.py b/deepmd/tf/infer/data_modifier.py index 08966c3498..ddb1af68d7 100644 --- a/deepmd/tf/infer/data_modifier.py +++ b/deepmd/tf/infer/data_modifier.py @@ -1,9 +1,5 @@ # SPDX-License-Identifier: LGPL-3.0-or-later import os -from typing import ( - List, - Tuple, -) import numpy as np @@ -47,8 +43,8 @@ class DipoleChargeModifier(DeepDipole): def __init__( self, model_name: str, - model_charge_map: List[float], - sys_charge_map: List[float], + model_charge_map: list[float], + sys_charge_map: list[float], ewald_h: float = 1, ewald_beta: float = 1, ) -> None: @@ -219,7 +215,7 @@ def eval( box: np.ndarray, atype: np.ndarray, eval_fv: bool = True, - ) -> Tuple[np.ndarray, np.ndarray, np.ndarray]: + ) -> tuple[np.ndarray, np.ndarray, np.ndarray]: """Evaluate the modification. Parameters diff --git a/deepmd/tf/infer/deep_eval.py b/deepmd/tf/infer/deep_eval.py index 0f317bd21f..56df7f782f 100644 --- a/deepmd/tf/infer/deep_eval.py +++ b/deepmd/tf/infer/deep_eval.py @@ -1,17 +1,13 @@ # SPDX-License-Identifier: LGPL-3.0-or-later import json from functools import ( - lru_cache, + cached_property, ) from typing import ( TYPE_CHECKING, Any, Callable, - Dict, - List, Optional, - Tuple, - Type, Union, ) @@ -267,9 +263,8 @@ def _init_attr(self): else: self.modifier_type = None - @property - @lru_cache(maxsize=None) - def model_type(self) -> Type["DeepEvalWrapper"]: + @cached_property + def model_type(self) -> type["DeepEvalWrapper"]: """Get type of model. :type:str @@ -292,8 +287,7 @@ def model_type(self) -> Type["DeepEvalWrapper"]: else: raise RuntimeError(f"unknown model type {model_type}") - @property - @lru_cache(maxsize=None) + @cached_property def model_version(self) -> str: """Get version of model. @@ -311,8 +305,7 @@ def model_version(self) -> str: [mt] = run_sess(self.sess, [t_mt], feed_dict={}) return mt.decode("utf-8") - @property - @lru_cache(maxsize=None) + @cached_property def sess(self) -> tf.Session: """Get TF session.""" # start a tf session associated to the graph @@ -398,7 +391,7 @@ def _load_graph( def sort_input( coord: np.ndarray, atom_type: np.ndarray, - sel_atoms: Optional[List[int]] = None, + sel_atoms: Optional[list[int]] = None, ): """Sort atoms in the system according their types. @@ -451,7 +444,7 @@ def sort_input( return coord, atom_type, idx_map, atom_type, idx_map @staticmethod - def reverse_map(vec: np.ndarray, imap: List[int]) -> np.ndarray: + def reverse_map(vec: np.ndarray, imap: list[int]) -> np.ndarray: """Reverse mapping of a vector according to the index map. 
Parameters @@ -635,7 +628,7 @@ def get_rcut(self) -> float: """Get the cut-off radius of this model.""" return self.rcut - def get_type_map(self) -> List[str]: + def get_type_map(self) -> list[str]: """Get the type map (element name of the atom types) of this model.""" return self.tmap @@ -687,8 +680,8 @@ def eval_func(*args, **kwargs): def _get_natoms_and_nframes( self, coords: np.ndarray, - atom_types: Union[List[int], np.ndarray], - ) -> Tuple[int, int]: + atom_types: Union[list[int], np.ndarray], + ) -> tuple[int, int]: natoms = len(atom_types[0]) if natoms == 0: assert coords.size == 0 @@ -707,7 +700,7 @@ def eval( aparam: Optional[np.ndarray] = None, efield: Optional[np.ndarray] = None, **kwargs: Any, - ) -> Dict[str, np.ndarray]: + ) -> dict[str, np.ndarray]: """Evaluate the energy, force and virial by using this DP. Parameters @@ -1196,8 +1189,7 @@ def __init__( self.neighbor_list = neighbor_list - @property - @lru_cache(maxsize=None) + @cached_property def model_type(self) -> str: """Get type of model. @@ -1207,8 +1199,7 @@ def model_type(self) -> str: [mt] = run_sess(self.sess, [t_mt], feed_dict={}) return mt.decode("utf-8") - @property - @lru_cache(maxsize=None) + @cached_property def model_version(self) -> str: """Get version of model. @@ -1226,8 +1217,7 @@ def model_version(self) -> str: [mt] = run_sess(self.sess, [t_mt], feed_dict={}) return mt.decode("utf-8") - @property - @lru_cache(maxsize=None) + @cached_property def sess(self) -> tf.Session: """Get TF session.""" # start a tf session associated to the graph @@ -1319,7 +1309,7 @@ def _load_graph( def sort_input( coord: np.ndarray, atom_type: np.ndarray, - sel_atoms: Optional[List[int]] = None, + sel_atoms: Optional[list[int]] = None, mixed_type: bool = False, ): """Sort atoms in the system according their types. @@ -1382,7 +1372,7 @@ def sort_input( return coord, atom_type, idx_map @staticmethod - def reverse_map(vec: np.ndarray, imap: List[int]) -> np.ndarray: + def reverse_map(vec: np.ndarray, imap: list[int]) -> np.ndarray: """Reverse mapping of a vector according to the index map. Parameters diff --git a/deepmd/tf/infer/deep_tensor.py b/deepmd/tf/infer/deep_tensor.py index b0f2f244e1..a20bbfe513 100644 --- a/deepmd/tf/infer/deep_tensor.py +++ b/deepmd/tf/infer/deep_tensor.py @@ -2,10 +2,7 @@ from typing import ( TYPE_CHECKING, ClassVar, - Dict, - List, Optional, - Tuple, ) import numpy as np @@ -41,7 +38,7 @@ class DeepTensor(DeepEval): The neighbor list object. If None, then build the native neighbor list. 
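In deep_eval.py the stacked `@property` / `@lru_cache(maxsize=None)` pairs become `functools.cached_property` (available since Python 3.8). For a compute-once, read-only attribute the visible behavior is the same, but the storage differs: `cached_property` writes the value into the instance's own `__dict__`, while `lru_cache` on a method keeps a cache keyed by `self` that can keep instances alive longer than intended. A minimal sketch, assuming an expensive one-time setup (class and names hypothetical):

```python
from functools import cached_property

class Evaluator:
    def __init__(self, graph_path: str) -> None:
        self.graph_path = graph_path

    @cached_property
    def sess(self) -> str:
        # Evaluated once per instance on first access, then served
        # from self.__dict__ without re-running this body.
        print(f"creating session for {self.graph_path}")
        return f"<session:{self.graph_path}>"

ev = Evaluator("frozen_model.pb")
ev.sess  # prints "creating session ..."
ev.sess  # silent: value comes from the per-instance cache
```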
""" - tensors: ClassVar[Dict[str, str]] = { + tensors: ClassVar[dict[str, str]] = { # descriptor attrs "t_ntypes": "descrpt_attr/ntypes:0", "t_rcut": "descrpt_attr/rcut:0", @@ -127,11 +124,11 @@ def get_rcut(self) -> float: """Get the cut-off radius of this model.""" return self.rcut - def get_type_map(self) -> List[str]: + def get_type_map(self) -> list[str]: """Get the type map (element name of the atom types) of this model.""" return self.tmap - def get_sel_type(self) -> List[int]: + def get_sel_type(self) -> list[int]: """Get the selected atom types of this model.""" return self.tselt @@ -147,7 +144,7 @@ def eval( self, coords: np.ndarray, cells: Optional[np.ndarray], - atom_types: List[int], + atom_types: list[int], atomic: bool = True, fparam: Optional[np.ndarray] = None, aparam: Optional[np.ndarray] = None, @@ -277,13 +274,13 @@ def eval_full( self, coords: np.ndarray, cells: Optional[np.ndarray], - atom_types: List[int], + atom_types: list[int], atomic: bool = False, fparam: Optional[np.array] = None, aparam: Optional[np.array] = None, efield: Optional[np.array] = None, mixed_type: bool = False, - ) -> Tuple[np.ndarray, ...]: + ) -> tuple[np.ndarray, ...]: """Evaluate the model with interface similar to the energy model. Will return global tensor, component-wise force and virial and optionally atomic tensor and atomic virial. diff --git a/deepmd/tf/infer/ewald_recp.py b/deepmd/tf/infer/ewald_recp.py index 110188c34f..f4b7d86588 100644 --- a/deepmd/tf/infer/ewald_recp.py +++ b/deepmd/tf/infer/ewald_recp.py @@ -1,7 +1,4 @@ # SPDX-License-Identifier: LGPL-3.0-or-later -from typing import ( - Tuple, -) import numpy as np @@ -54,7 +51,7 @@ def __init__(self, hh, beta): def eval( self, coord: np.ndarray, charge: np.ndarray, box: np.ndarray - ) -> Tuple[np.ndarray, np.ndarray, np.ndarray]: + ) -> tuple[np.ndarray, np.ndarray, np.ndarray]: """Evaluate. Parameters diff --git a/deepmd/tf/lmp.py b/deepmd/tf/lmp.py index b2e47308ed..f3679847fc 100644 --- a/deepmd/tf/lmp.py +++ b/deepmd/tf/lmp.py @@ -10,7 +10,6 @@ Path, ) from typing import ( - List, Optional, ) @@ -32,12 +31,12 @@ find_libpython = None -def get_env(paths: List[Optional[str]]) -> str: +def get_env(paths: list[Optional[str]]) -> str: """Get the environment variable from given paths.""" return ":".join(p for p in paths if p is not None) -def get_library_path(module: str, filename: str) -> List[str]: +def get_library_path(module: str, filename: str) -> list[str]: """Get library path from a module. 
Parameters diff --git a/deepmd/tf/loss/dos.py b/deepmd/tf/loss/dos.py index 385d2484a8..0b8efe26e0 100644 --- a/deepmd/tf/loss/dos.py +++ b/deepmd/tf/loss/dos.py @@ -1,7 +1,4 @@ # SPDX-License-Identifier: LGPL-3.0-or-later -from typing import ( - List, -) import numpy as np @@ -211,7 +208,7 @@ def eval(self, sess, feed_dict, natoms): return results @property - def label_requirement(self) -> List[DataRequirementItem]: + def label_requirement(self) -> list[DataRequirementItem]: """Return data label requirements needed for this loss calculation.""" data_requirements = [] # data required diff --git a/deepmd/tf/loss/ener.py b/deepmd/tf/loss/ener.py index 7ecb185818..337046836b 100644 --- a/deepmd/tf/loss/ener.py +++ b/deepmd/tf/loss/ener.py @@ -1,6 +1,5 @@ # SPDX-License-Identifier: LGPL-3.0-or-later from typing import ( - List, Optional, ) @@ -356,7 +355,7 @@ def eval(self, sess, feed_dict, natoms): return results @property - def label_requirement(self) -> List[DataRequirementItem]: + def label_requirement(self) -> list[DataRequirementItem]: """Return data label requirements needed for this loss calculation.""" data_requirements = [] # data required @@ -726,7 +725,7 @@ def print_on_training( return print_str @property - def label_requirement(self) -> List[DataRequirementItem]: + def label_requirement(self) -> list[DataRequirementItem]: """Return data label requirements needed for this loss calculation.""" data_requirements = [] # data required @@ -872,7 +871,7 @@ def eval(self, sess, feed_dict, natoms): return results @property - def label_requirement(self) -> List[DataRequirementItem]: + def label_requirement(self) -> list[DataRequirementItem]: """Return data label requirements needed for this loss calculation.""" data_requirements = [] # data required diff --git a/deepmd/tf/loss/loss.py b/deepmd/tf/loss/loss.py index ca90c2eb64..351da7b748 100644 --- a/deepmd/tf/loss/loss.py +++ b/deepmd/tf/loss/loss.py @@ -3,11 +3,6 @@ ABCMeta, abstractmethod, ) -from typing import ( - Dict, - List, - Tuple, -) import numpy as np @@ -27,10 +22,10 @@ def build( self, learning_rate: tf.Tensor, natoms: tf.Tensor, - model_dict: Dict[str, tf.Tensor], - label_dict: Dict[str, tf.Tensor], + model_dict: dict[str, tf.Tensor], + label_dict: dict[str, tf.Tensor], suffix: str, - ) -> Tuple[tf.Tensor, Dict[str, tf.Tensor]]: + ) -> tuple[tf.Tensor, dict[str, tf.Tensor]]: """Build the loss function graph. Parameters @@ -58,7 +53,7 @@ def build( def eval( self, sess: tf.Session, - feed_dict: Dict[tf.placeholder, tf.Tensor], + feed_dict: dict[tf.placeholder, tf.Tensor], natoms: tf.Tensor, ) -> dict: """Eval the loss function. 
@@ -98,5 +93,5 @@ def display_if_exist(loss: tf.Tensor, find_property: float) -> tf.Tensor: @property @abstractmethod - def label_requirement(self) -> List[DataRequirementItem]: + def label_requirement(self) -> list[DataRequirementItem]: """Return data label requirements needed for this loss calculation.""" diff --git a/deepmd/tf/loss/tensor.py b/deepmd/tf/loss/tensor.py index 4a70ae2a96..a5bcbbe025 100644 --- a/deepmd/tf/loss/tensor.py +++ b/deepmd/tf/loss/tensor.py @@ -1,7 +1,4 @@ # SPDX-License-Identifier: LGPL-3.0-or-later -from typing import ( - List, -) import numpy as np @@ -142,7 +139,7 @@ def eval(self, sess, feed_dict, natoms): return results @property - def label_requirement(self) -> List[DataRequirementItem]: + def label_requirement(self) -> list[DataRequirementItem]: """Return data label requirements needed for this loss calculation.""" data_requirements = [] # data required diff --git a/deepmd/tf/model/dos.py b/deepmd/tf/model/dos.py index 61809eff30..7ab068da63 100644 --- a/deepmd/tf/model/dos.py +++ b/deepmd/tf/model/dos.py @@ -1,6 +1,5 @@ # SPDX-License-Identifier: LGPL-3.0-or-later from typing import ( - List, Optional, Union, ) @@ -51,7 +50,7 @@ def __init__( descriptor: dict, fitting_net: dict, type_embedding: Optional[Union[dict, TypeEmbedNet]] = None, - type_map: Optional[List[str]] = None, + type_map: Optional[list[str]] = None, data_stat_nbatch: int = 10, data_stat_protect: float = 1e-2, **kwargs, diff --git a/deepmd/tf/model/ener.py b/deepmd/tf/model/ener.py index 66aaff8189..b21c920d9c 100644 --- a/deepmd/tf/model/ener.py +++ b/deepmd/tf/model/ener.py @@ -1,6 +1,5 @@ # SPDX-License-Identifier: LGPL-3.0-or-later from typing import ( - List, Optional, Union, ) @@ -77,7 +76,7 @@ def __init__( descriptor: dict, fitting_net: dict, type_embedding: Optional[Union[dict, TypeEmbedNet]] = None, - type_map: Optional[List[str]] = None, + type_map: Optional[list[str]] = None, data_stat_nbatch: int = 10, data_stat_protect: float = 1e-2, use_srtab: Optional[str] = None, diff --git a/deepmd/tf/model/frozen.py b/deepmd/tf/model/frozen.py index 3e296c00f2..05700dc64e 100644 --- a/deepmd/tf/model/frozen.py +++ b/deepmd/tf/model/frozen.py @@ -6,9 +6,7 @@ Enum, ) from typing import ( - List, Optional, - Tuple, Union, ) @@ -244,9 +242,9 @@ def get_type_map(self) -> list: def update_sel( cls, train_data: DeepmdDataSystem, - type_map: Optional[List[str]], + type_map: Optional[list[str]], local_jdata: dict, - ) -> Tuple[dict, Optional[float]]: + ) -> tuple[dict, Optional[float]]: """Update the selection and perform neighbor statistics. Parameters @@ -285,7 +283,7 @@ def deserialize(cls, data: dict, suffix: str = ""): raise RuntimeError("Should not touch here.") @property - def input_requirement(self) -> List[DataRequirementItem]: + def input_requirement(self) -> list[DataRequirementItem]: """Return data requirements needed for the model input.""" data_requirement = [] numb_fparam = self.model.get_dim_fparam() diff --git a/deepmd/tf/model/linear.py b/deepmd/tf/model/linear.py index 1bd1644e54..4c75c2a1d5 100644 --- a/deepmd/tf/model/linear.py +++ b/deepmd/tf/model/linear.py @@ -8,9 +8,7 @@ reduce, ) from typing import ( - List, Optional, - Tuple, Union, ) @@ -50,7 +48,7 @@ class LinearModel(Model): If "sum", the weights are set to be 1. 
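The linear.py docstring above describes the combination rule: a `LinearModel` wraps a list of sub-models and mixes their outputs with per-model weights, where the string `"sum"` stands for a weight of 1 for every sub-model. The arithmetic reduces to the sketch below (the callables stand in for real models; this illustrates the rule, not the project's implementation):

```python
from typing import Callable, Union

def combine(
    models: list[Callable[[float], float]],
    weights: Union[list[float], str],
    x: float,
) -> float:
    # "sum" is shorthand for all-ones weights.
    w = [1.0] * len(models) if weights == "sum" else list(weights)
    return sum(wi * m(x) for wi, m in zip(w, models))

energy = combine([lambda x: x, lambda x: x * x], "sum", 3.0)
print(energy)  # 12.0 = 1*3 + 1*9
```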
""" - def __init__(self, models: List[dict], weights: List[float], **kwargs): + def __init__(self, models: list[dict], weights: list[float], **kwargs): super().__init__(**kwargs) self.models = [Model(**model) for model in models] if isinstance(weights, list): @@ -140,9 +138,9 @@ def get_type_map(self) -> list: def update_sel( cls, train_data: DeepmdDataSystem, - type_map: Optional[List[str]], + type_map: Optional[list[str]], local_jdata: dict, - ) -> Tuple[dict, Optional[float]]: + ) -> tuple[dict, Optional[float]]: """Update the selection and perform neighbor statistics. Parameters @@ -175,7 +173,7 @@ def update_sel( return local_jdata_cpy, min_nbor_dist @property - def input_requirement(self) -> List[DataRequirementItem]: + def input_requirement(self) -> list[DataRequirementItem]: """Return data requirements needed for the model input.""" return reduce( operator.iadd, [model.input_requirement for model in self.models], [] diff --git a/deepmd/tf/model/model.py b/deepmd/tf/model/model.py index 5224fde473..833f8364ae 100644 --- a/deepmd/tf/model/model.py +++ b/deepmd/tf/model/model.py @@ -8,10 +8,7 @@ Enum, ) from typing import ( - Dict, - List, Optional, - Tuple, Union, ) @@ -113,7 +110,7 @@ def __new__(cls, *args, **kwargs): def __init__( self, type_embedding: Optional[Union[dict, TypeEmbedNet]] = None, - type_map: Optional[List[str]] = None, + type_map: Optional[list[str]] = None, data_stat_nbatch: int = 10, data_bias_nsample: int = 10, data_stat_protect: float = 1e-2, @@ -360,7 +357,7 @@ def build_type_embedding( return dout def _import_graph_def_from_frz_model( - self, frz_model: str, feed_dict: dict, return_elements: List[str] + self, frz_model: str, feed_dict: dict, return_elements: list[str] ): return_nodes = [x[:-2] for x in return_elements] graph, graph_def = load_graph_def(frz_model) @@ -370,7 +367,7 @@ def _import_graph_def_from_frz_model( ) def _import_graph_def_from_ckpt_meta( - self, ckpt_meta: str, feed_dict: dict, return_elements: List[str] + self, ckpt_meta: str, feed_dict: dict, return_elements: list[str] ): return_nodes = [x[:-2] for x in return_elements] with tf.Graph().as_default() as graph: @@ -469,7 +466,7 @@ def get_feed_dict( box: tf.Tensor, mesh: tf.Tensor, **kwargs, - ) -> Dict[str, tf.Tensor]: + ) -> dict[str, tf.Tensor]: """Generate the feed_dict for current descriptor. Parameters @@ -515,9 +512,9 @@ def get_feed_dict( def update_sel( cls, train_data: DeepmdDataSystem, - type_map: Optional[List[str]], + type_map: Optional[list[str]], local_jdata: dict, - ) -> Tuple[dict, Optional[float]]: + ) -> tuple[dict, Optional[float]]: """Update the selection and perform neighbor statistics. Notes @@ -586,7 +583,7 @@ def serialize(self, suffix: str = "") -> dict: @property @abstractmethod - def input_requirement(self) -> List[DataRequirementItem]: + def input_requirement(self) -> list[DataRequirementItem]: """Return data requirements needed for the model input.""" @@ -647,7 +644,7 @@ def __init__( descriptor: Union[dict, Descriptor], fitting_net: Union[dict, Fitting], type_embedding: Optional[Union[dict, TypeEmbedNet]] = None, - type_map: Optional[List[str]] = None, + type_map: Optional[list[str]] = None, **kwargs, ) -> None: super().__init__( @@ -761,9 +758,9 @@ def get_ntypes(self) -> int: def update_sel( cls, train_data: DeepmdDataSystem, - type_map: Optional[List[str]], + type_map: Optional[list[str]], local_jdata: dict, - ) -> Tuple[dict, Optional[float]]: + ) -> tuple[dict, Optional[float]]: """Update the selection and perform neighbor statistics. 
Parameters @@ -863,6 +860,6 @@ def serialize(self, suffix: str = "") -> dict: } @property - def input_requirement(self) -> List[DataRequirementItem]: + def input_requirement(self) -> list[DataRequirementItem]: """Return data requirements needed for the model input.""" return self.descrpt.input_requirement + self.fitting.input_requirement diff --git a/deepmd/tf/model/pairtab.py b/deepmd/tf/model/pairtab.py index 29ddfe9499..d54940fec6 100644 --- a/deepmd/tf/model/pairtab.py +++ b/deepmd/tf/model/pairtab.py @@ -3,9 +3,7 @@ Enum, ) from typing import ( - List, Optional, - Tuple, Union, ) @@ -69,7 +67,7 @@ class PairTabModel(Model): model_type = "ener" def __init__( - self, tab_file: str, rcut: float, sel: Union[int, List[int]], **kwargs + self, tab_file: str, rcut: float, sel: Union[int, list[int]], **kwargs ): super().__init__() self.tab_file = tab_file @@ -275,9 +273,9 @@ def enable_compression(self, suffix: str = "") -> None: def update_sel( cls, train_data: DeepmdDataSystem, - type_map: Optional[List[str]], + type_map: Optional[list[str]], local_jdata: dict, - ) -> Tuple[dict, Optional[float]]: + ) -> tuple[dict, Optional[float]]: """Update the selection and perform neighbor statistics. Notes @@ -308,6 +306,6 @@ def update_sel( return local_jdata_cpy, min_nbor_dist @property - def input_requirement(self) -> List[DataRequirementItem]: + def input_requirement(self) -> list[DataRequirementItem]: """Return data requirements needed for the model input.""" return [] diff --git a/deepmd/tf/model/pairwise_dprc.py b/deepmd/tf/model/pairwise_dprc.py index 6fd8e82f7e..c8a57d90b3 100644 --- a/deepmd/tf/model/pairwise_dprc.py +++ b/deepmd/tf/model/pairwise_dprc.py @@ -1,9 +1,6 @@ # SPDX-License-Identifier: LGPL-3.0-or-later from typing import ( - Dict, - List, Optional, - Tuple, Union, ) @@ -53,7 +50,7 @@ def __init__( qm_model: dict, qmmm_model: dict, type_embedding: Union[dict, TypeEmbedNet], - type_map: List[str], + type_map: list[str], data_stat_nbatch: int = 10, data_stat_nsample: int = 10, data_stat_protect: float = 1e-2, @@ -373,7 +370,7 @@ def get_feed_dict( box: tf.Tensor, mesh: tf.Tensor, **kwargs, - ) -> Dict[str, tf.Tensor]: + ) -> dict[str, tf.Tensor]: """Generate the feed_dict for current descriptor. Parameters @@ -416,9 +413,9 @@ def get_feed_dict( def update_sel( cls, train_data: DeepmdDataSystem, - type_map: Optional[List[str]], + type_map: Optional[list[str]], local_jdata: dict, - ) -> Tuple[dict, Optional[float]]: + ) -> tuple[dict, Optional[float]]: """Update the selection and perform neighbor statistics. 
 Parameters
@@ -442,7 +439,7 @@ def update_sel(
         return local_jdata, min_nbor_dist

     @property
-    def input_requirement(self) -> List[DataRequirementItem]:
+    def input_requirement(self) -> list[DataRequirementItem]:
         """Return data requirements needed for the model input."""
         data_requirement = []
         data_requirement.append(
diff --git a/deepmd/tf/model/tensor.py b/deepmd/tf/model/tensor.py
index b2afe0d71f..8514844e03 100644
--- a/deepmd/tf/model/tensor.py
+++ b/deepmd/tf/model/tensor.py
@@ -1,6 +1,5 @@
 # SPDX-License-Identifier: LGPL-3.0-or-later
 from typing import (
-    List,
     Optional,
     Union,
 )
@@ -50,7 +49,7 @@ def __init__(
         descriptor: dict,
         fitting_net: dict,
         type_embedding: Optional[Union[dict, TypeEmbedNet]] = None,
-        type_map: Optional[List[str]] = None,
+        type_map: Optional[list[str]] = None,
         data_stat_nbatch: int = 10,
         data_stat_protect: float = 1e-2,
         **kwargs,
diff --git a/deepmd/tf/nvnmd/utils/fio.py b/deepmd/tf/nvnmd/utils/fio.py
index 3efd7520dd..9daff62183 100644
--- a/deepmd/tf/nvnmd/utils/fio.py
+++ b/deepmd/tf/nvnmd/utils/fio.py
@@ -3,9 +3,6 @@
 import logging
 import os
 import struct
-from typing import (
-    List,
-)

 import numpy as np

@@ -168,7 +165,7 @@ def load(self, file_name="", default_value=""):
             log.warning(f"can not find {file_name}")
             return default_value

-    def save(self, file_name: str, data: List[str]):
+    def save(self, file_name: str, data: list[str]):
         r"""Save hex string into binary file."""
         log.info(f"write binary to {file_name}")
         Fio().create_file_path(file_name)
diff --git a/deepmd/tf/train/run_options.py b/deepmd/tf/train/run_options.py
index b835d63852..c36b42e194 100644
--- a/deepmd/tf/train/run_options.py
+++ b/deepmd/tf/train/run_options.py
@@ -8,7 +8,6 @@
 )
 from typing import (
     TYPE_CHECKING,
-    List,
     Optional,
 )

@@ -80,7 +79,7 @@ class RunOptions:
     Attributes
     ----------
-    gpus: Optional[List[int]]
+    gpus: Optional[list[int]]
         list of GPUs if any are present else None
     is_chief: bool
         in distributed training it is true for the main MPI process in serial it is
@@ -91,17 +90,17 @@
         index of the MPI task
     nodename: str
         name of the node
-    node_list_ : List[str]
+    node_list_ : list[str]
         the list of nodes of the current mpirun
     my_device: str
         device type - gpu or cpu
     """

-    gpus: Optional[List[int]]
+    gpus: Optional[list[int]]
     world_size: int
     my_rank: int
     nodename: str
-    nodelist: List[int]
+    nodelist: list[int]
     my_device: str
     _HVD: Optional["HVD"]
diff --git a/deepmd/tf/train/trainer.py b/deepmd/tf/train/trainer.py
index 474af1da90..9f353f2e32 100644
--- a/deepmd/tf/train/trainer.py
+++ b/deepmd/tf/train/trainer.py
@@ -4,10 +4,6 @@
 import os
 import shutil
 import time
-from typing import (
-    Dict,
-    List,
-)

 import google.protobuf.message
 import numpy as np
@@ -420,6 +416,8 @@ def train(self, train_data=None, valid_data=None):
             fp = open(self.disp_file, "a")

         cur_batch = run_sess(self.sess, self.global_step)
+        start_batch = cur_batch
+        elapsed_batch = stop_batch - start_batch
         is_first_step = True
         self.cur_batch = cur_batch
         log.info(
@@ -556,7 +554,10 @@ def train(self, train_data=None, valid_data=None):
                     )
                 )
                 # the first training time is not accurate
-                if cur_batch > self.disp_freq or stop_batch < 2 * self.disp_freq:
+                if (
+                    cur_batch - start_batch > self.disp_freq
+                    or elapsed_batch < 2 * self.disp_freq
+                ):
                     total_train_time += train_time
                     train_time = 0
                     wall_time_tic = toc
@@ -598,18 +599,23 @@ def train(self, train_data=None, valid_data=None):
         self.save_checkpoint(cur_batch)
         if self.run_opt.is_chief:
             fp.close()
-        if self.timing_in_training and stop_batch // self.disp_freq > 0:
-            if stop_batch >= 2 * self.disp_freq:
+        elapsed_batch = stop_batch - start_batch
+        if self.timing_in_training and elapsed_batch // self.disp_freq > 0:
+            if elapsed_batch >= 2 * self.disp_freq:
                 log.info(
                     "average training time: %.4f s/batch (exclude first %d batches)",
                     total_train_time
-                    / (stop_batch // self.disp_freq * self.disp_freq - self.disp_freq),
+                    / (
+                        elapsed_batch // self.disp_freq * self.disp_freq
+                        - self.disp_freq
+                    ),
                     self.disp_freq,
                 )
             else:
                 log.info(
                     "average training time: %.4f s/batch",
-                    total_train_time / (stop_batch // self.disp_freq * self.disp_freq),
+                    total_train_time
+                    / (elapsed_batch // self.disp_freq * self.disp_freq),
                 )

         if self.profiling and self.run_opt.is_chief:
@@ -891,7 +897,7 @@ def _change_energy_bias(
         )

     @property
-    def data_requirements(self) -> List[DataRequirementItem]:
+    def data_requirements(self) -> list[DataRequirementItem]:
         return self.model.input_requirement + self.loss.label_requirement

@@ -922,17 +928,17 @@ def __init__(self, train_data: DeepmdDataSystem):
         self.data_keys = batch_data.keys()
         self.data_types = [tf.as_dtype(x.dtype) for x in batch_data.values()]

-    def build(self) -> List[tf.Tensor]:
+    def build(self) -> list[tf.Tensor]:
         """Build the OP that loads the training data.

         Returns
         -------
-        List[tf.Tensor]
+        list[tf.Tensor]
             Tensor of the loaded data.
         """
         train_data = self.train_data

-        def get_train_batch() -> List[np.ndarray]:
+        def get_train_batch() -> list[np.ndarray]:
             batch_data = train_data.get_batch()
             # convert dict to list of arrays
             batch_data = tuple([batch_data[kk] for kk in self.data_keys])
             return batch_data

         return tf.py_func(get_train_batch, [], self.data_types, name="train_data")

-    def get_data_dict(self, batch_list: List[np.ndarray]) -> Dict[str, np.ndarray]:
+    def get_data_dict(self, batch_list: list[np.ndarray]) -> dict[str, np.ndarray]:
         """Generate a dict of the loaded data.

         Parameters
         ----------
-        batch_list : List[np.ndarray]
+        batch_list : list[np.ndarray]
             The loaded data.

         Returns
         -------
-        Dict[str, np.ndarray]
+        dict[str, np.ndarray]
             The dict of the loaded data.
         """
         return dict(zip(self.data_keys, batch_list))
diff --git a/deepmd/tf/utils/finetune.py b/deepmd/tf/utils/finetune.py
index 4e55b9f5bb..4c57246ffd 100644
--- a/deepmd/tf/utils/finetune.py
+++ b/deepmd/tf/utils/finetune.py
@@ -3,7 +3,6 @@
 import logging
 from typing import (
     Any,
-    Dict,
 )

 from deepmd.tf.utils.errors import (
@@ -17,13 +16,13 @@


 def replace_model_params_with_pretrained_model(
-    jdata: Dict[str, Any], pretrained_model: str
+    jdata: dict[str, Any], pretrained_model: str
 ):
     """Replace the model params in input script according to pretrained model.

     Parameters
     ----------
-    jdata : Dict[str, Any]
+    jdata : dict[str, Any]
         input script
     pretrained_model : str
         filename of the pretrained model
diff --git a/deepmd/tf/utils/graph.py b/deepmd/tf/utils/graph.py
index a891506e95..4fccaac0e8 100644
--- a/deepmd/tf/utils/graph.py
+++ b/deepmd/tf/utils/graph.py
@@ -1,9 +1,5 @@
 # SPDX-License-Identifier: LGPL-3.0-or-later
 import re
-from typing import (
-    Dict,
-    Tuple,
-)

 import numpy as np

@@ -22,7 +18,7 @@
 )


-def load_graph_def(model_file: str) -> Tuple[tf.Graph, tf.GraphDef]:
+def load_graph_def(model_file: str) -> tuple[tf.Graph, tf.GraphDef]:
     """Load graph as well as the graph_def from the frozen model(model_file).
Parameters @@ -98,7 +94,7 @@ def get_tensor_by_name(model_file: str, tensor_name: str) -> tf.Tensor: return get_tensor_by_name_from_graph(graph, tensor_name) -def get_pattern_nodes_from_graph_def(graph_def: tf.GraphDef, pattern: str) -> Dict: +def get_pattern_nodes_from_graph_def(graph_def: tf.GraphDef, pattern: str) -> dict: """Get the pattern nodes with the given tf.GraphDef object. Parameters @@ -123,7 +119,7 @@ def get_pattern_nodes_from_graph_def(graph_def: tf.GraphDef, pattern: str) -> Di def get_embedding_net_nodes_from_graph_def( graph_def: tf.GraphDef, suffix: str = "" -) -> Dict: +) -> dict: """Get the embedding net nodes with the given tf.GraphDef object. Parameters @@ -154,7 +150,7 @@ def get_embedding_net_nodes_from_graph_def( return embedding_net_nodes -def get_embedding_net_nodes(model_file: str, suffix: str = "") -> Dict: +def get_embedding_net_nodes(model_file: str, suffix: str = "") -> dict: """Get the embedding net nodes with the given frozen model(model_file). Parameters @@ -175,7 +171,7 @@ def get_embedding_net_nodes(model_file: str, suffix: str = "") -> Dict: def get_embedding_net_variables_from_graph_def( graph_def: tf.GraphDef, suffix: str = "" -) -> Dict: +) -> dict: """Get the embedding net variables with the given tf.GraphDef object. Parameters @@ -220,7 +216,7 @@ def get_extra_embedding_net_nodes_from_graph_def( graph_def: tf.GraphDef, suffix: str = "", extra_suffix: str = "", -) -> Dict: +) -> dict: """Get the extra embedding net nodes with the given tf.GraphDef object. Parameters @@ -259,7 +255,7 @@ def get_extra_embedding_net_variables_from_graph_def( graph_def: tf.GraphDef, suffix: str = "", extra_suffix: str = "", -) -> Dict: +) -> dict: """Get the embedding net variables with the given tf.GraphDef object. Parameters @@ -282,7 +278,7 @@ def get_extra_embedding_net_variables_from_graph_def( return convert_tensor_to_ndarray_in_dict(extra_embedding_net_nodes) -def get_embedding_net_variables(model_file: str, suffix: str = "") -> Dict: +def get_embedding_net_variables(model_file: str, suffix: str = "") -> dict: """Get the embedding net variables with the given frozen model(model_file). Parameters @@ -303,7 +299,7 @@ def get_embedding_net_variables(model_file: str, suffix: str = "") -> Dict: def get_fitting_net_nodes_from_graph_def( graph_def: tf.GraphDef, suffix: str = "" -) -> Dict: +) -> dict: """Get the fitting net nodes with the given tf.GraphDef object. Parameters @@ -334,7 +330,7 @@ def get_fitting_net_nodes_from_graph_def( return fitting_net_nodes -def get_fitting_net_nodes(model_file: str) -> Dict: +def get_fitting_net_nodes(model_file: str) -> dict: """Get the fitting net nodes with the given frozen model(model_file). Parameters @@ -353,7 +349,7 @@ def get_fitting_net_nodes(model_file: str) -> Dict: def get_fitting_net_variables_from_graph_def( graph_def: tf.GraphDef, suffix: str = "" -) -> Dict: +) -> dict: """Get the fitting net variables with the given tf.GraphDef object. Parameters @@ -372,7 +368,7 @@ def get_fitting_net_variables_from_graph_def( return convert_tensor_to_ndarray_in_dict(fitting_net_nodes) -def get_fitting_net_variables(model_file: str, suffix: str = "") -> Dict: +def get_fitting_net_variables(model_file: str, suffix: str = "") -> dict: """Get the fitting net variables with the given frozen model(model_file). 
Parameters @@ -393,7 +389,7 @@ def get_fitting_net_variables(model_file: str, suffix: str = "") -> Dict: def get_type_embedding_net_nodes_from_graph_def( graph_def: tf.GraphDef, suffix: str = "" -) -> Dict: +) -> dict: """Get the type embedding net nodes with the given tf.GraphDef object. Parameters @@ -425,7 +421,7 @@ def get_type_embedding_net_nodes_from_graph_def( def get_type_embedding_net_variables_from_graph_def( graph_def: tf.GraphDef, suffix: str = "" -) -> Dict: +) -> dict: """Get the type embedding net variables with the given tf.GraphDef object. Parameters @@ -448,7 +444,7 @@ def get_type_embedding_net_variables_from_graph_def( def get_attention_layer_nodes_from_graph_def( graph_def: tf.GraphDef, suffix: str = "" -) -> Dict: +) -> dict: """Get the attention layer nodes with the given tf.GraphDef object. Parameters @@ -482,7 +478,7 @@ def get_attention_layer_nodes_from_graph_def( def get_attention_layer_variables_from_graph_def( graph_def: tf.GraphDef, suffix: str = "" -) -> Dict: +) -> dict: """Get the attention layer variables with the given tf.GraphDef object. Parameters @@ -504,18 +500,18 @@ def get_attention_layer_variables_from_graph_def( def convert_tensor_to_ndarray_in_dict( - tensor_dict: Dict[str, tf.Tensor], -) -> Dict[str, np.ndarray]: + tensor_dict: dict[str, tf.Tensor], +) -> dict[str, np.ndarray]: """Convert tensor to ndarray in dict. Parameters ---------- - tensor_dict : Dict[str, tf.Tensor] + tensor_dict : dict[str, tf.Tensor] The input tensor dict Returns ------- - Dict[str, np.ndarray] + dict[str, np.ndarray] The converted tensor dict """ for key in tensor_dict: diff --git a/deepmd/tf/utils/neighbor_stat.py b/deepmd/tf/utils/neighbor_stat.py index f668d4a4da..4052c89821 100644 --- a/deepmd/tf/utils/neighbor_stat.py +++ b/deepmd/tf/utils/neighbor_stat.py @@ -1,9 +1,10 @@ # SPDX-License-Identifier: LGPL-3.0-or-later import logging -from typing import ( +from collections.abc import ( Iterator, +) +from typing import ( Optional, - Tuple, ) import numpy as np @@ -61,7 +62,7 @@ def build( atype: tf.Tensor, cell: tf.Tensor, pbc: tf.Tensor, - ) -> Tuple[tf.Tensor, tf.Tensor]: + ) -> tuple[tf.Tensor, tf.Tensor]: """Calculate the nearest neighbor distance between atoms, maximum nbor size of atoms and the output data range of the environment matrix. @@ -187,7 +188,7 @@ def __init__( self.op = self.build() self.sub_sess = tf.Session(graph=sub_graph, config=default_tf_session_config) - def build(self) -> Tuple[tf.Tensor, tf.Tensor]: + def build(self) -> tuple[tf.Tensor, tf.Tensor]: """Build the graph. Returns @@ -215,7 +216,7 @@ def build(self) -> Tuple[tf.Tensor, tf.Tensor]: def iterator( self, data: DeepmdDataSystem - ) -> Iterator[Tuple[np.ndarray, float, str]]: + ) -> Iterator[tuple[np.ndarray, float, str]]: """Produce data. 
Parameters diff --git a/deepmd/tf/utils/parallel_op.py b/deepmd/tf/utils/parallel_op.py index 5eeb1fab7f..ce43ea8c15 100644 --- a/deepmd/tf/utils/parallel_op.py +++ b/deepmd/tf/utils/parallel_op.py @@ -1,11 +1,11 @@ # SPDX-License-Identifier: LGPL-3.0-or-later +from collections.abc import ( + Generator, +) from typing import ( Any, Callable, - Dict, - Generator, Optional, - Tuple, ) from deepmd.tf.env import ( @@ -21,7 +21,7 @@ class ParallelOp: Parameters ---------- - builder : Callable[..., Tuple[Dict[str, tf.Tensor], Tuple[tf.Tensor]]] + builder : Callable[..., tuple[dict[str, tf.Tensor], tuple[tf.Tensor]]] returns two objects: a dict which stores placeholders by key, and a tuple with the final op(s) nthreads : int, optional the number of threads @@ -45,7 +45,7 @@ class ParallelOp: def __init__( self, - builder: Callable[..., Tuple[Dict[str, tf.Tensor], Tuple[tf.Tensor]]], + builder: Callable[..., tuple[dict[str, tf.Tensor], tuple[tf.Tensor]]], nthreads: Optional[int] = None, config: Optional[tf.ConfigProto] = None, ) -> None: @@ -65,8 +65,8 @@ def __init__( self.ops.append(op) def generate( - self, sess: tf.Session, feed: Generator[Dict[str, Any], None, None] - ) -> Generator[Tuple, None, None]: + self, sess: tf.Session, feed: Generator[dict[str, Any], None, None] + ) -> Generator[tuple, None, None]: """Returns a generator. Parameters diff --git a/deepmd/tf/utils/spin.py b/deepmd/tf/utils/spin.py index c20d4dcc7b..ab70bdf319 100644 --- a/deepmd/tf/utils/spin.py +++ b/deepmd/tf/utils/spin.py @@ -1,6 +1,5 @@ # SPDX-License-Identifier: LGPL-3.0-or-later from typing import ( - List, Optional, ) @@ -25,9 +24,9 @@ class Spin: def __init__( self, - use_spin: Optional[List[bool]] = None, - spin_norm: Optional[List[float]] = None, - virtual_len: Optional[List[float]] = None, + use_spin: Optional[list[bool]] = None, + spin_norm: Optional[list[float]] = None, + virtual_len: Optional[list[float]] = None, ) -> None: """Constructor.""" self.use_spin = use_spin @@ -74,14 +73,14 @@ def get_ntypes_spin(self) -> int: """Returns the number of atom types which contain spin.""" return self.ntypes_spin - def get_use_spin(self) -> List[bool]: + def get_use_spin(self) -> list[bool]: """Returns the list of whether to use spin for each atom type.""" return self.use_spin - def get_spin_norm(self) -> List[float]: + def get_spin_norm(self) -> list[float]: """Returns the list of magnitude of atomic spin for each atom type.""" return self.spin_norm - def get_virtual_len(self) -> List[float]: + def get_virtual_len(self) -> list[float]: """Returns the list of distance between real atom and virtual atom for each atom type.""" return self.virtual_len diff --git a/deepmd/tf/utils/tabulate.py b/deepmd/tf/utils/tabulate.py index e1ab45c44f..1dc6128f62 100644 --- a/deepmd/tf/utils/tabulate.py +++ b/deepmd/tf/utils/tabulate.py @@ -1,13 +1,11 @@ # SPDX-License-Identifier: LGPL-3.0-or-later import logging from functools import ( + cached_property, lru_cache, ) from typing import ( Callable, - Dict, - List, - Tuple, ) import numpy as np @@ -53,7 +51,7 @@ class DPTabulate: The graph_def of the original model type_one_side Try to build N_types tables. Otherwise, building N_types^2 tables - exclude_types : List[List[int]] + exclude_types : list[list[int]] The excluded pairs of types which have no interaction with each other. For example, `[[0, 1]]` means no interaction between type 0 and type 1. 
 activation_function
@@ -65,11 +63,11 @@ class DPTabulate:
     def __init__(
         self,
         descrpt: Descriptor,
-        neuron: List[int],
+        neuron: list[int],
         graph: tf.Graph,
         graph_def: tf.GraphDef,
         type_one_side: bool = False,
-        exclude_types: List[List[int]] = [],
+        exclude_types: list[list[int]] = [],
         activation_fn: Callable[[tf.Tensor], tf.Tensor] = tf.nn.tanh,
         suffix: str = "",
     ) -> None:
@@ -160,7 +158,7 @@ def __init__(
     def build(
         self, min_nbor_dist: float, extrapolate: float, stride0: float, stride1: float
-    ) -> Tuple[Dict[str, int], Dict[str, int]]:
+    ) -> tuple[dict[str, int], dict[str, int]]:
         r"""Build the tables for model compression.

         Parameters
@@ -773,8 +771,7 @@ def _get_layer_size(self):
             raise RuntimeError("Unsupported descriptor")
         return layer_size

-    @property
-    @lru_cache
+    @cached_property
     def _n_all_excluded(self) -> int:
         """The number of types excluding all types."""
         return sum(int(self._all_excluded(ii)) for ii in range(0, self.ntypes))
diff --git a/deepmd/tf/utils/type_embed.py b/deepmd/tf/utils/type_embed.py
index 7d74b0a856..13d02a858c 100644
--- a/deepmd/tf/utils/type_embed.py
+++ b/deepmd/tf/utils/type_embed.py
@@ -2,7 +2,6 @@
 import logging
 import re
 from typing import (
-    List,
     Optional,
     Union,
 )
@@ -105,7 +104,7 @@ class TypeEmbedNet:
         Whether to use electronic configuration type embedding.
     use_tebd_bias : bool, Optional
         Whether to use bias in the type embedding layer.
-    type_map: List[str], Optional
+    type_map: list[str], Optional
         A list of strings. Give the name to each type of atoms.
     """

@@ -113,7 +112,7 @@ def __init__(
         self,
         *,
         ntypes: int,
-        neuron: List[int],
+        neuron: list[int],
         resnet_dt: bool = False,
         activation_function: Union[str, None] = "tanh",
         precision: str = "default",
@@ -123,7 +122,7 @@ def __init__(
         padding: bool = False,
         use_econf_tebd: bool = False,
         use_tebd_bias: bool = False,
-        type_map: Optional[List[str]] = None,
+        type_map: Optional[list[str]] = None,
         **kwargs,
     ) -> None:
         """Constructor."""
diff --git a/deepmd/tf/utils/update_sel.py b/deepmd/tf/utils/update_sel.py
index 726aec4d41..8915eb0147 100644
--- a/deepmd/tf/utils/update_sel.py
+++ b/deepmd/tf/utils/update_sel.py
@@ -1,7 +1,4 @@
 # SPDX-License-Identifier: LGPL-3.0-or-later
-from typing import (
-    Type,
-)

 from deepmd.tf.utils.neighbor_stat import (
     NeighborStat,
@@ -13,5 +10,5 @@ class UpdateSel(BaseUpdateSel):

     @property
-    def neighbor_stat(self) -> Type[NeighborStat]:
+    def neighbor_stat(self) -> type[NeighborStat]:
         return NeighborStat
diff --git a/deepmd/utils/argcheck.py b/deepmd/utils/argcheck.py
index a799b6b0c4..1a5e1cc3b2 100644
--- a/deepmd/utils/argcheck.py
+++ b/deepmd/utils/argcheck.py
@@ -4,8 +4,6 @@
 import warnings
 from typing import (
     Callable,
-    Dict,
-    List,
     Optional,
     Union,
 )
@@ -92,7 +90,7 @@ def type_embedding_args():
     doc_use_tebd_bias = "Whether to use bias in the type embedding layer."
return [ - Argument("neuron", List[int], optional=True, default=[8], doc=doc_neuron), + Argument("neuron", list[int], optional=True, default=[8], doc=doc_neuron), Argument( "activation_function", str, @@ -136,22 +134,22 @@ def spin_args(): ) return [ - Argument("use_spin", [List[bool], List[int]], doc=doc_use_spin), + Argument("use_spin", [list[bool], list[int]], doc=doc_use_spin), Argument( "spin_norm", - List[float], + list[float], optional=True, doc=doc_only_tf_supported + doc_spin_norm, ), Argument( "virtual_len", - List[float], + list[float], optional=True, doc=doc_only_tf_supported + doc_virtual_len, ), Argument( "virtual_scale", - [List[float], float], + [list[float], float], optional=True, doc=doc_only_pt_supported + doc_virtual_scale, ), @@ -166,10 +164,10 @@ def __init__(self) -> None: self.__plugin = Plugin() def register( - self, name: str, alias: Optional[List[str]] = None, doc: str = "" + self, name: str, alias: Optional[list[str]] = None, doc: str = "" ) -> Callable[ - [Union[Callable[[], Argument], Callable[[], List[Argument]]]], - Union[Callable[[], Argument], Callable[[], List[Argument]]], + [Union[Callable[[], Argument], Callable[[], list[Argument]]]], + Union[Callable[[], Argument], Callable[[], list[Argument]]], ]: """Register a descriptor argument plugin. @@ -177,12 +175,12 @@ def register( ---------- name : str the name of a descriptor - alias : List[str], optional + alias : list[str], optional the list of aliases of this descriptor Returns ------- - Callable[[Union[Callable[[], Argument], Callable[[], List[Argument]]]], Union[Callable[[], Argument], Callable[[], List[Argument]]]] + Callable[[Union[Callable[[], Argument], Callable[[], list[Argument]]]], Union[Callable[[], Argument], Callable[[], list[Argument]]]] decorator to return the registered descriptor argument method Examples @@ -197,7 +195,7 @@ def descrpt_some_descrpt_args(): alias = tuple(alias) return self.__plugin.register((name, alias, doc)) - def get_all_argument(self, exclude_hybrid: bool = False) -> List[Argument]: + def get_all_argument(self, exclude_hybrid: bool = False) -> list[Argument]: """Get all arguments. Parameters @@ -207,7 +205,7 @@ def get_all_argument(self, exclude_hybrid: bool = False) -> List[Argument]: Returns ------- - List[Argument] + list[Argument] all arguments """ arguments = [] @@ -245,17 +243,17 @@ def descrpt_local_frame_args(): - axis_rule[i*6+5]: index of the axis atom defining the second axis. Note that the neighbors with the same class and type are sorted according to their relative distance." return [ - Argument("sel_a", List[int], optional=False, doc=doc_sel_a), - Argument("sel_r", List[int], optional=False, doc=doc_sel_r), + Argument("sel_a", list[int], optional=False, doc=doc_sel_a), + Argument("sel_r", list[int], optional=False, doc=doc_sel_r), Argument("rcut", float, optional=True, default=6.0, doc=doc_rcut), - Argument("axis_rule", List[int], optional=False, doc=doc_axis_rule), + Argument("axis_rule", list[int], optional=False, doc=doc_axis_rule), ] @descrpt_args_plugin.register("se_e2_a", alias=["se_a"]) def descrpt_se_a_args(): doc_sel = 'This parameter set the number of selected neighbors for each type of atom. It can be:\n\n\ - - `List[int]`. The length of the list should be the same as the number of atom types in the system. `sel[i]` gives the selected number of type-i neighbors. `sel[i]` is recommended to be larger than the maximally possible number of type-i neighbors in the cut-off radius. 
It is noted that the total sel value must be less than 4096 in a GPU environment.\n\n\ + - `list[int]`. The length of the list should be the same as the number of atom types in the system. `sel[i]` gives the selected number of type-i neighbors. `sel[i]` is recommended to be larger than the maximally possible number of type-i neighbors in the cut-off radius. It is noted that the total sel value must be less than 4096 in a GPU environment.\n\n\ - `str`. Can be "auto:factor" or "auto". "factor" is a float number larger than 1. This option will automatically determine the `sel`. In detail it counts the maximal number of neighbors with in the cutoff radius for each type of neighbor, then multiply the maximum by the "factor". Finally the number is wraped up to 4 divisible. The option "auto" is equivalent to "auto:1.1".' doc_rcut = "The cut-off radius." doc_rcut_smth = "Where to start smoothing. For example the 1/r term is smoothed from `rcut` to `rcut_smth`" @@ -272,11 +270,11 @@ def descrpt_se_a_args(): doc_set_davg_zero = "Set the normalization average to zero. This option should be set when `atom_ener` in the energy fitting is used" return [ - Argument("sel", [List[int], str], optional=True, default="auto", doc=doc_sel), + Argument("sel", [list[int], str], optional=True, default="auto", doc=doc_sel), Argument("rcut", float, optional=True, default=6.0, doc=doc_rcut), Argument("rcut_smth", float, optional=True, default=0.5, doc=doc_rcut_smth), Argument( - "neuron", List[int], optional=True, default=[10, 20, 40], doc=doc_neuron + "neuron", list[int], optional=True, default=[10, 20, 40], doc=doc_neuron ), Argument( "axis_neuron", @@ -302,7 +300,7 @@ def descrpt_se_a_args(): Argument("seed", [int, None], optional=True, doc=doc_seed), Argument( "exclude_types", - List[List[int]], + list[list[int]], optional=True, default=[], doc=doc_exclude_types, @@ -323,7 +321,7 @@ def descrpt_se_a_args(): @descrpt_args_plugin.register("se_e3", alias=["se_at", "se_a_3be", "se_t"]) def descrpt_se_t_args(): doc_sel = 'This parameter set the number of selected neighbors for each type of atom. It can be:\n\n\ - - `List[int]`. The length of the list should be the same as the number of atom types in the system. `sel[i]` gives the selected number of type-i neighbors. `sel[i]` is recommended to be larger than the maximally possible number of type-i neighbors in the cut-off radius. It is noted that the total sel value must be less than 4096 in a GPU environment.\n\n\ + - `list[int]`. The length of the list should be the same as the number of atom types in the system. `sel[i]` gives the selected number of type-i neighbors. `sel[i]` is recommended to be larger than the maximally possible number of type-i neighbors in the cut-off radius. It is noted that the total sel value must be less than 4096 in a GPU environment.\n\n\ - `str`. Can be "auto:factor" or "auto". "factor" is a float number larger than 1. This option will automatically determine the `sel`. In detail it counts the maximal number of neighbors with in the cutoff radius for each type of neighbor, then multiply the maximum by the "factor". Finally the number is wraped up to 4 divisible. The option "auto" is equivalent to "auto:1.1".' doc_rcut = "The cut-off radius." doc_rcut_smth = "Where to start smoothing. For example the 1/r term is smoothed from `rcut` to `rcut_smth`" @@ -338,11 +336,11 @@ def descrpt_se_t_args(): doc_env_protection = "Protection parameter to prevent division by zero errors during environment matrix calculations. 
For example, when using paddings, there may be zero distances of neighbors, which may make division by zero error during environment matrix calculations without protection." return [ - Argument("sel", [List[int], str], optional=True, default="auto", doc=doc_sel), + Argument("sel", [list[int], str], optional=True, default="auto", doc=doc_sel), Argument("rcut", float, optional=True, default=6.0, doc=doc_rcut), Argument("rcut_smth", float, optional=True, default=0.5, doc=doc_rcut_smth), Argument( - "neuron", List[int], optional=True, default=[10, 20, 40], doc=doc_neuron + "neuron", list[int], optional=True, default=[10, 20, 40], doc=doc_neuron ), Argument( "activation_function", @@ -360,7 +358,7 @@ def descrpt_se_t_args(): ), Argument( "exclude_types", - List[List[int]], + list[list[int]], optional=True, default=[], doc=doc_exclude_types, @@ -392,7 +390,7 @@ def descrpt_se_a_tpe_args(): @descrpt_args_plugin.register("se_e2_r", alias=["se_r"]) def descrpt_se_r_args(): doc_sel = 'This parameter set the number of selected neighbors for each type of atom. It can be:\n\n\ - - `List[int]`. The length of the list should be the same as the number of atom types in the system. `sel[i]` gives the selected number of type-i neighbors. `sel[i]` is recommended to be larger than the maximally possible number of type-i neighbors in the cut-off radius. It is noted that the total sel value must be less than 4096 in a GPU environment.\n\n\ + - `list[int]`. The length of the list should be the same as the number of atom types in the system. `sel[i]` gives the selected number of type-i neighbors. `sel[i]` is recommended to be larger than the maximally possible number of type-i neighbors in the cut-off radius. It is noted that the total sel value must be less than 4096 in a GPU environment.\n\n\ - `str`. Can be "auto:factor" or "auto". "factor" is a float number larger than 1. This option will automatically determine the `sel`. In detail it counts the maximal number of neighbors with in the cutoff radius for each type of neighbor, then multiply the maximum by the "factor". Finally the number is wraped up to 4 divisible. The option "auto" is equivalent to "auto:1.1".' doc_rcut = "The cut-off radius." doc_rcut_smth = "Where to start smoothing. For example the 1/r term is smoothed from `rcut` to `rcut_smth`" @@ -408,11 +406,11 @@ def descrpt_se_r_args(): doc_env_protection = "Protection parameter to prevent division by zero errors during environment matrix calculations. For example, when using paddings, there may be zero distances of neighbors, which may make division by zero error during environment matrix calculations without protection." return [ - Argument("sel", [List[int], str], optional=True, default="auto", doc=doc_sel), + Argument("sel", [list[int], str], optional=True, default="auto", doc=doc_sel), Argument("rcut", float, optional=True, default=6.0, doc=doc_rcut), Argument("rcut_smth", float, optional=True, default=0.5, doc=doc_rcut_smth), Argument( - "neuron", List[int], optional=True, default=[10, 20, 40], doc=doc_neuron + "neuron", list[int], optional=True, default=[10, 20, 40], doc=doc_neuron ), Argument( "activation_function", @@ -430,7 +428,7 @@ def descrpt_se_r_args(): Argument("seed", [int, None], optional=True, doc=doc_seed), Argument( "exclude_types", - List[List[int]], + list[list[int]], optional=True, default=[], doc=doc_exclude_types, @@ -469,7 +467,7 @@ def descrpt_hybrid_args(): def descrpt_se_atten_common_args(): doc_sel = 'This parameter set the number of selected neighbors. 
Note that this parameter is a little different from that in other descriptors. Instead of separating each type of atoms, only the summation matters. And this number is highly related with the efficiency, thus one should not make it too large. Usually 200 or less is enough, far away from the GPU limitation 4096. It can be:\n\n\ - `int`. The maximum number of neighbor atoms to be considered. We recommend it to be less than 200. \n\n\ - - `List[int]`. The length of the list should be the same as the number of atom types in the system. `sel[i]` gives the selected number of type-i neighbors. Only the summation of `sel[i]` matters, and it is recommended to be less than 200.\ + - `list[int]`. The length of the list should be the same as the number of atom types in the system. `sel[i]` gives the selected number of type-i neighbors. Only the summation of `sel[i]` matters, and it is recommended to be less than 200.\ - `str`. Can be "auto:factor" or "auto". "factor" is a float number larger than 1. This option will automatically determine the `sel`. In detail it counts the maximal number of neighbors with in the cutoff radius for each type of neighbor, then multiply the maximum by the "factor". Finally the number is wraped up to 4 divisible. The option "auto" is equivalent to "auto:1.1".' doc_rcut = "The cut-off radius." doc_rcut_smth = "Where to start smoothing. For example the 1/r term is smoothed from `rcut` to `rcut_smth`" @@ -490,12 +488,12 @@ def descrpt_se_atten_common_args(): return [ Argument( - "sel", [int, List[int], str], optional=True, default="auto", doc=doc_sel + "sel", [int, list[int], str], optional=True, default="auto", doc=doc_sel ), Argument("rcut", float, optional=True, default=6.0, doc=doc_rcut), Argument("rcut_smth", float, optional=True, default=0.5, doc=doc_rcut_smth), Argument( - "neuron", List[int], optional=True, default=[10, 20, 40], doc=doc_neuron + "neuron", list[int], optional=True, default=[10, 20, 40], doc=doc_neuron ), Argument( "axis_neuron", @@ -521,7 +519,7 @@ def descrpt_se_atten_common_args(): Argument("seed", [int, None], optional=True, doc=doc_seed), Argument( "exclude_types", - List[List[int]], + list[list[int]], optional=True, default=[], doc=doc_exclude_types, @@ -666,7 +664,7 @@ def descrpt_se_atten_args(): def descrpt_se_e3_tebd_args(): doc_sel = 'This parameter set the number of selected neighbors. Note that this parameter is a little different from that in other descriptors. Instead of separating each type of atoms, only the summation matters. And this number is highly related with the efficiency, thus one should not make it too large. Usually 200 or less is enough, far away from the GPU limitation 4096. It can be:\n\n\ - `int`. The maximum number of neighbor atoms to be considered. We recommend it to be less than 200. \n\n\ - - `List[int]`. The length of the list should be the same as the number of atom types in the system. `sel[i]` gives the selected number of type-i neighbors. Only the summation of `sel[i]` matters, and it is recommended to be less than 200.\ + - `list[int]`. The length of the list should be the same as the number of atom types in the system. `sel[i]` gives the selected number of type-i neighbors. Only the summation of `sel[i]` matters, and it is recommended to be less than 200.\ - `str`. Can be "auto:factor" or "auto". "factor" is a float number larger than 1. This option will automatically determine the `sel`. 
In detail it counts the maximal number of neighbors with in the cutoff radius for each type of neighbor, then multiply the maximum by the "factor". Finally the number is wraped up to 4 divisible. The option "auto" is equivalent to "auto:1.1".' doc_rcut = "The cut-off radius." doc_rcut_smth = "Where to start smoothing. For example the 1/r term is smoothed from `rcut` to `rcut_smth`" @@ -697,12 +695,12 @@ def descrpt_se_e3_tebd_args(): return [ Argument( - "sel", [int, List[int], str], optional=True, default="auto", doc=doc_sel + "sel", [int, list[int], str], optional=True, default="auto", doc=doc_sel ), Argument("rcut", float, optional=True, default=6.0, doc=doc_rcut), Argument("rcut_smth", float, optional=True, default=0.5, doc=doc_rcut_smth), Argument( - "neuron", List[int], optional=True, default=[10, 20, 40], doc=doc_neuron + "neuron", list[int], optional=True, default=[10, 20, 40], doc=doc_neuron ), Argument( "tebd_dim", @@ -745,7 +743,7 @@ def descrpt_se_e3_tebd_args(): ), Argument( "exclude_types", - List[List[int]], + list[list[int]], optional=True, default=[], doc=doc_exclude_types, @@ -898,7 +896,7 @@ def descrpt_dpa2_args(): Argument("smooth", bool, optional=True, default=True, doc=doc_smooth), Argument( "exclude_types", - List[List[int]], + list[list[int]], optional=True, default=[], doc=doc_exclude_types, @@ -1338,7 +1336,7 @@ def descrpt_se_a_ebd_v2_args(): @descrpt_args_plugin.register("se_a_mask", doc=doc_only_tf_supported) def descrpt_se_a_mask_args(): doc_sel = 'This parameter sets the number of selected neighbors for each type of atom. It can be:\n\n\ - - `List[int]`. The length of the list should be the same as the number of atom types in the system. `sel[i]` gives the selected number of type-i neighbors. `sel[i]` is recommended to be larger than the maximally possible number of type-i neighbors in the cut-off radius. It is noted that the total sel value must be less than 4096 in a GPU environment.\n\n\ + - `list[int]`. The length of the list should be the same as the number of atom types in the system. `sel[i]` gives the selected number of type-i neighbors. `sel[i]` is recommended to be larger than the maximally possible number of type-i neighbors in the cut-off radius. It is noted that the total sel value must be less than 4096 in a GPU environment.\n\n\ - `str`. Can be "auto:factor" or "auto". "factor" is a float number larger than 1. This option will automatically determine the `sel`. In detail it counts the maximal number of neighbors with in the cutoff radius for each type of neighbor, then multiply the maximum by the "factor". Finally the number is wraped up to 4 divisible. The option "auto" is equivalent to "auto:1.1".' doc_neuron = "Number of neurons in each hidden layers of the embedding net. When two layers are of the same size or one layer is twice as large as the previous layer, a skip connection is built." 
@@ -1352,9 +1350,9 @@ def descrpt_se_a_mask_args(): doc_seed = "Random seed for parameter initialization" return [ - Argument("sel", [List[int], str], optional=True, default="auto", doc=doc_sel), + Argument("sel", [list[int], str], optional=True, default="auto", doc=doc_sel), Argument( - "neuron", List[int], optional=True, default=[10, 20, 40], doc=doc_neuron + "neuron", list[int], optional=True, default=[10, 20, 40], doc=doc_neuron ), Argument( "axis_neuron", @@ -1377,7 +1375,7 @@ def descrpt_se_a_mask_args(): ), Argument( "exclude_types", - List[List[int]], + list[list[int]], optional=True, default=[], doc=doc_exclude_types, @@ -1451,7 +1449,7 @@ def fitting_ener(): Argument("numb_aparam", int, optional=True, default=0, doc=doc_numb_aparam), Argument( "neuron", - List[int], + list[int], optional=True, default=[120, 120, 120], alias=["n_neuron"], @@ -1468,7 +1466,7 @@ def fitting_ener(): Argument("resnet_dt", bool, optional=True, default=True, doc=doc_resnet_dt), Argument( "trainable", - [List[bool], bool], + [list[bool], bool], optional=True, default=True, doc=doc_trainable, @@ -1479,12 +1477,12 @@ def fitting_ener(): Argument("seed", [int, None], optional=True, doc=doc_seed), Argument( "atom_ener", - List[Optional[float]], + list[Optional[float]], optional=True, default=[], doc=doc_atom_ener, ), - Argument("layer_name", List[str], optional=True, doc=doc_layer_name), + Argument("layer_name", list[str], optional=True, doc=doc_layer_name), Argument( "use_aparam_as_mask", bool, @@ -1516,7 +1514,7 @@ def fitting_dos(): Argument("numb_fparam", int, optional=True, default=0, doc=doc_numb_fparam), Argument("numb_aparam", int, optional=True, default=0, doc=doc_numb_aparam), Argument( - "neuron", List[int], optional=True, default=[120, 120, 120], doc=doc_neuron + "neuron", list[int], optional=True, default=[120, 120, 120], doc=doc_neuron ), Argument( "activation_function", @@ -1529,7 +1527,7 @@ def fitting_dos(): Argument("resnet_dt", bool, optional=True, default=True, doc=doc_resnet_dt), Argument( "trainable", - [List[bool], bool], + [list[bool], bool], optional=True, default=True, doc=doc_trainable, @@ -1559,7 +1557,7 @@ def fitting_property(): Argument("numb_aparam", int, optional=True, default=0, doc=doc_numb_aparam), Argument( "neuron", - List[int], + list[int], optional=True, default=[120, 120, 120], alias=["n_neuron"], @@ -1601,7 +1599,7 @@ def fitting_polar(): return [ Argument( "neuron", - List[int], + list[int], optional=True, default=[120, 120, 120], alias=["n_neuron"], @@ -1618,13 +1616,13 @@ def fitting_polar(): Argument("precision", str, optional=True, default="default", doc=doc_precision), Argument("fit_diag", bool, optional=True, default=True, doc=doc_fit_diag), Argument( - "scale", [List[float], float], optional=True, default=1.0, doc=doc_scale + "scale", [list[float], float], optional=True, default=1.0, doc=doc_scale ), # Argument("diag_shift", [list,float], optional = True, default = 0.0, doc = doc_diag_shift), Argument("shift_diag", bool, optional=True, default=True, doc=doc_shift_diag), Argument( "sel_type", - [List[int], int, None], + [list[int], int, None], optional=True, alias=["pol_type"], doc=doc_sel_type + doc_only_tf_supported, @@ -1648,7 +1646,7 @@ def fitting_dipole(): return [ Argument( "neuron", - List[int], + list[int], optional=True, default=[120, 120, 120], alias=["n_neuron"], @@ -1665,7 +1663,7 @@ def fitting_dipole(): Argument("precision", str, optional=True, default="default", doc=doc_precision), Argument( "sel_type", - [List[int], int, None], + [list[int], 
int, None], optional=True, alias=["dipole_type"], doc=doc_sel_type + doc_only_tf_supported, @@ -1702,9 +1700,9 @@ def modifier_dipole_charge(): return [ Argument("model_name", str, optional=False, doc=doc_model_name), Argument( - "model_charge_map", List[float], optional=False, doc=doc_model_charge_map + "model_charge_map", list[float], optional=False, doc=doc_model_charge_map ), - Argument("sys_charge_map", List[float], optional=False, doc=doc_sys_charge_map), + Argument("sys_charge_map", list[float], optional=False, doc=doc_sys_charge_map), Argument("ewald_beta", float, optional=True, default=0.4, doc=doc_ewald_beta), Argument("ewald_h", float, optional=True, default=1.0, doc=doc_ewald_h), ] @@ -1733,7 +1731,7 @@ def model_compression(): return [ Argument("model_file", str, optional=False, doc=doc_model_file), - Argument("table_config", List[float], optional=False, doc=doc_table_config), + Argument("table_config", list[float], optional=False, doc=doc_table_config), Argument("min_nbor_dist", float, optional=False, doc=doc_min_nbor_dist), ] @@ -1785,7 +1783,7 @@ def model_args(exclude_hybrid=False): "model", dict, [ - Argument("type_map", List[str], optional=True, doc=doc_type_map), + Argument("type_map", list[str], optional=True, doc=doc_type_map), Argument( "data_stat_nbatch", int, @@ -1837,7 +1835,7 @@ def model_args(exclude_hybrid=False): ), Argument( "preset_out_bias", - Dict[str, List[Optional[Union[float, List[float]]]]], + dict[str, list[Optional[Union[float, list[float]]]]], optional=True, default=None, doc=doc_only_pt_supported + doc_preset_out_bias, @@ -1960,7 +1958,7 @@ def pairtab_model_args() -> Argument: doc_rcut = "The cut-off radius." doc_sel = 'This parameter set the number of selected neighbors. Note that this parameter is a little different from that in other descriptors. Instead of separating each type of atoms, only the summation matters. And this number is highly related with the efficiency, thus one should not make it too large. Usually 200 or less is enough, far away from the GPU limitation 4096. It can be:\n\n\ - `int`. The maximum number of neighbor atoms to be considered. We recommend it to be less than 200. \n\n\ - - `List[int]`. The length of the list should be the same as the number of atom types in the system. `sel[i]` gives the selected number of type-i neighbors. Only the summation of `sel[i]` matters, and it is recommended to be less than 200.\ + - `list[int]`. The length of the list should be the same as the number of atom types in the system. `sel[i]` gives the selected number of type-i neighbors. Only the summation of `sel[i]` matters, and it is recommended to be less than 200.\ - `str`. Can be "auto:factor" or "auto". "factor" is a float number larger than 1. This option will automatically determine the `sel`. In detail it counts the maximal number of neighbors with in the cutoff radius for each type of neighbor, then multiply the maximum by the "factor". Finally the number is wraped up to 4 divisible. The option "auto" is equivalent to "auto:1.1".' ca = Argument( "pairtab", @@ -1968,7 +1966,7 @@ def pairtab_model_args() -> Argument: [ Argument("tab_file", str, optional=False, doc=doc_tab_file), Argument("rcut", float, optional=False, doc=doc_rcut), - Argument("sel", [int, List[int], str], optional=False, doc=doc_sel), + Argument("sel", [int, list[int], str], optional=False, doc=doc_sel), ], doc=doc_only_tf_supported + "Pairwise tabulation energy model.", ) @@ -2494,11 +2492,11 @@ def training_data_args(): # ! 
added by Ziyao: new specification style for data
     args = [
         Argument(
-            "systems", [List[str], str], optional=False, default=".", doc=doc_systems
+            "systems", [list[str], str], optional=False, default=".", doc=doc_systems
         ),
         Argument(
             "batch_size",
-            [List[int], int, str],
+            [list[int], int, str],
             optional=True,
             default="auto",
             doc=doc_batch_size,
         ),
@@ -2515,7 +2513,7 @@ def training_data_args():  # ! added by Ziyao: new specification style for data
         ),
         Argument(
             "sys_probs",
-            List[float],
+            list[float],
             optional=True,
             default=None,
             doc=doc_sys_probs,
@@ -2560,11 +2558,11 @@ def validation_data_args():  # ! added by Ziyao: new specification style for dat
     args = [
         Argument(
-            "systems", [List[str], str], optional=False, default=".", doc=doc_systems
+            "systems", [list[str], str], optional=False, default=".", doc=doc_systems
         ),
         Argument(
             "batch_size",
-            [List[int], int, str],
+            [list[int], int, str],
             optional=True,
             default="auto",
             doc=doc_batch_size,
         ),
@@ -2581,7 +2579,7 @@ def validation_data_args():  # ! added by Ziyao: new specification style for dat
         ),
         Argument(
             "sys_probs",
-            List[float],
+            list[float],
             optional=True,
             default=None,
             doc=doc_sys_probs,
@@ -2877,7 +2875,7 @@ def gen_json(multi_task: bool = False, **kwargs) -> str:
     )


-def gen_args(multi_task: bool = False) -> List[Argument]:
+def gen_args(multi_task: bool = False) -> list[Argument]:
     if not multi_task:
         return [
             model_args(),
diff --git a/deepmd/utils/batch_size.py b/deepmd/utils/batch_size.py
index 8fe67ad6fc..0394993854 100644
--- a/deepmd/utils/batch_size.py
+++ b/deepmd/utils/batch_size.py
@@ -7,7 +7,6 @@
 )
 from typing import (
     Callable,
-    Tuple,
 )

 import array_api_compat
@@ -81,7 +80,7 @@ def __init__(self, initial_batch_size: int = 1024, factor: float = 2.0) -> None:

     def execute(
         self, callable: Callable, start_index: int, natoms: int
-    ) -> Tuple[int, tuple]:
+    ) -> tuple[int, tuple]:
         """Execute a method with given batch size.

         Parameters
@@ -153,7 +152,7 @@ def _adjust_batch_size(self, factor: float):

     def execute_all(
         self, callable: Callable, total_size: int, natoms: int, *args, **kwargs
-    ) -> Tuple[np.ndarray]:
+    ) -> tuple[np.ndarray]:
         """Execute a method with all given data.

         This method is compatible with Array API.
@@ -174,7 +173,7 @@ def execute_all(

         def execute_with_batch_size(
             batch_size: int, start_index: int
-        ) -> Tuple[int, Tuple[np.ndarray]]:
+        ) -> tuple[int, tuple[np.ndarray]]:
             end_index = start_index + batch_size
             end_index = min(end_index, total_size)
             return (end_index - start_index), callable(
diff --git a/deepmd/utils/compat.py b/deepmd/utils/compat.py
index edd01b8291..83cbe46fad 100644
--- a/deepmd/utils/compat.py
+++ b/deepmd/utils/compat.py
@@ -3,14 +3,15 @@
 import json
 import warnings
+from collections.abc import (
+    Sequence,
+)
 from pathlib import (
     Path,
 )
 from typing import (
     Any,
-    Dict,
     Optional,
-    Sequence,
     Union,
 )
@@ -22,13 +23,13 @@


 def convert_input_v0_v1(
-    jdata: Dict[str, Any], warning: bool = True, dump: Optional[Union[str, Path]] = None
-) -> Dict[str, Any]:
+    jdata: dict[str, Any], warning: bool = True, dump: Optional[Union[str, Path]] = None
+) -> dict[str, Any]:
     """Convert input from v0 format to v1.
Parameters ---------- - jdata : Dict[str, Any] + jdata : dict[str, Any] loaded json/yaml file warning : bool, optional whether to show deprecation warning, by default True @@ -37,7 +38,7 @@ def convert_input_v0_v1( Returns ------- - Dict[str, Any] + dict[str, Any] converted output """ output = {} @@ -63,19 +64,19 @@ def _warning_input_v0_v1(fname: Optional[Union[str, Path]]): warnings.warn(msg) -def _model(jdata: Dict[str, Any], smooth: bool) -> Dict[str, Dict[str, Any]]: +def _model(jdata: dict[str, Any], smooth: bool) -> dict[str, dict[str, Any]]: """Convert data to v1 input for non-smooth model. Parameters ---------- - jdata : Dict[str, Any] + jdata : dict[str, Any] parsed input json/yaml data smooth : bool whether to use smooth or non-smooth descriptor version Returns ------- - Dict[str, Dict[str, Any]] + dict[str, dict[str, Any]] dictionary with model input parameters and sub-dictionaries for descriptor and fitting net """ @@ -87,17 +88,17 @@ def _model(jdata: Dict[str, Any], smooth: bool) -> Dict[str, Dict[str, Any]]: return model -def _nonsmth_descriptor(jdata: Dict[str, Any]) -> Dict[str, Any]: +def _nonsmth_descriptor(jdata: dict[str, Any]) -> dict[str, Any]: """Convert data to v1 input for non-smooth descriptor. Parameters ---------- - jdata : Dict[str, Any] + jdata : dict[str, Any] parsed input json/yaml data Returns ------- - Dict[str, Any] + dict[str, Any] dict with descriptor parameters """ descriptor = {} @@ -106,17 +107,17 @@ def _nonsmth_descriptor(jdata: Dict[str, Any]) -> Dict[str, Any]: return descriptor -def _smth_descriptor(jdata: Dict[str, Any]) -> Dict[str, Any]: +def _smth_descriptor(jdata: dict[str, Any]) -> dict[str, Any]: """Convert data to v1 input for smooth descriptor. Parameters ---------- - jdata : Dict[str, Any] + jdata : dict[str, Any] parsed input json/yaml data Returns ------- - Dict[str, Any] + dict[str, Any] dict with descriptor parameters """ descriptor = {} @@ -136,17 +137,17 @@ def _smth_descriptor(jdata: Dict[str, Any]) -> Dict[str, Any]: return descriptor -def _fitting_net(jdata: Dict[str, Any]) -> Dict[str, Any]: +def _fitting_net(jdata: dict[str, Any]) -> dict[str, Any]: """Convert data to v1 input for fitting net. Parameters ---------- - jdata : Dict[str, Any] + jdata : dict[str, Any] parsed input json/yaml data Returns ------- - Dict[str, Any] + dict[str, Any] dict with fitting net parameters """ fitting_net = {} @@ -163,17 +164,17 @@ def _fitting_net(jdata: Dict[str, Any]) -> Dict[str, Any]: return fitting_net -def _learning_rate(jdata: Dict[str, Any]) -> Dict[str, Any]: +def _learning_rate(jdata: dict[str, Any]) -> dict[str, Any]: """Convert data to v1 input for learning rate section. Parameters ---------- - jdata : Dict[str, Any] + jdata : dict[str, Any] parsed input json/yaml data Returns ------- - Dict[str, Any] + dict[str, Any] dict with learning rate parameters """ learning_rate = {} @@ -182,20 +183,20 @@ def _learning_rate(jdata: Dict[str, Any]) -> Dict[str, Any]: return learning_rate -def _loss(jdata: Dict[str, Any]) -> Dict[str, Any]: +def _loss(jdata: dict[str, Any]) -> dict[str, Any]: """Convert data to v1 input for loss function. 
Parameters ---------- - jdata : Dict[str, Any] + jdata : dict[str, Any] parsed input json/yaml data Returns ------- - Dict[str, Any] + dict[str, Any] dict with loss function parameters """ - loss: Dict[str, Any] = {} + loss: dict[str, Any] = {} _jcopy( jdata, loss, @@ -215,17 +216,17 @@ def _loss(jdata: Dict[str, Any]) -> Dict[str, Any]: return loss -def _training(jdata: Dict[str, Any]) -> Dict[str, Any]: +def _training(jdata: dict[str, Any]) -> dict[str, Any]: """Convert data to v1 input for training. Parameters ---------- - jdata : Dict[str, Any] + jdata : dict[str, Any] parsed input json/yaml data Returns ------- - Dict[str, Any] + dict[str, Any] dict with training parameters """ training = {} @@ -250,14 +251,14 @@ def _training(jdata: Dict[str, Any]) -> Dict[str, Any]: return training -def _jcopy(src: Dict[str, Any], dst: Dict[str, Any], keys: Sequence[str]): +def _jcopy(src: dict[str, Any], dst: dict[str, Any], keys: Sequence[str]): """Copy specified keys from one dict to another. Parameters ---------- - src : Dict[str, Any] + src : dict[str, Any] source dictionary - dst : Dict[str, Any] + dst : dict[str, Any] destination dictionary, will be modified in place keys : Sequence[str] list of keys to copy @@ -267,12 +268,12 @@ def _jcopy(src: Dict[str, Any], dst: Dict[str, Any], keys: Sequence[str]): dst[k] = src[k] -def remove_decay_rate(jdata: Dict[str, Any]): +def remove_decay_rate(jdata: dict[str, Any]): """Convert decay_rate to stop_lr. Parameters ---------- - jdata : Dict[str, Any] + jdata : dict[str, Any] input data """ lr = jdata["learning_rate"] @@ -287,8 +288,8 @@ def remove_decay_rate(jdata: Dict[str, Any]): def convert_input_v1_v2( - jdata: Dict[str, Any], warning: bool = True, dump: Optional[Union[str, Path]] = None -) -> Dict[str, Any]: + jdata: dict[str, Any], warning: bool = True, dump: Optional[Union[str, Path]] = None +) -> dict[str, Any]: tr_cfg = jdata["training"] tr_data_keys = { "systems", @@ -334,15 +335,15 @@ def _warning_input_v1_v2(fname: Optional[Union[str, Path]]): def deprecate_numb_test( - jdata: Dict[str, Any], warning: bool = True, dump: Optional[Union[str, Path]] = None -) -> Dict[str, Any]: + jdata: dict[str, Any], warning: bool = True, dump: Optional[Union[str, Path]] = None +) -> dict[str, Any]: """Deprecate `numb_test` since v2.1. It has taken no effect since v2.0. See `#1243 `_. 
Parameters ---------- - jdata : Dict[str, Any] + jdata : dict[str, Any] loaded json/yaml file warning : bool, optional whether to show deprecation warning, by default True @@ -351,7 +352,7 @@ def deprecate_numb_test( Returns ------- - Dict[str, Any] + dict[str, Any] converted output """ try: @@ -372,8 +373,8 @@ def deprecate_numb_test( def update_deepmd_input( - jdata: Dict[str, Any], warning: bool = True, dump: Optional[Union[str, Path]] = None -) -> Dict[str, Any]: + jdata: dict[str, Any], warning: bool = True, dump: Optional[Union[str, Path]] = None +) -> dict[str, Any]: def is_deepmd_v0_input(jdata): return "model" not in jdata.keys() diff --git a/deepmd/utils/data.py b/deepmd/utils/data.py index 5d324afb95..4c77bcf59a 100644 --- a/deepmd/utils/data.py +++ b/deepmd/utils/data.py @@ -4,7 +4,6 @@ import bisect import logging from typing import ( - List, Optional, ) @@ -53,7 +52,7 @@ def __init__( sys_path: str, set_prefix: str = "set", shuffle_test: bool = True, - type_map: Optional[List[str]] = None, + type_map: Optional[list[str]] = None, optional_type_map: bool = True, modifier=None, trn_all_set: bool = False, @@ -134,7 +133,7 @@ def add( atomic: bool = False, must: bool = False, high_prec: bool = False, - type_sel: Optional[List[int]] = None, + type_sel: Optional[list[int]] = None, repeat: int = 1, default: float = 0.0, dtype: Optional[np.dtype] = None, @@ -304,11 +303,11 @@ def get_ntypes(self) -> int: else: return max(self.get_atom_type()) + 1 - def get_type_map(self) -> List[str]: + def get_type_map(self) -> list[str]: """Get the type map.""" return self.type_map - def get_atom_type(self) -> List[int]: + def get_atom_type(self) -> list[int]: """Get atom types.""" return self.atom_type @@ -738,7 +737,7 @@ def __init__( atomic: bool = False, must: bool = False, high_prec: bool = False, - type_sel: Optional[List[int]] = None, + type_sel: Optional[list[int]] = None, repeat: int = 1, default: float = 0.0, dtype: Optional[np.dtype] = None, diff --git a/deepmd/utils/data_system.py b/deepmd/utils/data_system.py index 235930527b..7bec0b16f4 100644 --- a/deepmd/utils/data_system.py +++ b/deepmd/utils/data_system.py @@ -3,12 +3,10 @@ import logging import warnings from functools import ( - lru_cache, + cached_property, ) from typing import ( Any, - Dict, - List, Optional, Union, ) @@ -45,13 +43,13 @@ class DeepmdDataSystem: def __init__( self, - systems: List[str], + systems: list[str], batch_size: int, test_size: int, rcut: Optional[float] = None, set_prefix: str = "set", shuffle_test: bool = True, - type_map: Optional[List[str]] = None, + type_map: Optional[list[str]] = None, optional_type_map: bool = True, modifier=None, trn_all_set=False, @@ -240,9 +238,8 @@ def _load_test(self, ntests=-1): for nn in test_system_data: self.test_data[nn].append(test_system_data[nn]) - @property - @lru_cache(maxsize=None) - def default_mesh(self) -> List[np.ndarray]: + @cached_property + def default_mesh(self) -> list[np.ndarray]: """Mesh for each system.""" return [ make_default_mesh( @@ -266,7 +263,7 @@ def compute_energy_shift(self, rcond=None, key="energy"): ) return energy_shift.ravel() - def add_dict(self, adict: Dict[str, Dict[str, Any]]) -> None: + def add_dict(self, adict: dict[str, dict[str, Any]]) -> None: """Add items to the data system by a `dict`. `adict` should have items like .. code-block:: python. 
@@ -299,7 +296,7 @@ def add_dict(self, adict: Dict[str, Dict[str, Any]]) -> None: ) def add_data_requirements( - self, data_requirements: List[DataRequirementItem] + self, data_requirements: list[DataRequirementItem] ) -> None: """Add items to the data system by a list of `DataRequirementItem`.""" self.add_dict({rr.key: rr.dict for rr in data_requirements}) @@ -311,7 +308,7 @@ def add( atomic: bool = False, must: bool = False, high_prec: bool = False, - type_sel: Optional[List[int]] = None, + type_sel: Optional[list[int]] = None, repeat: int = 1, default: float = 0.0, dtype: Optional[np.dtype] = None, @@ -468,7 +465,7 @@ def get_batch_mixed(self) -> dict: b_data = self._merge_batch_data(batch_data) return b_data - def _merge_batch_data(self, batch_data: List[dict]) -> dict: + def _merge_batch_data(self, batch_data: list[dict]) -> dict: """Merge batch data from different systems. Parameters @@ -550,7 +547,7 @@ def get_sys_ntest(self, sys_idx=None): else: return self.test_size[self.pick_idx] - def get_type_map(self) -> List[str]: + def get_type_map(self) -> list[str]: """Get the type map.""" return self.type_map @@ -635,12 +632,12 @@ def _format_name_length(name, width): def print_summary( name: str, nsystems: int, - system_dirs: List[str], - natoms: List[int], - batch_size: List[int], - nbatches: List[int], - sys_probs: List[float], - pbc: List[bool], + system_dirs: list[str], + natoms: list[int], + batch_size: list[int], + nbatches: list[int], + sys_probs: list[float], + pbc: list[bool], ): """Print summary of systems. @@ -732,7 +729,7 @@ def prob_sys_size_ext(keywords, nsystems, nbatch): return sys_probs -def process_systems(systems: Union[str, List[str]]) -> List[str]: +def process_systems(systems: Union[str, list[str]]) -> list[str]: """Process the user-input systems. If it is a single directory, search for all the systems in the directory. @@ -773,7 +770,7 @@ def process_systems(systems: Union[str, List[str]]) -> List[str]: def get_data( - jdata: Dict[str, Any], rcut, type_map, modifier, multi_task_mode=False + jdata: dict[str, Any], rcut, type_map, modifier, multi_task_mode=False ) -> DeepmdDataSystem: """Get the data system. 
diff --git a/deepmd/utils/econf_embd.py b/deepmd/utils/econf_embd.py index 7f12206ae3..99c7edf284 100644 --- a/deepmd/utils/econf_embd.py +++ b/deepmd/utils/econf_embd.py @@ -1,8 +1,4 @@ # SPDX-License-Identifier: LGPL-3.0-or-later -from typing import ( - Dict, - List, -) import numpy as np from mendeleev import ( @@ -228,8 +224,8 @@ def make_element_embedding_list_vec( def make_econf_embedding( - types: List[str], flatten: bool = True -) -> Dict[str, np.ndarray]: + types: list[str], flatten: bool = True +) -> dict[str, np.ndarray]: """Make the electronic configuration embedding.""" all_ret = {} for ii in types: @@ -240,7 +236,7 @@ def make_econf_embedding( return all_ret -def transform_to_spin_rep(res: Dict[str, np.ndarray]) -> Dict[str, np.ndarray]: +def transform_to_spin_rep(res: dict[str, np.ndarray]) -> dict[str, np.ndarray]: """Tranform electron occupation of 0/1/2 to -1,-1/-1,1/1,1.""" ret = {} @@ -261,7 +257,7 @@ def transform(ii): return ret -def print_econf_embedding(res: Dict[str, np.ndarray]): +def print_econf_embedding(res: dict[str, np.ndarray]): """Print electron configuration embedding.""" for kk, vv in res.items(): vvstr = ",".join([str(ii) for ii in vv]) diff --git a/deepmd/utils/env_mat_stat.py b/deepmd/utils/env_mat_stat.py index bbb43fd703..ecc0b7b62f 100644 --- a/deepmd/utils/env_mat_stat.py +++ b/deepmd/utils/env_mat_stat.py @@ -7,10 +7,10 @@ from collections import ( defaultdict, ) -from typing import ( - Dict, +from collections.abc import ( Iterator, - List, +) +from typing import ( Optional, ) @@ -98,12 +98,12 @@ def __init__(self) -> None: super().__init__() self.stats = defaultdict(StatItem) - def compute_stats(self, data: List[Dict[str, np.ndarray]]) -> None: + def compute_stats(self, data: list[dict[str, np.ndarray]]) -> None: """Compute the statistics of the environment matrix. Parameters ---------- - data : List[Dict[str, np.ndarray]] + data : list[dict[str, np.ndarray]] The environment matrix. """ if len(self.stats) > 0: @@ -113,17 +113,17 @@ def compute_stats(self, data: List[Dict[str, np.ndarray]]) -> None: self.stats[kk] += iter_stats[kk] @abstractmethod - def iter(self, data: List[Dict[str, np.ndarray]]) -> Iterator[Dict[str, StatItem]]: + def iter(self, data: list[dict[str, np.ndarray]]) -> Iterator[dict[str, StatItem]]: """Get the iterator of the environment matrix. Parameters ---------- - data : List[Dict[str, np.ndarray]] + data : list[dict[str, np.ndarray]] The environment matrix. Yields ------ - Dict[str, StatItem] + dict[str, StatItem] The statistics of the environment matrix. """ @@ -160,7 +160,7 @@ def load_stats(self, path: DPPath) -> None: ) def load_or_compute_stats( - self, data: List[Dict[str, np.ndarray]], path: Optional[DPPath] = None + self, data: list[dict[str, np.ndarray]], path: Optional[DPPath] = None ) -> None: """Load the statistics of the environment matrix if it exists, otherwise compute and save it. @@ -168,7 +168,7 @@ def load_or_compute_stats( ---------- path : DPPath The path to load the statistics of the environment matrix. - data : List[Dict[str, np.ndarray]] + data : list[dict[str, np.ndarray]] The environment matrix. """ if path is not None and path.is_dir(): @@ -180,7 +180,7 @@ def load_or_compute_stats( self.save_stats(path) log.info(f"Save stats to {path}.") - def get_avg(self, default: float = 0) -> Dict[str, float]: + def get_avg(self, default: float = 0) -> dict[str, float]: """Get the average of the environment matrix. 
Parameters @@ -190,14 +190,14 @@ def get_avg(self, default: float = 0) -> Dict[str, float]: Returns ------- - Dict[str, float] + dict[str, float] The average of the environment matrix. """ return {kk: vv.compute_avg(default=default) for kk, vv in self.stats.items()} def get_std( self, default: float = 1e-1, protection: float = 1e-2 - ) -> Dict[str, float]: + ) -> dict[str, float]: """Get the standard deviation of the environment matrix. Parameters @@ -209,7 +209,7 @@ def get_std( Returns ------- - Dict[str, float] + dict[str, float] The standard deviation of the environment matrix. """ return { diff --git a/deepmd/utils/finetune.py b/deepmd/utils/finetune.py index 9baa1b5aa8..d8d035a853 100644 --- a/deepmd/utils/finetune.py +++ b/deepmd/utils/finetune.py @@ -1,9 +1,5 @@ # SPDX-License-Identifier: LGPL-3.0-or-later import logging -from typing import ( - List, - Tuple, -) log = logging.getLogger(__name__) @@ -11,8 +7,8 @@ class FinetuneRuleItem: def __init__( self, - p_type_map: List[str], - type_map: List[str], + p_type_map: list[str], + type_map: list[str], model_branch: str = "Default", random_fitting: bool = False, resuming: bool = False, @@ -74,21 +70,21 @@ def get_finetune_tmap(self): def get_index_between_two_maps( - old_map: List[str], - new_map: List[str], + old_map: list[str], + new_map: list[str], ): """Returns the mapping index of types in new_map to those in the old_map. Parameters ---------- - old_map : List[str] + old_map : list[str] The old list of atom type names. - new_map : List[str] + new_map : list[str] The new list of atom type names. Returns ------- - index_map: List[int] + index_map: list[int] List contains `len(new_map)` indices, where `index_map[i]` is the index of `new_map[i]` in `old_map`. If `new_map[i]` is not in the `old_map`, the index will be `i - len(new_map)`. has_new_type: bool @@ -112,21 +108,21 @@ def get_index_between_two_maps( def map_atom_exclude_types( - atom_exclude_types: List[int], - remap_index: List[int], + atom_exclude_types: list[int], + remap_index: list[int], ): """Return the remapped atom_exclude_types according to remap_index. Parameters ---------- - atom_exclude_types : List[int] + atom_exclude_types : list[int] Exclude the atomic contribution of the given types. - remap_index : List[int] + remap_index : list[int] The indices in the old type list that correspond to the types in the new type list. Returns ------- - remapped_atom_exclude_types: List[int] + remapped_atom_exclude_types: list[int] Remapped atom_exclude_types that only keeps the types in the new type list. """ @@ -137,22 +133,22 @@ def map_atom_exclude_types( def map_pair_exclude_types( - pair_exclude_types: List[Tuple[int, int]], - remap_index: List[int], + pair_exclude_types: list[tuple[int, int]], + remap_index: list[int], ): """Return the remapped atom_exclude_types according to remap_index. Parameters ---------- - pair_exclude_types : List[Tuple[int, int]] + pair_exclude_types : list[tuple[int, int]] Exclude the pair of atoms of the given types from computing the output of the atomic model. - remap_index : List[int] + remap_index : list[int] The indices in the old type list that correspond to the types in the new type list. Returns ------- - remapped_pair_exclude_typess: List[Tuple[int, int]] + remapped_pair_exclude_typess: list[tuple[int, int]] Remapped pair_exclude_types that only keeps the types in the new type list. 
""" diff --git a/deepmd/utils/hostlist.py b/deepmd/utils/hostlist.py index c184b04031..4dac08af19 100644 --- a/deepmd/utils/hostlist.py +++ b/deepmd/utils/hostlist.py @@ -1,12 +1,8 @@ # SPDX-License-Identifier: LGPL-3.0-or-later import socket -from typing import ( - List, - Tuple, -) -def get_host_names() -> Tuple[str, List[str]]: +def get_host_names() -> tuple[str, list[str]]: """Get host names of all nodes in the cluster. If mpi4py is not installed or MPI is not used, then the @@ -16,7 +12,7 @@ def get_host_names() -> Tuple[str, List[str]]: ------- str Host name of the current node - List[str] + list[str] List of host names of all nodes in the cluster """ host_name = socket.gethostname() diff --git a/deepmd/utils/neighbor_stat.py b/deepmd/utils/neighbor_stat.py index 54a4c16b24..40e629d9db 100644 --- a/deepmd/utils/neighbor_stat.py +++ b/deepmd/utils/neighbor_stat.py @@ -5,9 +5,8 @@ ABC, abstractmethod, ) -from typing import ( +from collections.abc import ( Iterator, - Tuple, ) import numpy as np @@ -46,7 +45,7 @@ def __init__( self.ntypes = ntypes self.mixed_type = mixed_type - def get_stat(self, data: DeepmdDataSystem) -> Tuple[float, np.ndarray]: + def get_stat(self, data: DeepmdDataSystem) -> tuple[float, np.ndarray]: """Get the data statistics of the training data, including nearest nbor distance between atoms, max nbor size of atoms. Parameters @@ -89,7 +88,7 @@ def get_stat(self, data: DeepmdDataSystem) -> Tuple[float, np.ndarray]: @abstractmethod def iterator( self, data: DeepmdDataSystem - ) -> Iterator[Tuple[np.ndarray, float, str]]: + ) -> Iterator[tuple[np.ndarray, float, str]]: """Abstract method for producing data. Yields diff --git a/deepmd/utils/out_stat.py b/deepmd/utils/out_stat.py index fd09e6815b..43af191e62 100644 --- a/deepmd/utils/out_stat.py +++ b/deepmd/utils/out_stat.py @@ -3,7 +3,6 @@ from typing import ( Optional, - Tuple, ) import numpy as np @@ -14,7 +13,7 @@ def compute_stats_from_redu( natoms: np.ndarray, assigned_bias: Optional[np.ndarray] = None, rcond: Optional[float] = None, -) -> Tuple[np.ndarray, np.ndarray]: +) -> tuple[np.ndarray, np.ndarray]: """Compute the output statistics. Given the reduced output value and the number of atoms for each atom, @@ -86,7 +85,7 @@ def compute_stats_from_redu( def compute_stats_from_atomic( output: np.ndarray, atype: np.ndarray, -) -> Tuple[np.ndarray, np.ndarray]: +) -> tuple[np.ndarray, np.ndarray]: """Compute the output statistics. Given the output value and the type of atoms, diff --git a/deepmd/utils/pair_tab.py b/deepmd/utils/pair_tab.py index 73980a2fd6..cddc358f27 100644 --- a/deepmd/utils/pair_tab.py +++ b/deepmd/utils/pair_tab.py @@ -4,7 +4,6 @@ import logging from typing import ( Optional, - Tuple, ) import numpy as np @@ -199,7 +198,7 @@ def _check_table_upper_boundary(self) -> None: self.vdata = np.concatenate((self.vdata, pad_extrapolation), axis=0) - def get(self) -> Tuple[np.array, np.array]: + def get(self) -> tuple[np.array, np.array]: """Get the serialized table.""" return self.tab_info, self.tab_data diff --git a/deepmd/utils/path.py b/deepmd/utils/path.py index e794a36cab..6c52caac1d 100644 --- a/deepmd/utils/path.py +++ b/deepmd/utils/path.py @@ -13,8 +13,6 @@ ) from typing import ( ClassVar, - Dict, - List, Optional, ) @@ -77,7 +75,7 @@ def save_numpy(self, arr: np.ndarray) -> None: """ @abstractmethod - def glob(self, pattern: str) -> List["DPPath"]: + def glob(self, pattern: str) -> list["DPPath"]: """Search path using the glob pattern. 
Parameters @@ -87,12 +85,12 @@ def glob(self, pattern: str) -> List["DPPath"]: Returns ------- - List[DPPath] + list[DPPath] list of paths """ @abstractmethod - def rglob(self, pattern: str) -> List["DPPath"]: + def rglob(self, pattern: str) -> list["DPPath"]: """This is like calling :meth:`DPPath.glob()` with `**/` added in front of the given relative pattern. @@ -103,7 +101,7 @@ def rglob(self, pattern: str) -> List["DPPath"]: Returns ------- - List[DPPath] + list[DPPath] list of paths """ @@ -206,7 +204,7 @@ def save_numpy(self, arr: np.ndarray) -> None: with self.path.open("wb") as f: np.save(f, arr) - def glob(self, pattern: str) -> List["DPPath"]: + def glob(self, pattern: str) -> list["DPPath"]: """Search path using the glob pattern. Parameters @@ -216,13 +214,13 @@ def glob(self, pattern: str) -> List["DPPath"]: Returns ------- - List[DPPath] + list[DPPath] list of paths """ # currently DPOSPath will only derivative DPOSPath return [type(self)(p, mode=self.mode) for p in self.path.glob(pattern)] - def rglob(self, pattern: str) -> List["DPPath"]: + def rglob(self, pattern: str) -> list["DPPath"]: """This is like calling :meth:`DPPath.glob()` with `**/` added in front of the given relative pattern. @@ -233,7 +231,7 @@ def rglob(self, pattern: str) -> List["DPPath"]: Returns ------- - List[DPPath] + list[DPPath] list of paths """ return [type(self)(p, mode=self.mode) for p in self.path.rglob(pattern)] @@ -360,7 +358,7 @@ def save_numpy(self, arr: np.ndarray) -> None: self.root.flush() self._new_keys.append(self._name) - def glob(self, pattern: str) -> List["DPPath"]: + def glob(self, pattern: str) -> list["DPPath"]: """Search path using the glob pattern. Parameters @@ -370,7 +368,7 @@ def glob(self, pattern: str) -> List["DPPath"]: Returns ------- - List[DPPath] + list[DPPath] list of paths """ # got paths starts with current path first, which is faster @@ -384,7 +382,7 @@ def glob(self, pattern: str) -> List["DPPath"]: for pp in globfilter(subpaths, self._connect_path(pattern)) ] - def rglob(self, pattern: str) -> List["DPPath"]: + def rglob(self, pattern: str) -> list["DPPath"]: """This is like calling :meth:`DPPath.glob()` with `**/` added in front of the given relative pattern. @@ -395,17 +393,17 @@ def rglob(self, pattern: str) -> List["DPPath"]: Returns ------- - List[DPPath] + list[DPPath] list of paths """ return self.glob("**" + pattern) @property - def _keys(self) -> List[str]: + def _keys(self) -> list[str]: """Walk all groups and dataset.""" return self._file_keys(self.root) - __file_new_keys: ClassVar[Dict[h5py.File, List[str]]] = {} + __file_new_keys: ClassVar[dict[h5py.File, list[str]]] = {} @property def _new_keys(self): @@ -415,7 +413,7 @@ def _new_keys(self): @classmethod @lru_cache(None) - def _file_keys(cls, file: h5py.File) -> List[str]: + def _file_keys(cls, file: h5py.File) -> list[str]: """Walk all groups and dataset.""" l = [] file.visit(lambda x: l.append("/" + x)) diff --git a/deepmd/utils/plugin.py b/deepmd/utils/plugin.py index b5c89eb4d3..ce8b015ddf 100644 --- a/deepmd/utils/plugin.py +++ b/deepmd/utils/plugin.py @@ -8,9 +8,7 @@ ) from typing import ( Callable, - Dict, Optional, - Type, ) @@ -19,7 +17,7 @@ class Plugin: Attributes ---------- - plugins : Dict[str, object] + plugins : dict[str, object] plugins Examples @@ -99,7 +97,7 @@ class PluginVariant(metaclass=VariantABCMeta): pass -def make_plugin_registry(name: Optional[str] = None) -> Type[object]: +def make_plugin_registry(name: Optional[str] = None) -> type[object]: """Make a plugin registry. 
Parameters @@ -141,7 +139,7 @@ class SomeClass(BaseClass): return PR.__plugins.register(key) @classmethod - def get_class_by_type(cls, class_type: str) -> Type[object]: + def get_class_by_type(cls, class_type: str) -> type[object]: """Get the class by the plugin type.""" if class_type in PR.__plugins.plugins: return PR.__plugins.plugins[class_type] @@ -154,7 +152,7 @@ def get_class_by_type(cls, class_type: str) -> Type[object]: raise RuntimeError(f"Unknown {name} type: {class_type}. {dym_message}") @classmethod - def get_plugins(cls) -> Dict[str, Type[object]]: + def get_plugins(cls) -> dict[str, type[object]]: """Get all the registered plugins.""" return PR.__plugins.plugins diff --git a/deepmd/utils/random.py b/deepmd/utils/random.py index 44ea6a1dac..440faca177 100644 --- a/deepmd/utils/random.py +++ b/deepmd/utils/random.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: LGPL-3.0-or-later from typing import ( Optional, - Tuple, Union, ) @@ -12,7 +11,7 @@ def choice( a: Union[np.ndarray, int], - size: Optional[Union[int, Tuple[int, ...]]] = None, + size: Optional[Union[int, tuple[int, ...]]] = None, replace: bool = True, p: Optional[np.ndarray] = None, ): diff --git a/deepmd/utils/spin.py b/deepmd/utils/spin.py index 101867d3e4..41ea52df88 100644 --- a/deepmd/utils/spin.py +++ b/deepmd/utils/spin.py @@ -1,8 +1,6 @@ # SPDX-License-Identifier: LGPL-3.0-or-later import copy from typing import ( - List, - Tuple, Union, ) @@ -20,10 +18,10 @@ class Spin: Parameters ---------- - use_spin: List[bool] + use_spin: list[bool] A list of boolean values indicating whether to use atomic spin for each atom type. True for spin and False for not. List of bool values with shape of [ntypes]. - virtual_scale: List[float], float + virtual_scale: list[float], float The scaling factor to determine the virtual distance between a virtual atom representing spin and its corresponding real atom for each atom type with spin. This factor is defined as the virtual distance @@ -35,8 +33,8 @@ class Spin: def __init__( self, - use_spin: List[bool], - virtual_scale: Union[List[float], float], + use_spin: list[bool], + virtual_scale: Union[list[float], float], ) -> None: self.ntypes_real = len(use_spin) self.ntypes_spin = use_spin.count(True) @@ -93,7 +91,7 @@ def get_ntypes_input(self) -> int: """Returns the number of double real atom types for input placeholder.""" return self.ntypes_input - def get_use_spin(self) -> List[bool]: + def get_use_spin(self) -> list[bool]: """Returns the list of whether to use spin for each atom type.""" return self.use_spin @@ -127,7 +125,7 @@ def init_atom_exclude_types_placeholder(self) -> None: """ self.atom_exclude_types_p = self.placeholder_type.tolist() - def get_pair_exclude_types(self, exclude_types=None) -> List[Tuple[int, int]]: + def get_pair_exclude_types(self, exclude_types=None) -> list[tuple[int, int]]: """ Return the pair-wise exclusion types for descriptor. The placeholder types for those without spin are excluded. 
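Looking back at `make_plugin_registry` in `deepmd/utils/plugin.py` above, the annotations now read `type[object]` and `dict[str, type[object]]`. A toy sketch of the register/look-up pattern those annotations describe (illustrative names, not the library implementation):

```python
class Registry:
    """Minimal plugin registry annotated with built-in generics."""

    _plugins: dict[str, type[object]] = {}

    @classmethod
    def register(cls, key: str):
        def decorator(klass: type[object]) -> type[object]:
            cls._plugins[key] = klass  # remember the class under its string key
            return klass

        return decorator

    @classmethod
    def get_class_by_type(cls, class_type: str) -> type[object]:
        try:
            return cls._plugins[class_type]
        except KeyError:
            raise RuntimeError(f"Unknown type: {class_type}") from None


@Registry.register("se_e2_a")
class SeE2A:
    pass


assert Registry.get_class_by_type("se_e2_a") is SeE2A
```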
@@ -135,7 +133,7 @@ def get_pair_exclude_types(self, exclude_types=None) -> List[Tuple[int, int]]: if exclude_types is None: return self.pair_exclude_types else: - _exclude_types: List[Tuple[int, int]] = copy.deepcopy( + _exclude_types: list[tuple[int, int]] = copy.deepcopy( self.pair_exclude_types ) for tt in exclude_types: @@ -143,7 +141,7 @@ def get_pair_exclude_types(self, exclude_types=None) -> List[Tuple[int, int]]: _exclude_types.append((tt[0], tt[1])) return _exclude_types - def get_atom_exclude_types(self, exclude_types=None) -> List[int]: + def get_atom_exclude_types(self, exclude_types=None) -> list[int]: """ Return the atom-wise exclusion types for fitting before out_def. Both the placeholder types and spin types are excluded. @@ -151,12 +149,12 @@ def get_atom_exclude_types(self, exclude_types=None) -> List[int]: if exclude_types is None: return self.atom_exclude_types_ps else: - _exclude_types: List[int] = copy.deepcopy(self.atom_exclude_types_ps) + _exclude_types: list[int] = copy.deepcopy(self.atom_exclude_types_ps) _exclude_types += exclude_types _exclude_types = list(set(_exclude_types)) return _exclude_types - def get_atom_exclude_types_placeholder(self, exclude_types=None) -> List[int]: + def get_atom_exclude_types_placeholder(self, exclude_types=None) -> list[int]: """ Return the atom-wise exclusion types for fitting after out_def. The placeholder types for those without spin are excluded. @@ -164,7 +162,7 @@ def get_atom_exclude_types_placeholder(self, exclude_types=None) -> List[int]: if exclude_types is None: return self.atom_exclude_types_p else: - _exclude_types: List[int] = copy.deepcopy(self.atom_exclude_types_p) + _exclude_types: list[int] = copy.deepcopy(self.atom_exclude_types_p) _exclude_types += exclude_types _exclude_types = list(set(_exclude_types)) return _exclude_types diff --git a/deepmd/utils/update_sel.py b/deepmd/utils/update_sel.py index 6feed525e5..ba1457b19c 100644 --- a/deepmd/utils/update_sel.py +++ b/deepmd/utils/update_sel.py @@ -5,10 +5,7 @@ abstractmethod, ) from typing import ( - List, Optional, - Tuple, - Type, Union, ) @@ -28,11 +25,11 @@ class BaseUpdateSel(ABC): def update_one_sel( self, train_data: DeepmdDataSystem, - type_map: Optional[List[str]], + type_map: Optional[list[str]], rcut: float, - sel: Union[int, List[int], str], + sel: Union[int, list[int], str], mixed_type: bool = False, - ) -> Tuple[float, List[int]]: + ) -> tuple[float, list[int]]: min_nbor_dist, tmp_sel = self.get_nbor_stat( train_data, type_map, @@ -86,17 +83,17 @@ def wrap_up_4(self, xx): def get_nbor_stat( self, train_data: DeepmdDataSystem, - type_map: Optional[List[str]], + type_map: Optional[list[str]], rcut: float, mixed_type: bool = False, - ) -> Tuple[float, Union[int, List[int]]]: + ) -> tuple[float, Union[int, list[int]]]: """Get the neighbor statistics of the data. Parameters ---------- train_data : DeepmdDataSystem The training data. - type_map : Optional[List[str]] + type_map : Optional[list[str]] The type map. rcut : float The cutoff radius. @@ -107,7 +104,7 @@ def get_nbor_stat( ------- min_nbor_dist : float The minimum neighbor distance. - max_nbor_size : List[int] + max_nbor_size : list[int] The maximum neighbor size. 
""" if type_map and len(type_map) == 0: @@ -128,7 +125,7 @@ def get_nbor_stat( @property @abstractmethod - def neighbor_stat(self) -> Type[NeighborStat]: + def neighbor_stat(self) -> type[NeighborStat]: pass def get_min_nbor_dist( diff --git a/deepmd/utils/weight_avg.py b/deepmd/utils/weight_avg.py index b344d3bb75..7c75d18e68 100644 --- a/deepmd/utils/weight_avg.py +++ b/deepmd/utils/weight_avg.py @@ -2,21 +2,16 @@ from collections import ( defaultdict, ) -from typing import ( - Dict, - List, - Tuple, -) import numpy as np -def weighted_average(errors: List[Dict[str, Tuple[float, float]]]) -> Dict: +def weighted_average(errors: list[dict[str, tuple[float, float]]]) -> dict: """Compute wighted average of prediction errors (MAE or RMSE) for model. Parameters ---------- - errors : List[Dict[str, Tuple[float, float]]] + errors : list[dict[str, tuple[float, float]]] List: the error of systems Dict: the error of quantities, name given by the key str: the name of the quantity, must starts with 'mae' or 'rmse' diff --git a/doc/development/coding-conventions.rst b/doc/development/coding-conventions.rst index 137b0d0d51..bf186d1231 100644 --- a/doc/development/coding-conventions.rst +++ b/doc/development/coding-conventions.rst @@ -30,7 +30,7 @@ Rules ----- The code must be compatible with the oldest supported version of python -which is 3.8. +which is 3.9. The project follows the generic coding conventions as specified in the `Style Guide for Python Code`_, `Docstring diff --git a/doc/development/create-a-model-pt.md b/doc/development/create-a-model-pt.md index 2ab0dee18f..257dd8a25d 100644 --- a/doc/development/create-a-model-pt.md +++ b/doc/development/create-a-model-pt.md @@ -73,7 +73,7 @@ class SomeDescript(BaseDescriptor, torch.nn.Module): def update_sel( cls, train_data: DeepmdDataSystem, - type_map: Optional[List[str]], + type_map: Optional[list[str]], local_jdata: dict, ): pass @@ -149,7 +149,7 @@ from deepmd.utils.argcheck import descrpt_args_plugin @descrpt_args_plugin.register("some_descrpt") -def descrpt_some_args() -> List[Argument]: +def descrpt_some_args() -> list[Argument]: return [ Argument("arg1", bool, optional=False, doc="balabala"), Argument("arg2", float, optional=True, default=6.0, doc="haha"), diff --git a/doc/development/create-a-model-tf.md b/doc/development/create-a-model-tf.md index 9ab3525bb5..95a2f66f23 100644 --- a/doc/development/create-a-model-tf.md +++ b/doc/development/create-a-model-tf.md @@ -37,7 +37,7 @@ from deepmd.utils.argcheck import descrpt_args_plugin @descrpt_args_plugin.register("some_descrpt") -def descrpt_some_args() -> List[Argument]: +def descrpt_some_args() -> list[Argument]: return [ Argument("arg1", bool, optional=False, doc="balabala"), Argument("arg2", float, optional=True, default=6.0, doc="haha"), diff --git a/doc/getting-started/quick_start.ipynb b/doc/getting-started/quick_start.ipynb index d1c45ad0b8..0c9563b9e9 100644 --- a/doc/getting-started/quick_start.ipynb +++ b/doc/getting-started/quick_start.ipynb @@ -523,7 +523,7 @@ " color: #bbbbff;\n", "}\n", "\n", - "
… [removed cell output omitted — identical to the updated output below except that parameter types were rendered as typing.List[str], typing.List[int], etc., rather than the built-in list[str] / list[int]] …" + "
{
  \"_comment\": \"that's all\",
  \"model\"model:
type: dict
: {
    \"type_map\"type_map:
type: typing.list[str], optional
A list of strings. Gives the name of each atom type. It is noted that the number of atom types in the training system must be less than 128 in a GPU environment. If not given, type.raw in each system should use the same type indexes, and type_map.raw will take no effect.
: [
     \"H\",
     \"C\"
    ],

    \"descriptor\"descriptor:
type: dict
The descriptor of atomic environment.
: {
      \"type\"type:
type: str
The type of the descriptor. See explanation below.
- loc_frame: Defines a local frame at each atom, and then computes the descriptor as local coordinates under this frame.
- se_e2_a: Used by the smooth edition of Deep Potential. The full relative coordinates are used to construct the descriptor.
- se_e2_r: Used by the smooth edition of Deep Potential. Only the distance between atoms is used to construct the descriptor.
- se_e3: Used by the smooth edition of Deep Potential. The full relative coordinates are used to construct the descriptor. Three-body embedding will be used by this descriptor.
- se_a_tpe: Used by the smooth edition of Deep Potential. The full relative coordinates are used to construct the descriptor. Type embedding will be used by this descriptor.
- se_atten: Used by the smooth edition of Deep Potential. The full relative coordinates are used to construct the descriptor. Attention mechanism will be used by this descriptor.
- se_atten_v2: Used by the smooth edition of Deep Potential. The full relative coordinates are used to construct the descriptor. Attention mechanism with new modifications will be used by this descriptor.
- se_a_mask: Used by the smooth edition of Deep Potential. It can accept a variable number of atoms in a frame (Non-PBC system). aparam are required as an indicator matrix for the real/virtual sign of input atoms.
- hybrid: Concatenation of a list of descriptors into a new descriptor.
: \"se_e2_a\",
      \"sel\"sel:
type: str | typing.list[int], optional, default: auto
This parameter sets the number of selected neighbors for each type of atom. It can be:
- list[int]. The length of the list should be the same as the number of atom types in the system. sel[i] gives the selected number of type-i neighbors. sel[i] is recommended to be larger than the maximum possible number of type-i neighbors within the cut-off radius. It is noted that the total sel value must be less than 4096 in a GPU environment.
- str. Can be \"auto:factor\" or \"auto\". \"factor\" is a float larger than 1. This option automatically determines the sel: it counts the maximal number of neighbors within the cutoff radius for each type of neighbor, multiplies the maximum by the \"factor\", and finally wraps the number up to be divisible by 4. The option \"auto\" is equivalent to \"auto:1.1\".
: \"auto\",
      \"rcut_smth\"rcut_smth:
type: float, optional, default: 0.5
Where to start smoothing. For example, the 1/r term is smoothed from rcut to rcut_smth.
: 0.5,
      \"rcut\"rcut:
type: float, optional, default: 6.0
The cut-off radius.
: 6.0,
      \"neuron\"neuron:
type: typing.list[int], optional, default: [10, 20, 40]
Number of neurons in each hidden layer of the embedding net. When two layers are of the same size or one layer is twice as large as the previous layer, a skip connection is built.
: [
       25,
       50,
       100
      ],

      \"resnet_dt\"resnet_dt:
type: bool, optional, default: False
Whether to use a \"Timestep\" in the skip connection
: false,
      \"axis_neuron\"axis_neuron:
type: int, optional, default: 4, alias: n_axis_neuron
Size of the submatrix of G (embedding matrix).
: 16,
      \"seed\"seed:
type: NoneType | int, optional
Random seed for parameter initialization
: 1,
      \"_comment\": \" that's all\"
    },
    \"fitting_net\"fitting_net:
type: dict
The fitting of physical properties.
: {
      \"neuron\"neuron:
type: typing.list[int], optional, default: [120, 120, 120], alias: n_neuron
The number of neurons in each hidden layer of the fitting net. When two hidden layers are of the same size, a skip connection is built.
: [
       240,
       240,
       240
      ],

      \"resnet_dt\"resnet_dt:
type: bool, optional, default: True
Whether to use a \"Timestep\" in the skip connection
: true,
      \"seed\"seed:
type: NoneType | int, optional
Random seed for parameter initialization of the fitting net
: 1,
      \"_comment\": \" that's all\"
    },
    \"_comment\": \" that's all\"
  },
  \"learning_rate\"learning_rate:
type: dict, optional
The definition of the learning rate.
: {
    \"type\"type:
type: str, default: exp
The type of the learning rate.
: \"exp\",
    \"decay_steps\"decay_steps:
type: int, optional, default: 5000
The learning rate decays every this many training steps.
: 50,
    \"start_lr\"start_lr:
type: float, optional, default: 0.001
The learning rate at the start of the training.
: 0.001,
    \"stop_lr\"stop_lr:
type: float, optional, default: 1e-08
The desired learning rate at the end of the training.
: 3.51e-08,
    \"_comment\": \"that's all\"
  },
  \"loss\"loss:
type: dict, optional
The definition of the loss function. The loss type should be set to tensor, ener, or left unset.
: {
    \"type\"type:
type: str, default: ener
The type of the loss. When the fitting type is ener, the loss type should be set to ener or left unset. When the fitting type is dipole or polar, the loss type should be set to tensor.
: \"ener\",
    \"start_pref_e\"start_pref_e:
type: float | int, optional, default: 0.02
The prefactor of energy loss at the start of the training. Should be larger than or equal to 0. If set to a non-zero value, the energy label should be provided by the file energy.npy in each data system. If both start_pref_e and limit_pref_e are set to 0, then the energy will be ignored.
: 0.02,
    \"limit_pref_e\"limit_pref_e:
type: float | int, optional, default: 1.0
The prefactor of energy loss at the limit of the training, i.e. when the training step goes to infinity. Should be larger than or equal to 0.
: 1,
    \"start_pref_f\"start_pref_f:
type: float | int, optional, default: 1000
The prefactor of force loss at the start of the training. Should be larger than or equal to 0. If set to a non-zero value, the force label should be provided by the file force.npy in each data system. If both start_pref_f and limit_pref_f are set to 0, then the force will be ignored.
: 1000,
    \"limit_pref_f\"limit_pref_f:
type: float | int, optional, default: 1.0
The prefactor of force loss at the limit of the training, i.e. when the training step goes to infinity. Should be larger than or equal to 0.
: 1,
    \"start_pref_v\"start_pref_v:
type: float | int, optional, default: 0.0
The prefactor of virial loss at the start of the training. Should be larger than or equal to 0. If set to a non-zero value, the virial label should be provided by the file virial.npy in each data system. If both start_pref_v and limit_pref_v are set to 0, then the virial will be ignored.
: 0,
    \"limit_pref_v\"limit_pref_v:
type: float | int, optional, default: 0.0
The prefactor of virial loss at the limit of the training, i.e. when the training step goes to infinity. Should be larger than or equal to 0.
: 0,
    \"_comment\": \" that's all\"
  },
  \"training\"training:
type: dict
The training options.
: {
    \"training_data\"training_data:
type: dict, optional
Configurations of training data.
: {
      \"systems\"systems:
type: str | typing.list[str]
The data systems for training. This key can be provided with a list that specifies the systems, or with a string that gives the prefix of all systems, from which the list of systems is automatically generated.
: [
       \"../00.data/training_data\"
      ],

      \"batch_size\"batch_size:
type: str | typing.list[int] | int, optional, default: auto
This key can be
- list: the length of which is the same as the number of systems. The batch size of each system is given by the elements of the list.
- int: all systems use the same batch size.
- string \"auto\": automatically determines the batch size so that the batch_size times the number of atoms in the system is no less than 32.
- string \"auto:N\": automatically determines the batch size so that the batch_size times the number of atoms in the system is no less than N.
- string \"mixed:N\": the batch data will be sampled from all systems and merged into a mixed system with the batch size N. Only supports the se_atten descriptor.
If MPI is used, the value should be considered as the batch size per task.
: \"auto\",
      \"_comment\": \"that's all\"
    },
    \"validation_data\"validation_data:
type: NoneType | dict, optional, default: None
Configurations of validation data. Similar to that of training data, except that a numb_btch argument may be configured.
: {
      \"systems\"systems:
type: str | typing.list[str]
The data systems for validation. This key can be provided with a list that specifies the systems, or with a string that gives the prefix of all systems, from which the list of systems is automatically generated.
: [
       \"../00.data/validation_data\"
      ],

      \"batch_size\"batch_size:
type: str | typing.list[int] | int, optional, default: auto
This key can be
- list: the length of which is the same as the number of systems. The batch size of each system is given by the elements of the list.
- int: all systems use the same batch size.
- string \"auto\": automatically determines the batch size so that the batch_size times the number of atoms in the system is no less than 32.
- string \"auto:N\": automatically determines the batch size so that the batch_size times the number of atoms in the system is no less than N.
: \"auto\",
      \"numb_btch\"numb_btch:
type: int, optional, default: 1, alias: numb_batch
An integer that specifies the number of batches to be sampled for each validation period.
: 1,
      \"_comment\": \"that's all\"
    },
    \"numb_steps\"numb_steps:
type: int, alias: stop_batch
Number of training batches. Each training step uses one batch of data.
: 10000,
    \"seed\"seed:
type: NoneType | int, optional
The random seed for getting frames from the training data set.
: 10,
    \"disp_file\"disp_file:
type: str, optional, default: lcurve.out
The file for printing the learning curve.
: \"lcurve.out\",
    \"disp_freq\"disp_freq:
type: int, optional, default: 1000
The frequency of printing the learning curve.
: 200,
    \"save_freq\"save_freq:
type: int, optional, default: 1000
The frequency of saving the checkpoint.
: 1000,
    \"_comment\": \"that's all\"
  }
}
" ], "text/plain": [ "" diff --git a/doc/install/easy-install.md b/doc/install/easy-install.md index a0c6270287..99962d08b8 100644 --- a/doc/install/easy-install.md +++ b/doc/install/easy-install.md @@ -10,7 +10,7 @@ You can refer to [DeepModeling conda FAQ](https://docs.deepmodeling.com/faq/cond ::: :::{note} -Python 3.8 or above is required for Python interface. +Python 3.9 or above is required for Python interface. ::: - [Install off-line packages](#install-off-line-packages) diff --git a/doc/install/install-from-source.md b/doc/install/install-from-source.md index a725be0133..4079a8d424 100644 --- a/doc/install/install-from-source.md +++ b/doc/install/install-from-source.md @@ -21,7 +21,7 @@ deepmd_source_dir=`pwd` ### Install Backend's Python interface First, check the Python version on your machine. -Python 3.8 or above is required. +Python 3.9 or above is required. ```bash python --version @@ -95,7 +95,7 @@ deactivate If one has multiple python interpreters named something like python3.x, it can be specified by, for example ```bash -virtualenv -p python3.8 $deepmd_venv +virtualenv -p python3.9 $deepmd_venv ``` One should remember to activate the virtual environment every time he/she uses DeePMD-kit. diff --git a/doc/install/install-lammps.md b/doc/install/install-lammps.md index b43f9998a3..00b887e9c3 100644 --- a/doc/install/install-lammps.md +++ b/doc/install/install-lammps.md @@ -17,11 +17,11 @@ DeePMD-kit will generate a module called `USER-DEEPMD` in the `build` directory, ```bash cd /some/workspace -wget https://github.com/lammps/lammps/archive/stable_29Aug2024.tar.gz -tar xf stable_29Aug2024.tar.gz +wget https://github.com/lammps/lammps/archive/stable_29Aug2024_update1.tar.gz +tar xf stable_29Aug2024_update1.tar.gz ``` -The source code of LAMMPS is stored in the directory `lammps-stable_29Aug2024`. +The source code of LAMMPS is stored in the directory `lammps-stable_29Aug2024_update1`. Then, you can [build LAMMPS](https://docs.lammps.org/Build.html) with either make or CMake. @@ -30,7 +30,7 @@ Then, you can [build LAMMPS](https://docs.lammps.org/Build.html) with either mak Now go into the LAMMPS code and copy the DeePMD-kit module like this ```bash -cd lammps-stable_29Aug2024/src/ +cd lammps-stable_29Aug2024_update1/src/ cp -r $deepmd_source_dir/source/build/USER-DEEPMD . make yes-kspace make yes-extra-fix @@ -60,8 +60,8 @@ make no-user-deepmd Now go into the LAMMPS directory and create a directory called `build`: ```bash -mkdir -p lammps-stable_29Aug2024/build/ -cd lammps-stable_29Aug2024/build/ +mkdir -p lammps-stable_29Aug2024_update1/build/ +cd lammps-stable_29Aug2024_update1/build/ ``` Patch the LAMMPS `CMakeLists.txt` file: @@ -94,15 +94,15 @@ Now download the LAMMPS code (`8Apr2021` or later), and uncompress it: ```bash cd /some/workspace -wget https://github.com/lammps/lammps/archive/stable_29Aug2024.tar.gz -tar xf stable_29Aug2024.tar.gz +wget https://github.com/lammps/lammps/archive/stable_29Aug2024_update1.tar.gz +tar xf stable_29Aug2024_update1.tar.gz ``` -The source code of LAMMPS is stored in the directory `lammps-stable_29Aug2024`. The directory of the source code should be specified as the CMAKE argument `LAMMPS_SOURCE_ROOT` during installation of the DeePMD-kit C++ interface. Now go into the LAMMPS directory and create a directory called `build` +The source code of LAMMPS is stored in the directory `lammps-stable_29Aug2024_update1`. 
The directory of the source code should be specified as the CMAKE argument `LAMMPS_SOURCE_ROOT` during installation of the DeePMD-kit C++ interface. Now go into the LAMMPS directory and create a directory called `build` ```bash -mkdir -p lammps-stable_29Aug2024/build/ -cd lammps-stable_29Aug2024/build/ +mkdir -p lammps-stable_29Aug2024_update1/build/ +cd lammps-stable_29Aug2024_update1/build/ ``` Now build LAMMPS. Note that `PLUGIN` must be enabled, and `BUILD_SHARED_LIBS` must be set to `yes`. You can install any other package you want. diff --git a/doc/model/dpa2.md b/doc/model/dpa2.md index 5de30ee6b2..24ce5222e9 100644 --- a/doc/model/dpa2.md +++ b/doc/model/dpa2.md @@ -8,6 +8,16 @@ The DPA-2 model implementation. See https://arxiv.org/abs/2312.15492 for more de Training example: `examples/water/dpa2/input_torch_medium.json`, see [README](../../examples/water/dpa2/README.md) for inputs in different levels. +## Requirements of installation {{ pytorch_icon }} + +If one wants to run the DPA-2 model on LAMMPS, the customized OP library for the Python interface must be installed when [freezing the model](../freeze/freeze.md). + +The customized OP library for the Python interface can be installed by setting environment variable {envvar}`DP_ENABLE_PYTORCH` to `1` during installation. + +If one runs LAMMPS with MPI, the customized OP library for the C++ interface should be compiled against the same MPI library as the runtime MPI. +If one runs LAMMPS with MPI and CUDA devices, it is recommended to compile the customized OP library for the C++ interface with a [CUDA-Aware MPI](https://developer.nvidia.com/mpi-solutions-gpus) library and CUDA, +otherwise the communication between GPU cards falls back to the slower CPU implementation. + ## Data format DPA-2 supports both the [standard data format](../data/system.md) and the [mixed type data format](../data/system.md#mixed-type). diff --git a/doc/model/linear.md b/doc/model/linear.md index 3891559d90..47fdd1750b 100644 --- a/doc/model/linear.md +++ b/doc/model/linear.md @@ -1,7 +1,7 @@ -## Linear model {{ tensorflow_icon }} +## Linear model {{ tensorflow_icon }} {{ pytorch_icon }} :::{note} -**Supported backends**: TensorFlow {{ tensorflow_icon }} +**Supported backends**: TensorFlow {{ tensorflow_icon }}, PyTorch {{ pytorch_icon }} ::: One can linearly combine existing models with arbitrary coefficients: diff --git a/doc/third-party/lammps-command.md b/doc/third-party/lammps-command.md index 4baba00e05..6a16605bfc 100644 --- a/doc/third-party/lammps-command.md +++ b/doc/third-party/lammps-command.md @@ -4,6 +4,11 @@ See [Environment variables](../env.md) for the runtime environment variables. ::: +:::{note} +Each MPI rank can only use at most one GPU card. +See [How to control the parallelism of a job](../troubleshooting/howtoset_num_nodes.md) for details. +::: + ## units All units in LAMMPS except `lj` are supported. `lj` is not supported. diff --git a/doc/troubleshooting/howtoset_num_nodes.md b/doc/troubleshooting/howtoset_num_nodes.md index 0c547650fb..b09fb80cb6 100644 --- a/doc/troubleshooting/howtoset_num_nodes.md +++ b/doc/troubleshooting/howtoset_num_nodes.md @@ -4,11 +4,26 @@ DeePMD-kit has three levels of parallelism. To get the best performance, one should control the number of threads used by DeePMD-kit. One should make sure the product of the parallel numbers is less than or equal to the number of cores available. 
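To make the three levels concrete before the MPI section below: a hedged sketch of a per-process thread budget, assuming the `TF_INTRA_OP_PARALLELISM_THREADS`/`TF_INTER_OP_PARALLELISM_THREADS` variable names documented elsewhere in this guide (set them before the backend initializes):

```python
import os

# Assumed 8-core allocation per process: 4 threads inside each operator,
# 2 operators running concurrently, so the product stays within 8 cores.
os.environ["OMP_NUM_THREADS"] = "4"
os.environ["TF_INTRA_OP_PARALLELISM_THREADS"] = "4"
os.environ["TF_INTER_OP_PARALLELISM_THREADS"] = "2"
```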
-## MPI (optional)
+## MPI or multiprocessing (optional)

 Parallelism for MPI is optional and used for multiple nodes, multiple GPU cards, or sometimes multiple CPU cores.

-To enable MPI support for training, one should [install horovod](../install/install-from-source.md#install-horovod-and-mpi4py) in advance. Note that the parallelism mode is data parallelism, so it is not expected to see the training time per batch decreases.
+::::{tab-set}
+
+:::{tab-item} TensorFlow {{ tensorflow_icon }}
+
+To enable MPI support for training in the TensorFlow interface, one should [install horovod](../install/install-from-source.md#install-horovod-and-mpi4py) in advance.
+
+:::
+:::{tab-item} PyTorch {{ pytorch_icon }}
+
+Multiprocessing support for training in the PyTorch backend is implemented with [torchrun](https://pytorch.org/docs/stable/elastic/run.html).
+
+:::
+::::
+
+Note that the parallelism mode is data parallelism, so it is not expected that the training time per batch decreases.
+See [Parallel training](../train/parallel-training.md) for details.

 MPI support for inference is not directly supported by DeePMD-kit, but indirectly supported by third-party software. For example, [LAMMPS enables running simulations in parallel](https://docs.lammps.org/Developer_parallel.html) using the MPI parallel communication standard with distributed data. That software has to be built against MPI.

@@ -22,6 +37,8 @@ Note that `mpirun` here should be the same as the MPI used to build software. For

 Sometimes, `$num_nodes` and the nodes information can be directly given by the HPC scheduler system, if the MPI used here is the same as the MPI used to build the scheduler system. Otherwise, one has to manually assign this information.

+Each process can use at most one GPU card.

 ## Parallelism between independent operators

 For CPU devices, TensorFlow and PyTorch use multiple streams to run independent operators (OP).
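The "one GPU card per process" rule above pairs with the `gpu_rank % gpu_num` mapping used in the C++ changes later in this patch. A minimal sketch (not DeePMD-kit code) of pinning a rank to a single card via `CUDA_VISIBLE_DEVICES`:

```python
import os


def pin_gpu(rank: int, gpu_num: int) -> None:
    """Expose exactly one physical GPU to this process before any CUDA
    context is created; frameworks then see it as device 0 ("/gpu:0")."""
    os.environ["CUDA_VISIBLE_DEVICES"] = str(rank % gpu_num)


pin_gpu(rank=5, gpu_num=4)  # rank 5 on a 4-GPU node uses physical card 1
print(os.environ["CUDA_VISIBLE_DEVICES"])  # "1"
```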
diff --git a/examples/water/d3/dftd3.txt b/examples/water/d3/dftd3.txt index bbc9726134..09e5fb697a 100644 --- a/examples/water/d3/dftd3.txt +++ b/examples/water/d3/dftd3.txt @@ -97,4 +97,4 @@ 9.700000000000001066e+00 -1.186747936398473687e-05 -7.637113677130612127e-06 -5.528293849956352819e-06 9.800000000000000711e+00 -1.114523618469756001e-05 -7.174288601187318493e-06 -5.194401230658985063e-06 9.900000000000000355e+00 -1.047381249252528874e-05 -6.743886368019750717e-06 -4.883815978498405921e-06 -1.000000000000000000e+01 0.000000000000000e00e+00 0.000000000000000e00e+00 0.000000000000000e00e+00 +1.000000000000000000e+01 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 diff --git a/examples/water/d3/input_pt.json b/examples/water/d3/input_pt.json new file mode 100644 index 0000000000..c2d9304a7e --- /dev/null +++ b/examples/water/d3/input_pt.json @@ -0,0 +1,96 @@ +{ + "_comment1": " model parameters", + "model": { + "type": "linear_ener", + "weights": "sum", + "type_map": [ + "O", + "H" + ], + "models": [ + { + "descriptor": { + "type": "se_atten", + "sel": [ + 46, + 92 + ], + "rcut_smth": 0.50, + "rcut": 6.00, + "neuron": [ + 25, + 50, + 100 + ], + "resnet_dt": false, + "axis_neuron": 16, + "type_one_side": true, + "precision": "float64", + "seed": 1, + "_comment2": " that's all" + }, + "fitting_net": { + "neuron": [ + 240, + 240, + 240 + ], + "resnet_dt": true, + "precision": "float64", + "seed": 1, + "_comment3": " that's all" + }, + "_comment4": " that's all" + }, + { + "type": "pairtab", + "tab_file": "dftd3.txt", + "rcut": 10.0, + "sel": 534 + } + ] + }, + "learning_rate": { + "type": "exp", + "decay_steps": 5000, + "start_lr": 0.001, + "stop_lr": 3.51e-8, + "_comment5": "that's all" + }, + "loss": { + "type": "ener", + "start_pref_e": 0.02, + "limit_pref_e": 1, + "start_pref_f": 1000, + "limit_pref_f": 1, + "start_pref_v": 0, + "limit_pref_v": 0, + "_comment6": " that's all" + }, + "training": { + "training_data": { + "systems": [ + "../data/data_0/", + "../data/data_1/", + "../data/data_2/" + ], + "batch_size": "auto", + "_comment7": "that's all" + }, + "validation_data": { + "systems": [ + "../data/data_3" + ], + "batch_size": 1, + "numb_btch": 3, + "_comment8": "that's all" + }, + "numb_steps": 1000000, + "seed": 10, + "disp_file": "lcurve.out", + "disp_freq": 100, + "save_freq": 1000, + "_comment9": "that's all" + }, + "_comment10": "that's all" +} diff --git a/examples/water/linear/input_pt.json b/examples/water/linear/input_pt.json new file mode 100644 index 0000000000..e8d8e07136 --- /dev/null +++ b/examples/water/linear/input_pt.json @@ -0,0 +1,124 @@ +{ + "_comment1": " model parameters", + "model": { + "type": "linear_ener", + "weights": "sum", + "type_map": [ + "O", + "H" + ], + "models": [ + { + "descriptor": { + "type": "se_atten", + "sel": [ + 46, + 92 + ], + "rcut_smth": 0.50, + "rcut": 6.00, + "neuron": [ + 25, + 50, + 100 + ], + "resnet_dt": false, + "axis_neuron": 16, + "type_one_side": true, + "precision": "float64", + "seed": 1, + "_comment2": " that's all" + }, + "fitting_net": { + "neuron": [ + 240, + 240, + 240 + ], + "resnet_dt": true, + "precision": "float64", + "seed": 1, + "_comment3": " that's all" + }, + "_comment4": " that's all" + }, + { + "descriptor": { + "type": "se_atten", + "sel": [ + 46, + 92 + ], + "rcut_smth": 0.50, + "rcut": 6.00, + "neuron": [ + 25, + 50, + 100 + ], + "resnet_dt": false, + "axis_neuron": 16, + "type_one_side": true, + "precision": "float64", + "seed": 1, + "_comment2": " that's all" + }, + 
"fitting_net": { + "neuron": [ + 240, + 240, + 240 + ], + "resnet_dt": true, + "precision": "float64", + "seed": 1, + "_comment3": " that's all" + }, + "_comment4": " that's all" + } + ] + }, + "learning_rate": { + "type": "exp", + "decay_steps": 5000, + "start_lr": 0.001, + "stop_lr": 3.51e-8, + "_comment5": "that's all" + }, + "loss": { + "type": "ener", + "start_pref_e": 0.02, + "limit_pref_e": 1, + "start_pref_f": 1000, + "limit_pref_f": 1, + "start_pref_v": 0, + "limit_pref_v": 0, + "_comment6": " that's all" + }, + "training": { + "training_data": { + "systems": [ + "../data/data_0/", + "../data/data_1/", + "../data/data_2/" + ], + "batch_size": "auto", + "_comment7": "that's all" + }, + "validation_data": { + "systems": [ + "../data/data_3" + ], + "batch_size": 1, + "numb_btch": 3, + "_comment8": "that's all" + }, + "numb_steps": 1000000, + "seed": 10, + "disp_file": "lcurve.out", + "disp_freq": 100, + "save_freq": 1000, + "_comment9": "that's all" + }, + "_comment10": "that's all" +} diff --git a/examples/water/zbl/input.json b/examples/water/zbl/input.json index cb5602d92d..54586ca0cf 100644 --- a/examples/water/zbl/input.json +++ b/examples/water/zbl/input.json @@ -10,7 +10,7 @@ "H" ], "descriptor": { - "type": "se_e2_a", + "type": "se_atten_v2", "sel": [ 46, 92 diff --git a/pyproject.toml b/pyproject.toml index a1829016cb..b13dceeb07 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -28,7 +28,10 @@ classifiers = [ "Programming Language :: Python :: 3 :: Only", "Environment :: GPU :: NVIDIA CUDA :: 12 :: 12.2", "Intended Audience :: Science/Research", - "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", "License :: OSI Approved :: GNU Lesser General Public License v3 (LGPLv3)", "Topic :: Scientific/Engineering :: Artificial Intelligence", "Topic :: Scientific/Engineering :: Physics", @@ -44,14 +47,14 @@ dependencies = [ 'typing_extensions; python_version < "3.8"', 'importlib_metadata>=1.4; python_version < "3.8"', 'h5py', - "h5py>=3.6.0,<3.11.0; platform_system=='Linux' and platform_machine=='aarch64'", + "h5py>=3.6.0,!=3.11.0; platform_system=='Linux' and platform_machine=='aarch64'", 'wcmatch', 'packaging', 'ml_dtypes', 'mendeleev', 'array-api-compat', ] -requires-python = ">=3.8" +requires-python = ">=3.9" keywords = ["deepmd"] [project.entry-points."lammps.plugins"] @@ -104,7 +107,7 @@ docs = [ "sphinxcontrib-moderncmakedomain", ] lmp = [ - "lammps~=2024.8.29.0.0", + "lammps~=2024.8.29.1.0", ] ipi = [ "ipi", @@ -134,6 +137,7 @@ cu12 = [ ] jax = [ 'jax>=0.4.33;python_version>="3.10"', + 'flax>=0.8.0;python_version>="3.10"', ] [tool.deepmd_build_backend.scripts] @@ -225,7 +229,7 @@ repair-wheel-command = """delocate-wheel --require-archs {delocate_archs} -w {de [tool.cibuildwheel.macos.environment] PIP_PREFER_BINARY = "1" -DP_LAMMPS_VERSION = "stable_29Aug2024" +DP_LAMMPS_VERSION = "stable_29Aug2024_update1" DP_ENABLE_IPI = "1" DP_ENABLE_PYTORCH = "1" # for unclear reason, when enabling PyTorch, OpenMP is found accidentally @@ -261,7 +265,7 @@ before-build = [ ] [tool.cibuildwheel.linux.environment] PIP_PREFER_BINARY = "1" -DP_LAMMPS_VERSION = "stable_29Aug2024" +DP_LAMMPS_VERSION = "stable_29Aug2024_update1" DP_ENABLE_IPI = "1" DP_ENABLE_PYTORCH = "1" MPI_HOME = "/usr/lib64/mpich" diff --git a/source/api_cc/include/DeepPotPT.h b/source/api_cc/include/DeepPotPT.h index 973c02c434..4144249367 100644 --- 
a/source/api_cc/include/DeepPotPT.h +++ b/source/api_cc/include/DeepPotPT.h @@ -338,6 +338,7 @@ class DeepPotPT : public DeepPotBase { int do_message_passing; // 1:dpa2 model 0:others bool gpu_enabled; at::Tensor firstneigh_tensor; + c10::optional<torch::Tensor> mapping_tensor; torch::Dict<std::string, torch::Tensor> comm_dict; /** * @brief Translate PyTorch exceptions to the DeePMD-kit exception. diff --git a/source/api_cc/src/DataModifierTF.cc b/source/api_cc/src/DataModifierTF.cc index 324cb14098..aaa2252955 100644 --- a/source/api_cc/src/DataModifierTF.cc +++ b/source/api_cc/src/DataModifierTF.cc @@ -49,8 +49,11 @@ void DipoleChargeModifierTF::init(const std::string& model, 0.9); options.config.mutable_gpu_options()->set_allow_growth(true); DPErrcheck(DPSetDevice(gpu_rank % gpu_num)); - std::string str = "/gpu:"; - str += std::to_string(gpu_rank % gpu_num); + std::string str = "/gpu:0"; + // See + // https://github.com/tensorflow/tensorflow/blame/8fac27b486939f40bc8e362b94a16a4a8bb51869/tensorflow/core/protobuf/config.proto#L80 + options.config.mutable_gpu_options()->set_visible_device_list( + std::to_string(gpu_rank % gpu_num)); graph::SetDefaultDevice(str, graph_def); } #endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM diff --git a/source/api_cc/src/DeepPotPT.cc b/source/api_cc/src/DeepPotPT.cc index c03576635a..84629042f4 100644 --- a/source/api_cc/src/DeepPotPT.cc +++ b/source/api_cc/src/DeepPotPT.cc @@ -164,7 +164,6 @@ void DeepPotPT::compute(ENERGYVTYPE& ener, std::vector<std::int64_t> atype_64(datype.begin(), datype.end()); at::Tensor atype_Tensor = torch::from_blob(atype_64.data(), {1, nall_real}, int_option).to(device); - c10::optional<torch::Tensor> mapping_tensor; if (ago == 0) { nlist_data.copy_from_nlist(lmp_list); nlist_data.shuffle_exclude_empty(fwd_map); diff --git a/source/api_cc/src/DeepPotTF.cc b/source/api_cc/src/DeepPotTF.cc index 2c09c17a69..d7a7edfb60 100644 --- a/source/api_cc/src/DeepPotTF.cc +++ b/source/api_cc/src/DeepPotTF.cc @@ -447,8 +447,11 @@ void DeepPotTF::init(const std::string& model, 0.9); options.config.mutable_gpu_options()->set_allow_growth(true); DPErrcheck(DPSetDevice(gpu_rank % gpu_num)); - std::string str = "/gpu:"; - str += std::to_string(gpu_rank % gpu_num); + std::string str = "/gpu:0"; + // See + // https://github.com/tensorflow/tensorflow/blame/8fac27b486939f40bc8e362b94a16a4a8bb51869/tensorflow/core/protobuf/config.proto#L80 + options.config.mutable_gpu_options()->set_visible_device_list( + std::to_string(gpu_rank % gpu_num)); graph::SetDefaultDevice(str, graph_def); } #endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM diff --git a/source/api_cc/src/DeepTensorTF.cc b/source/api_cc/src/DeepTensorTF.cc index 34a47bc6f3..c69b7c018e 100644 --- a/source/api_cc/src/DeepTensorTF.cc +++ b/source/api_cc/src/DeepTensorTF.cc @@ -46,8 +46,11 @@ void DeepTensorTF::init(const std::string &model, 0.9); options.config.mutable_gpu_options()->set_allow_growth(true); DPErrcheck(DPSetDevice(gpu_rank % gpu_num)); - std::string str = "/gpu:"; - str += std::to_string(gpu_rank % gpu_num); + std::string str = "/gpu:0"; + // See + // https://github.com/tensorflow/tensorflow/blame/8fac27b486939f40bc8e362b94a16a4a8bb51869/tensorflow/core/protobuf/config.proto#L80 + options.config.mutable_gpu_options()->set_visible_device_list( + std::to_string(gpu_rank % gpu_num)); graph::SetDefaultDevice(str, graph_def); } #endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM diff --git a/source/install/build_cc.sh b/source/install/build_cc.sh index 60101eb9a8..17b5ed0de4 100755 --- a/source/install/build_cc.sh +++ b/source/install/build_cc.sh @@ -25,7 +25,7 @@ cmake -D
ENABLE_TENSORFLOW=ON \ -D CMAKE_INSTALL_PREFIX=${INSTALL_PREFIX} \ -D USE_TF_PYTHON_LIBS=TRUE \ ${CUDA_ARGS} \ - -D LAMMPS_VERSION=stable_29Aug2024 \ + -D LAMMPS_VERSION=stable_29Aug2024_update1 \ .. cmake --build . -j${NPROC} cmake --install . diff --git a/source/install/build_from_c.sh b/source/install/build_from_c.sh index ff9268f649..22739ec531 100755 --- a/source/install/build_from_c.sh +++ b/source/install/build_from_c.sh @@ -13,7 +13,7 @@ NPROC=$(nproc --all) BUILD_TMP_DIR=${SCRIPT_PATH}/../build mkdir -p ${BUILD_TMP_DIR} cd ${BUILD_TMP_DIR} -cmake -DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX} -DDEEPMD_C_ROOT=${DEEPMD_C_ROOT} -DLAMMPS_VERSION=stable_29Aug2024 .. +cmake -DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX} -DDEEPMD_C_ROOT=${DEEPMD_C_ROOT} -DLAMMPS_VERSION=stable_29Aug2024_update1 .. cmake --build . -j${NPROC} cmake --install . cmake --build . --target=lammps diff --git a/source/install/build_lammps.sh b/source/install/build_lammps.sh index a1e62691ca..add1194151 100755 --- a/source/install/build_lammps.sh +++ b/source/install/build_lammps.sh @@ -14,7 +14,7 @@ BUILD_TMP_DIR=${SCRIPT_PATH}/../build_lammps mkdir -p ${BUILD_TMP_DIR} cd ${BUILD_TMP_DIR} # download LAMMPS -LAMMPS_VERSION=stable_29Aug2024 +LAMMPS_VERSION=stable_29Aug2024_update1 if [ ! -d "lammps-${LAMMPS_VERSION}" ]; then curl -L -o lammps.tar.gz https://github.com/lammps/lammps/archive/refs/tags/${LAMMPS_VERSION}.tar.gz tar vxzf lammps.tar.gz diff --git a/source/install/build_tf.py b/source/install/build_tf.py index a65e922098..a9e1e247cd 100755 --- a/source/install/build_tf.py +++ b/source/install/build_tf.py @@ -56,8 +56,6 @@ ignore_patterns, ) from typing import ( - Dict, - List, Optional, ) @@ -225,11 +223,11 @@ class Build(metaclass=ABCMeta): """Build process.""" @abstractproperty - def resources(self) -> Dict[str, OnlineResource]: + def resources(self) -> dict[str, OnlineResource]: """Required resources.""" @abstractproperty - def dependencies(self) -> Dict[str, "Build"]: + def dependencies(self) -> dict[str, "Build"]: """Required dependencies.""" def download_all_resources(self): @@ -364,7 +362,7 @@ def _ignore_patterns(path, names): return _ignore_patterns -def call(commands: List[str], env={}, **kwargs): +def call(commands: list[str], env={}, **kwargs): """Call commands and print to screen for debug.
Raises @@ -423,14 +421,14 @@ def __init__(self, version="1.11.0") -> None: @property @lru_cache - def resources(self) -> Dict[str, OnlineResource]: + def resources(self) -> dict[str, OnlineResource]: return { "bazelisk": RESOURCES["bazelisk-" + self.version], } @property @lru_cache - def dependencies(self) -> Dict[str, Build]: + def dependencies(self) -> dict[str, Build]: return {} def build(self): @@ -449,12 +447,12 @@ class BuildNumPy(Build): @property @lru_cache - def resources(self) -> Dict[str, OnlineResource]: + def resources(self) -> dict[str, OnlineResource]: return {} @property @lru_cache - def dependencies(self) -> Dict[str, Build]: + def dependencies(self) -> dict[str, Build]: return {} @property @@ -481,12 +479,12 @@ class BuildCUDA(Build): @property @lru_cache - def resources(self) -> Dict[str, OnlineResource]: + def resources(self) -> dict[str, OnlineResource]: return {} @property @lru_cache - def dependencies(self) -> Dict[str, Build]: + def dependencies(self) -> dict[str, Build]: return {} def build(self): @@ -554,12 +552,12 @@ class BuildROCM(Build): @property @lru_cache - def resources(self) -> Dict[str, OnlineResource]: + def resources(self) -> dict[str, OnlineResource]: return {} @property @lru_cache - def dependencies(self) -> Dict[str, Build]: + def dependencies(self) -> dict[str, Build]: return {} def build(self): @@ -599,14 +597,14 @@ def __init__( @property @lru_cache - def resources(self) -> Dict[str, OnlineResource]: + def resources(self) -> dict[str, OnlineResource]: return { "tensorflow": RESOURCES["tensorflow-" + self.version], } @property @lru_cache - def dependencies(self) -> Dict[str, Build]: + def dependencies(self) -> dict[str, Build]: optional_dep = {} if self.enable_cuda: optional_dep["cuda"] = BuildCUDA() @@ -778,12 +776,12 @@ def _environments(self) -> dict: } @property - def _build_targets(self) -> List[str]: + def _build_targets(self) -> list[str]: # C++ interface return ["//tensorflow:libtensorflow_cc" + get_shlib_ext()] @property - def _build_opts(self) -> List[str]: + def _build_opts(self) -> list[str]: opts = [ "--logging=6", "--verbose_failures", @@ -798,7 +796,7 @@ def _build_opts(self) -> List[str]: return opts @property - def _bazel_opts(self) -> List[str]: + def _bazel_opts(self) -> list[str]: return [] @property @@ -826,7 +824,7 @@ def clean_package(): # interface -def env() -> Dict[str, str]: +def env() -> dict[str, str]: return { "Python": sys.executable, "CUDA": CUDA_PATH, @@ -855,12 +853,12 @@ class RawTextArgumentDefaultsHelpFormatter( pass -def parse_args(args: Optional[List[str]] = None): +def parse_args(args: Optional[list[str]] = None): """TensorFlow C++ Library Installer commandline options argument parser. Parameters ---------- - args : List[str] + args : list[str] list of command line arguments, main purpose is testing default option None takes arguments from sys.argv """ diff --git a/source/install/test_cc.sh b/source/install/test_cc.sh index ccdaf124cd..1626f36193 100755 --- a/source/install/test_cc.sh +++ b/source/install/test_cc.sh @@ -17,7 +17,7 @@ INSTALL_PREFIX=${SCRIPT_PATH}/../../dp_test BUILD_TMP_DIR=${SCRIPT_PATH}/../build_tests mkdir -p ${BUILD_TMP_DIR} cd ${BUILD_TMP_DIR} -cmake -DINSTALL_TENSORFLOW=TRUE -DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX} -DTENSORFLOW_ROOT=${INSTALL_PREFIX} -DBUILD_TESTING:BOOL=TRUE -DLAMMPS_VERSION=stable_29Aug2024 ${CUDA_ARGS} .. 
+cmake -DINSTALL_TENSORFLOW=TRUE -DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX} -DTENSORFLOW_ROOT=${INSTALL_PREFIX} -DBUILD_TESTING:BOOL=TRUE -DLAMMPS_VERSION=stable_29Aug2024_update1 ${CUDA_ARGS} .. cmake --build . -j${NPROC} cmake --install . ctest --output-on-failure diff --git a/source/install/test_cc_local.sh b/source/install/test_cc_local.sh index fdb2396a28..8ce4de4b21 100755 --- a/source/install/test_cc_local.sh +++ b/source/install/test_cc_local.sh @@ -25,7 +25,7 @@ cmake \ -D USE_TF_PYTHON_LIBS=TRUE \ -D CMAKE_INSTALL_PREFIX=${INSTALL_PREFIX} \ -D BUILD_TESTING:BOOL=TRUE \ - -D LAMMPS_VERSION=stable_29Aug2024 \ + -D LAMMPS_VERSION=stable_29Aug2024_update1 \ ${CUDA_ARGS} .. cmake --build . -j${NPROC} cmake --install . diff --git a/source/lib/include/gpu_cuda.h b/source/lib/include/gpu_cuda.h index fb467674cb..9504a95b7a 100644 --- a/source/lib/include/gpu_cuda.h +++ b/source/lib/include/gpu_cuda.h @@ -18,8 +18,10 @@ #define gpuMemset cudaMemset #define GPU_MAX_NBOR_SIZE 4096 -#define DPErrcheck(res) \ - { DPAssert((res), __FILE__, __LINE__); } +#define DPErrcheck(res) \ + { \ + DPAssert((res), __FILE__, __LINE__); \ + } inline void DPAssert(cudaError_t code, const char *file, int line, @@ -54,8 +56,10 @@ inline void DPAssert(cudaError_t code, } } -#define nborErrcheck(res) \ - { nborAssert((res), __FILE__, __LINE__); } +#define nborErrcheck(res) \ + { \ + nborAssert((res), __FILE__, __LINE__); \ + } inline void nborAssert(cudaError_t code, const char *file, int line, diff --git a/source/lib/include/gpu_rocm.h b/source/lib/include/gpu_rocm.h index fbd5e1ce3f..abb7ddfa62 100644 --- a/source/lib/include/gpu_rocm.h +++ b/source/lib/include/gpu_rocm.h @@ -20,8 +20,10 @@ #define gpuMemcpyDeviceToDevice hipMemcpyDeviceToDevice #define gpuMemset hipMemset -#define DPErrcheck(res) \ - { DPAssert((res), __FILE__, __LINE__); } +#define DPErrcheck(res) \ + { \ + DPAssert((res), __FILE__, __LINE__); \ + } inline void DPAssert(hipError_t code, const char *file, int line, @@ -39,8 +41,10 @@ inline void DPAssert(hipError_t code, } } -#define nborErrcheck(res) \ - { nborAssert((res), __FILE__, __LINE__); } +#define nborErrcheck(res) \ + { \ + nborAssert((res), __FILE__, __LINE__); \ + } inline void nborAssert(hipError_t code, const char *file, int line, diff --git a/source/lib/src/coord.cc b/source/lib/src/coord.cc index b1456bc7f1..8e759f372f 100644 --- a/source/lib/src/coord.cc +++ b/source/lib/src/coord.cc @@ -4,6 +4,7 @@ #include #include "SimulationRegion.h" +#include "errors.h" #include "neighbor_list.h" using namespace deepmd; @@ -95,6 +96,12 @@ void deepmd::compute_cell_info( } cell_info[21] = (cell_info[3 + 0]) * (cell_info[3 + 1]) * (cell_info[3 + 2]); // loc_cellnum + if (cell_info[21] <= 0) { + throw deepmd::deepmd_exception( + "loc_cellnum should be positive but is " + + std::to_string(cell_info[21]) + + ". 
You may give a PBC box with zero volume."); + } cell_info[22] = (2 * cell_info[12 + 0] + cell_info[3 + 0]) * (2 * cell_info[12 + 1] + cell_info[3 + 1]) * (2 * cell_info[12 + 2] + cell_info[3 + 2]); // total_cellnum diff --git a/source/tests/array_api_strict/__init__.py b/source/tests/array_api_strict/__init__.py new file mode 100644 index 0000000000..27785c2fd5 --- /dev/null +++ b/source/tests/array_api_strict/__init__.py @@ -0,0 +1,2 @@ +# SPDX-License-Identifier: LGPL-3.0-or-later +"""Synchronize with deepmd.jax for test purposes only.""" diff --git a/source/tests/array_api_strict/common.py b/source/tests/array_api_strict/common.py new file mode 100644 index 0000000000..28f67a97f6 --- /dev/null +++ b/source/tests/array_api_strict/common.py @@ -0,0 +1,25 @@ +# SPDX-License-Identifier: LGPL-3.0-or-later +from typing import ( + Optional, +) + +import array_api_strict +import numpy as np + + +def to_array_api_strict_array(array: Optional[np.ndarray]): + """Convert a numpy array to an array_api_strict array. + + Parameters + ---------- + array : np.ndarray + The numpy array to convert. + + Returns + ------- + Array + The array_api_strict array. + """ + if array is None: + return None + return array_api_strict.asarray(array) diff --git a/source/tests/array_api_strict/descriptor/__init__.py b/source/tests/array_api_strict/descriptor/__init__.py new file mode 100644 index 0000000000..6ceb116d85 --- /dev/null +++ b/source/tests/array_api_strict/descriptor/__init__.py @@ -0,0 +1 @@ +# SPDX-License-Identifier: LGPL-3.0-or-later diff --git a/source/tests/array_api_strict/descriptor/dpa1.py b/source/tests/array_api_strict/descriptor/dpa1.py new file mode 100644 index 0000000000..ebd688e303 --- /dev/null +++ b/source/tests/array_api_strict/descriptor/dpa1.py @@ -0,0 +1,81 @@ +# SPDX-License-Identifier: LGPL-3.0-or-later +from typing import ( + Any, +) + +from deepmd.dpmodel.descriptor.dpa1 import DescrptBlockSeAtten as DescrptBlockSeAttenDP +from deepmd.dpmodel.descriptor.dpa1 import DescrptDPA1 as DescrptDPA1DP +from deepmd.dpmodel.descriptor.dpa1 import GatedAttentionLayer as GatedAttentionLayerDP +from deepmd.dpmodel.descriptor.dpa1 import ( + NeighborGatedAttention as NeighborGatedAttentionDP, +) +from deepmd.dpmodel.descriptor.dpa1 import ( + NeighborGatedAttentionLayer as NeighborGatedAttentionLayerDP, +) + +from ..common import ( + to_array_api_strict_array, +) +from ..utils.exclude_mask import ( + PairExcludeMask, +) +from ..utils.network import ( + LayerNorm, + NativeLayer, + NetworkCollection, +) +from ..utils.type_embed import ( + TypeEmbedNet, +) + + +class GatedAttentionLayer(GatedAttentionLayerDP): + def __setattr__(self, name: str, value: Any) -> None: + if name in {"in_proj", "out_proj"}: + value = NativeLayer.deserialize(value.serialize()) + return super().__setattr__(name, value) + + +class NeighborGatedAttentionLayer(NeighborGatedAttentionLayerDP): + def __setattr__(self, name: str, value: Any) -> None: + if name == "attention_layer": + value = GatedAttentionLayer.deserialize(value.serialize()) + elif name == "attn_layer_norm": + value = LayerNorm.deserialize(value.serialize()) + return super().__setattr__(name, value) + + +class NeighborGatedAttention(NeighborGatedAttentionDP): + def __setattr__(self, name: str, value: Any) -> None: + if name == "attention_layers": + value = [ + NeighborGatedAttentionLayer.deserialize(ii.serialize()) for ii in value + ] + return super().__setattr__(name, value) + + +class DescrptBlockSeAtten(DescrptBlockSeAttenDP): + def __setattr__(self, name: str, value: Any) -> 
None: + if name in {"mean", "stddev"}: + value = to_array_api_strict_array(value) + elif name in {"embeddings", "embeddings_strip"}: + if value is not None: + value = NetworkCollection.deserialize(value.serialize()) + elif name == "dpa1_attention": + value = NeighborGatedAttention.deserialize(value.serialize()) + elif name == "env_mat": + # env_mat doesn't store any value + pass + elif name == "emask": + value = PairExcludeMask(value.ntypes, value.exclude_types) + + return super().__setattr__(name, value) + + +class DescrptDPA1(DescrptDPA1DP): + def __setattr__(self, name: str, value: Any) -> None: + if name == "se_atten": + value = DescrptBlockSeAtten.deserialize(value.serialize()) + elif name == "type_embedding": + value = TypeEmbedNet.deserialize(value.serialize()) + return super().__setattr__(name, value) diff --git a/source/tests/array_api_strict/fitting/__init__.py b/source/tests/array_api_strict/fitting/__init__.py new file mode 100644 index 0000000000..6ceb116d85 --- /dev/null +++ b/source/tests/array_api_strict/fitting/__init__.py @@ -0,0 +1 @@ +# SPDX-License-Identifier: LGPL-3.0-or-later diff --git a/source/tests/array_api_strict/fitting/fitting.py b/source/tests/array_api_strict/fitting/fitting.py new file mode 100644 index 0000000000..2e6bd9fe25 --- /dev/null +++ b/source/tests/array_api_strict/fitting/fitting.py @@ -0,0 +1,38 @@ +# SPDX-License-Identifier: LGPL-3.0-or-later +from typing import ( + Any, +) + +from deepmd.dpmodel.fitting.ener_fitting import EnergyFittingNet as EnergyFittingNetDP + +from ..common import ( + to_array_api_strict_array, +) +from ..utils.exclude_mask import ( + AtomExcludeMask, +) +from ..utils.network import ( + NetworkCollection, +) + + +def setattr_for_general_fitting(name: str, value: Any) -> Any: + if name in { + "bias_atom_e", + "fparam_avg", + "fparam_inv_std", + "aparam_avg", + "aparam_inv_std", + }: + value = to_array_api_strict_array(value) + elif name == "emask": + value = AtomExcludeMask(value.ntypes, value.exclude_types) + elif name == "nets": + value = NetworkCollection.deserialize(value.serialize()) + return value + + +class EnergyFittingNet(EnergyFittingNetDP): + def __setattr__(self, name: str, value: Any) -> None: + value = setattr_for_general_fitting(name, value) + return super().__setattr__(name, value) diff --git a/source/tests/array_api_strict/utils/__init__.py b/source/tests/array_api_strict/utils/__init__.py new file mode 100644 index 0000000000..6ceb116d85 --- /dev/null +++ b/source/tests/array_api_strict/utils/__init__.py @@ -0,0 +1 @@ +# SPDX-License-Identifier: LGPL-3.0-or-later diff --git a/source/tests/array_api_strict/utils/exclude_mask.py b/source/tests/array_api_strict/utils/exclude_mask.py new file mode 100644 index 0000000000..7f5c29e0a8 --- /dev/null +++ b/source/tests/array_api_strict/utils/exclude_mask.py @@ -0,0 +1,25 @@ +# SPDX-License-Identifier: LGPL-3.0-or-later +from typing import ( + Any, +) + +from deepmd.dpmodel.utils.exclude_mask import AtomExcludeMask as AtomExcludeMaskDP +from deepmd.dpmodel.utils.exclude_mask import PairExcludeMask as PairExcludeMaskDP + +from ..common import ( + to_array_api_strict_array, +) + + +class AtomExcludeMask(AtomExcludeMaskDP): + def __setattr__(self, name: str, value: Any) -> None: + if name in {"type_mask"}: + value = to_array_api_strict_array(value) + return super().__setattr__(name, value) + + +class PairExcludeMask(PairExcludeMaskDP): + def __setattr__(self, name: str, value: Any) -> None: + if name in {"type_mask"}: + value = to_array_api_strict_array(value) + 
return super().__setattr__(name, value) diff --git a/source/tests/array_api_strict/utils/network.py b/source/tests/array_api_strict/utils/network.py new file mode 100644 index 0000000000..42b0bb5c61 --- /dev/null +++ b/source/tests/array_api_strict/utils/network.py @@ -0,0 +1,45 @@ +# SPDX-License-Identifier: LGPL-3.0-or-later +from typing import ( + Any, + ClassVar, +) + +from deepmd.dpmodel.common import ( + NativeOP, +) +from deepmd.dpmodel.utils.network import LayerNorm as LayerNormDP +from deepmd.dpmodel.utils.network import NativeLayer as NativeLayerDP +from deepmd.dpmodel.utils.network import NetworkCollection as NetworkCollectionDP +from deepmd.dpmodel.utils.network import ( + make_embedding_network, + make_fitting_network, + make_multilayer_network, +) + +from ..common import ( + to_array_api_strict_array, +) + + +class NativeLayer(NativeLayerDP): + def __setattr__(self, name: str, value: Any) -> None: + if name in {"w", "b", "idt"}: + value = to_array_api_strict_array(value) + return super().__setattr__(name, value) + + +NativeNet = make_multilayer_network(NativeLayer, NativeOP) +EmbeddingNet = make_embedding_network(NativeNet, NativeLayer) +FittingNet = make_fitting_network(EmbeddingNet, NativeNet, NativeLayer) + + +class NetworkCollection(NetworkCollectionDP): + NETWORK_TYPE_MAP: ClassVar[dict[str, type]] = { + "network": NativeNet, + "embedding_network": EmbeddingNet, + "fitting_network": FittingNet, + } + + +class LayerNorm(LayerNormDP, NativeLayer): + pass diff --git a/source/tests/array_api_strict/utils/type_embed.py b/source/tests/array_api_strict/utils/type_embed.py new file mode 100644 index 0000000000..7551279002 --- /dev/null +++ b/source/tests/array_api_strict/utils/type_embed.py @@ -0,0 +1,22 @@ +# SPDX-License-Identifier: LGPL-3.0-or-later +from typing import ( + Any, +) + +from deepmd.dpmodel.utils.type_embed import TypeEmbedNet as TypeEmbedNetDP + +from ..common import ( + to_array_api_strict_array, +) +from ..utils.network import ( + EmbeddingNet, +) + + +class TypeEmbedNet(TypeEmbedNetDP): + def __setattr__(self, name: str, value: Any) -> None: + if name in {"econf_tebd"}: + value = to_array_api_strict_array(value) + if name in {"embedding_net"}: + value = EmbeddingNet.deserialize(value.serialize()) + return super().__setattr__(name, value) diff --git a/source/tests/common/dpmodel/array_api/test_env_mat.py b/source/tests/common/dpmodel/array_api/test_env_mat.py index d5bc7b6c18..8dfa199d53 100644 --- a/source/tests/common/dpmodel/array_api/test_env_mat.py +++ b/source/tests/common/dpmodel/array_api/test_env_mat.py @@ -1,11 +1,7 @@ # SPDX-License-Identifier: LGPL-3.0-or-later -import sys import unittest -if sys.version_info >= (3, 9): - import array_api_strict as xp -else: - raise unittest.SkipTest("array_api_strict doesn't support Python<=3.8") +import array_api_strict as xp from deepmd.dpmodel.utils.env_mat import ( compute_smooth_weight, diff --git a/source/tests/common/dpmodel/test_descriptor_dpa1.py b/source/tests/common/dpmodel/test_descriptor_dpa1.py index 317f4c3d3d..f441895f15 100644 --- a/source/tests/common/dpmodel/test_descriptor_dpa1.py +++ b/source/tests/common/dpmodel/test_descriptor_dpa1.py @@ -36,3 +36,22 @@ def test_self_consistency( mm1 = em1.call(self.coord_ext, self.atype_ext, self.nlist) for ii in [0, 1, 4]: np.testing.assert_allclose(mm0[ii], mm1[ii]) + + def test_multiple_frames(self): + rng = np.random.default_rng(GLOBAL_SEED) + nf, nloc, nnei = self.nlist.shape + davg = rng.normal(size=(self.nt, nnei, 4)) + dstd = 
rng.normal(size=(self.nt, nnei, 4)) + dstd = 0.1 + np.abs(dstd) + + em0 = DescrptDPA1(self.rcut, self.rcut_smth, self.sel, ntypes=2) + em0.davg = davg + em0.dstd = dstd + two_coord_ext = np.concatenate([self.coord_ext, self.coord_ext], axis=0) + two_atype_ext = np.concatenate([self.atype_ext, self.atype_ext], axis=0) + two_nlist = np.concatenate([self.nlist, self.nlist], axis=0) + + mm0 = em0.call(two_coord_ext, two_atype_ext, two_nlist) + for ii in [0, 1, 4]: + np.testing.assert_allclose(mm0[ii][0], mm0[ii][2], err_msg=f"{ii} 0~2") + np.testing.assert_allclose(mm0[ii][1], mm0[ii][3], err_msg=f"{ii} 1~3") diff --git a/source/tests/common/dpmodel/test_output_def.py b/source/tests/common/dpmodel/test_output_def.py index 9e8ef2940f..03ceb67d01 100644 --- a/source/tests/common/dpmodel/test_output_def.py +++ b/source/tests/common/dpmodel/test_output_def.py @@ -1,8 +1,5 @@ # SPDX-License-Identifier: LGPL-3.0-or-later import unittest -from typing import ( - List, -) import numpy as np @@ -26,7 +23,7 @@ class VariableDef: def __init__( self, name: str, - shape: List[int], + shape: list[int], atomic: bool = True, ): self.name = name diff --git a/source/tests/common/test_argument_parser.py b/source/tests/common/test_argument_parser.py index 36a2f07be5..1404185607 100644 --- a/source/tests/common/test_argument_parser.py +++ b/source/tests/common/test_argument_parser.py @@ -15,9 +15,6 @@ from typing import ( TYPE_CHECKING, Any, - Dict, - List, - Tuple, Union, ) @@ -33,13 +30,13 @@ from typing_extensions import TypedDict # python<=3.7 class DATA(TypedDict): - type: Union[type, Tuple[type]] + type: Union[type, tuple[type]] value: Any - TEST_DICT = Dict[str, DATA] + TEST_DICT = dict[str, DATA] -def build_args(args: "TEST_DICT", command: str) -> List[str]: +def build_args(args: "TEST_DICT", command: str) -> list[str]: """Build list of arguments similar to one generated by `sys.argv` used by argparse. 
Parameters @@ -51,7 +48,7 @@ Returns ------- - List[str] + list[str] arguments with options as list of strings, goal is to emulate `sys.argv` """ args_list = [command] diff --git a/source/tests/common/test_auto_batch_size.py b/source/tests/common/test_auto_batch_size.py index 0369bbb70c..cc1e6bf25a 100644 --- a/source/tests/common/test_auto_batch_size.py +++ b/source/tests/common/test_auto_batch_size.py @@ -1,8 +1,9 @@ # SPDX-License-Identifier: LGPL-3.0-or-later import os -import sys import unittest +import array_api_strict as xp + from deepmd.utils.batch_size import ( AutoBatchSize, ) @@ -10,11 +11,6 @@ OutOfMemoryError, ) -if sys.version_info >= (3, 9): - import array_api_strict as xp -else: - raise unittest.SkipTest("array_api_strict doesn't support Python<=3.8") - class CustomizedAutoBatchSizeCPU(AutoBatchSize): def is_gpu_available(self): diff --git a/source/tests/common/test_examples.py b/source/tests/common/test_examples.py index 6abb482824..246e767f01 100644 --- a/source/tests/common/test_examples.py +++ b/source/tests/common/test_examples.py @@ -34,7 +34,9 @@ p_examples / "water" / "hybrid" / "input.json", p_examples / "water" / "dplr" / "train" / "dw.json", p_examples / "water" / "dplr" / "train" / "ener.json", + p_examples / "water" / "d3" / "input_pt.json", p_examples / "water" / "linear" / "input.json", + p_examples / "water" / "linear" / "input_pt.json", p_examples / "nopbc" / "train" / "input.json", p_examples / "water_tensor" / "dipole" / "dipole_input.json", p_examples / "water_tensor" / "polar" / "polar_input.json", diff --git a/source/tests/consistent/common.py b/source/tests/consistent/common.py index e8873e528a..e3bf808978 100644 --- a/source/tests/consistent/common.py +++ b/source/tests/consistent/common.py @@ -3,6 +3,7 @@ import itertools import os import sys +import unittest from abc import ( ABC, abstractmethod, ) from enum import ( Enum, ) +from importlib.util import ( + find_spec, +) from typing import ( Any, Callable, ClassVar, - Dict, - List, Optional, - Tuple, Union, ) from uuid import ( @@ -33,9 +34,15 @@ Backend, ) +from ..utils import ( + CI, + TEST_DEVICE, +) + INSTALLED_TF = Backend.get_backend("tensorflow")().is_available() INSTALLED_PT = Backend.get_backend("pytorch")().is_available() INSTALLED_JAX = Backend.get_backend("jax")().is_available() +INSTALLED_ARRAY_API_STRICT = find_spec("array_api_strict") is not None if os.environ.get("CI") and not (INSTALLED_TF and INSTALLED_PT): raise ImportError("TensorFlow or PyTorch should be tested in the CI") @@ -59,6 +66,7 @@ "INSTALLED_TF", "INSTALLED_PT", "INSTALLED_JAX", + "INSTALLED_ARRAY_API_STRICT", ] @@ -75,7 +83,8 @@ class CommonTest(ABC): """PyTorch model class.""" jax_class: ClassVar[Optional[type]] """JAX model class.""" - args: ClassVar[Optional[Union[Argument, List[Argument]]]] + array_api_strict_class: ClassVar[Optional[type]] + args: ClassVar[Optional[Union[Argument, list[Argument]]]] """Arguments that map to the `data`.""" skip_dp: ClassVar[bool] = False """Whether to skip the native DP model.""" @@ -86,6 +95,8 @@ class CommonTest(ABC): # we may usually skip jax before jax is fully supported skip_jax: ClassVar[bool] = True """Whether to skip the JAX model.""" + skip_array_api_strict: ClassVar[bool] = True + """Whether to skip the array_api_strict model.""" rtol = 1e-10 """Relative tolerance for comparing the return value. 
Override for float32.""" atol = 1e-10 @@ -118,7 +129,7 @@ def pass_data_to_cls(self, cls, data) -> Any: return cls(**data, **self.addtional_data) @abstractmethod - def build_tf(self, obj: Any, suffix: str) -> Tuple[list, dict]: + def build_tf(self, obj: Any, suffix: str) -> tuple[list, dict]: """Build the TF graph. Parameters @@ -166,6 +177,16 @@ def eval_jax(self, jax_obj: Any) -> Any: """ raise NotImplementedError("Not implemented") + def eval_array_api_strict(self, array_api_strict_obj: Any) -> Any: + """Evaluate the return value of array_api_strict. + + Parameters + ---------- + array_api_strict_obj : Any + The object of array_api_strict + """ + raise NotImplementedError("Not implemented") + class RefBackend(Enum): """Reference backend.""" @@ -173,9 +194,10 @@ class RefBackend(Enum): DP = 2 PT = 3 JAX = 5 + ARRAY_API_STRICT = 6 @abstractmethod - def extract_ret(self, ret: Any, backend: RefBackend) -> Tuple[np.ndarray, ...]: + def extract_ret(self, ret: Any, backend: RefBackend) -> tuple[np.ndarray, ...]: """Extract the return value when comparing with other backends. Parameters @@ -193,7 +215,7 @@ def extract_ret(self, ret: Any, backend: RefBackend) -> Tuple[np.ndarray, ...]: def build_eval_tf( self, sess: "tf.Session", obj: Any, suffix: str - ) -> List[np.ndarray]: + ) -> list[np.ndarray]: """Build and evaluate the TF graph.""" t_out, feed_dict = self.build_tf(obj, suffix) @@ -238,6 +260,11 @@ def get_jax_ret_serialization_from_cls(self, obj): data = obj.serialize() return ret, data + def get_array_api_strict_ret_serialization_from_cls(self, obj): + ret = self.eval_array_api_strict(obj) + data = obj.serialize() + return ret, data + def get_reference_backend(self): """Get the reference backend. @@ -251,6 +278,8 @@ def get_reference_backend(self): return self.RefBackend.PT if not self.skip_jax: return self.RefBackend.JAX + if not self.skip_array_api_strict: + return self.RefBackend.ARRAY_API_STRICT raise ValueError("No available reference") def get_reference_ret_serialization(self, ref: RefBackend): @@ -264,6 +293,12 @@ def get_reference_ret_serialization(self, ref: RefBackend): if ref == self.RefBackend.PT: obj = self.init_backend_cls(self.pt_class) return self.get_pt_ret_serialization_from_cls(obj) + if ref == self.RefBackend.JAX: + obj = self.init_backend_cls(self.jax_class) + return self.get_jax_ret_serialization_from_cls(obj) + if ref == self.RefBackend.ARRAY_API_STRICT: + obj = self.init_backend_cls(self.array_api_strict_class) + return self.get_array_api_strict_ret_serialization_from_cls(obj) raise ValueError("No available reference") def test_tf_consistent_with_ref(self): @@ -311,6 +346,7 @@ def test_tf_self_consistent(self): np.testing.assert_allclose(rr1, rr2, rtol=self.rtol, atol=self.atol) assert rr1.dtype == rr2.dtype, f"{rr1.dtype} != {rr2.dtype}" + @unittest.skipIf(TEST_DEVICE != "cpu" and CI, "Only test on CPU.") def test_dp_consistent_with_ref(self): """Test whether DP and reference are consistent.""" if self.skip_dp: @@ -329,6 +365,7 @@ def test_dp_consistent_with_ref(self): np.testing.assert_allclose(rr1, rr2, rtol=self.rtol, atol=self.atol) assert rr1.dtype == rr2.dtype, f"{rr1.dtype} != {rr2.dtype}" + @unittest.skipIf(TEST_DEVICE != "cpu" and CI, "Only test on CPU.") def test_dp_self_consistent(self): """Test whether DP is self consistent.""" if self.skip_dp: @@ -418,6 +455,42 @@ def test_jax_self_consistent(self): else: self.assertEqual(rr1, rr2) + @unittest.skipIf(TEST_DEVICE != "cpu" and CI, "Only test on CPU.") + def 
test_array_api_strict_consistent_with_ref(self): + """Test whether array_api_strict and reference are consistent.""" + if self.skip_array_api_strict: + self.skipTest("Unsupported backend") + ref_backend = self.get_reference_backend() + if ref_backend == self.RefBackend.ARRAY_API_STRICT: + self.skipTest("Reference is self") + ret1, data1 = self.get_reference_ret_serialization(ref_backend) + ret1 = self.extract_ret(ret1, ref_backend) + array_api_strict_obj = self.array_api_strict_class.deserialize(data1) + ret2 = self.eval_array_api_strict(array_api_strict_obj) + ret2 = self.extract_ret(ret2, self.RefBackend.ARRAY_API_STRICT) + data2 = array_api_strict_obj.serialize() + np.testing.assert_equal(data1, data2) + for rr1, rr2 in zip(ret1, ret2): + np.testing.assert_allclose(rr1, rr2, rtol=self.rtol, atol=self.atol) + assert rr1.dtype == rr2.dtype, f"{rr1.dtype} != {rr2.dtype}" + + @unittest.skipIf(TEST_DEVICE != "cpu" and CI, "Only test on CPU.") + def test_array_api_strict_self_consistent(self): + """Test whether array_api_strict is self consistent.""" + if self.skip_array_api_strict: + self.skipTest("Unsupported backend") + obj1 = self.init_backend_cls(self.array_api_strict_class) + ret1, data1 = self.get_array_api_strict_ret_serialization_from_cls(obj1) + obj1 = self.array_api_strict_class.deserialize(data1) + ret2, data2 = self.get_array_api_strict_ret_serialization_from_cls(obj1) + np.testing.assert_equal(data1, data2) + for rr1, rr2 in zip(ret1, ret2): + if isinstance(rr1, np.ndarray) and isinstance(rr2, np.ndarray): + np.testing.assert_allclose(rr1, rr2, rtol=self.rtol, atol=self.atol) + assert rr1.dtype == rr2.dtype, f"{rr1.dtype} != {rr2.dtype}" + else: + self.assertEqual(rr1, rr2) + def tearDown(self) -> None: """Clear the TF session.""" if not self.skip_tf: @@ -489,7 +562,7 @@ class TestClass(base_class): def parameterize_func( func: Callable, - param_dict_list: Dict[str, Tuple], + param_dict_list: dict[str, tuple], ): """Parameterize functions with different default values. @@ -497,7 +570,7 @@ def parameterize_func( ---------- func : Callable The base function. - param_dict_list : Dict[str, Tuple] + param_dict_list : dict[str, tuple] Dictionary of parameters with default values to be changed in base function, each of which is a tuple of choices. 
Returns diff --git a/source/tests/consistent/descriptor/common.py b/source/tests/consistent/descriptor/common.py index 74fc3d9b07..e0ca30c799 100644 --- a/source/tests/consistent/descriptor/common.py +++ b/source/tests/consistent/descriptor/common.py @@ -3,6 +3,8 @@ Any, ) +import numpy as np + from deepmd.common import ( make_default_mesh, ) @@ -12,6 +14,8 @@ ) from ..common import ( + INSTALLED_ARRAY_API_STRICT, + INSTALLED_JAX, INSTALLED_PT, INSTALLED_TF, ) @@ -29,6 +33,12 @@ GLOBAL_TF_FLOAT_PRECISION, tf, ) +if INSTALLED_JAX: + from deepmd.jax.env import ( + jnp, + ) +if INSTALLED_ARRAY_API_STRICT: + import array_api_strict class DescriptorTest: @@ -99,3 +109,56 @@ def eval_pt_descriptor( x.detach().cpu().numpy() if torch.is_tensor(x) else x for x in pt_obj(ext_coords, ext_atype, nlist=nlist, mapping=mapping) ] + + def eval_jax_descriptor( + self, jax_obj: Any, natoms, coords, atype, box, mixed_types: bool = False + ) -> Any: + ext_coords, ext_atype, mapping = extend_coord_with_ghosts( + jnp.array(coords).reshape(1, -1, 3), + jnp.array(atype).reshape(1, -1), + jnp.array(box).reshape(1, 3, 3), + jax_obj.get_rcut(), + ) + nlist = build_neighbor_list( + ext_coords, + ext_atype, + natoms[0], + jax_obj.get_rcut(), + jax_obj.get_sel(), + distinguish_types=(not mixed_types), + ) + return [ + np.asarray(x) if isinstance(x, jnp.ndarray) else x + for x in jax_obj(ext_coords, ext_atype, nlist=nlist, mapping=mapping) + ] + + def eval_array_api_strict_descriptor( + self, + array_api_strict_obj: Any, + natoms, + coords, + atype, + box, + mixed_types: bool = False, + ) -> Any: + array_api_strict.set_array_api_strict_flags(api_version="2023.12") + ext_coords, ext_atype, mapping = extend_coord_with_ghosts( + array_api_strict.asarray(coords.reshape(1, -1, 3)), + array_api_strict.asarray(atype.reshape(1, -1)), + array_api_strict.asarray(box.reshape(1, 3, 3)), + array_api_strict_obj.get_rcut(), + ) + nlist = build_neighbor_list( + ext_coords, + ext_atype, + natoms[0], + array_api_strict_obj.get_rcut(), + array_api_strict_obj.get_sel(), + distinguish_types=(not mixed_types), + ) + return [ + np.asarray(x) if hasattr(x, "__array_namespace__") else x + for x in array_api_strict_obj( + ext_coords, ext_atype, nlist=nlist, mapping=mapping + ) + ] diff --git a/source/tests/consistent/descriptor/test_dpa1.py b/source/tests/consistent/descriptor/test_dpa1.py index 0f44ecaae1..ed7884adb9 100644 --- a/source/tests/consistent/descriptor/test_dpa1.py +++ b/source/tests/consistent/descriptor/test_dpa1.py @@ -3,7 +3,6 @@ from typing import ( Any, Optional, - Tuple, ) import numpy as np @@ -17,6 +16,8 @@ ) from ..common import ( + INSTALLED_ARRAY_API_STRICT, + INSTALLED_JAX, INSTALLED_PT, INSTALLED_TF, CommonTest, @@ -34,6 +35,14 @@ from deepmd.tf.descriptor.se_atten import DescrptDPA1Compat as DescrptDPA1TF else: DescrptDPA1TF = None +if INSTALLED_JAX: + from deepmd.jax.descriptor.dpa1 import DescrptDPA1 as DescriptorDPA1JAX +else: + DescriptorDPA1JAX = None +if INSTALLED_ARRAY_API_STRICT: + from ...array_api_strict.descriptor.dpa1 import DescrptDPA1 as DescriptorDPA1Strict +else: + DescriptorDPA1Strict = None from deepmd.utils.argcheck import ( descrpt_se_atten_args, ) @@ -184,6 +193,69 @@ def skip_dp(self) -> bool: temperature, ) + @property + def skip_jax(self) -> bool: + ( + tebd_dim, + tebd_input_mode, + resnet_dt, + type_one_side, + attn, + attn_layer, + attn_dotr, + excluded_types, + env_protection, + set_davg_zero, + scaling_factor, + normalize, + temperature, + ln_eps, + smooth_type_embedding, + 
concat_output_tebd, + precision, + use_econf_tebd, + use_tebd_bias, + ) = self.param + return not INSTALLED_JAX or self.is_meaningless_zero_attention_layer_tests( + attn_layer, + attn_dotr, + normalize, + temperature, + ) + + @property + def skip_array_api_strict(self) -> bool: + ( + tebd_dim, + tebd_input_mode, + resnet_dt, + type_one_side, + attn, + attn_layer, + attn_dotr, + excluded_types, + env_protection, + set_davg_zero, + scaling_factor, + normalize, + temperature, + ln_eps, + smooth_type_embedding, + concat_output_tebd, + precision, + use_econf_tebd, + use_tebd_bias, + ) = self.param + return ( + not INSTALLED_ARRAY_API_STRICT + or self.is_meaningless_zero_attention_layer_tests( + attn_layer, + attn_dotr, + normalize, + temperature, + ) + ) + @property def skip_tf(self) -> bool: ( @@ -227,6 +299,9 @@ def skip_tf(self) -> bool: tf_class = DescrptDPA1TF dp_class = DescrptDPA1DP pt_class = DescrptDPA1PT + jax_class = DescriptorDPA1JAX + array_api_strict_class = DescriptorDPA1Strict + args = descrpt_se_atten_args().append(Argument("ntypes", int, optional=False)) def setUp(self): @@ -284,7 +359,7 @@ def setUp(self): use_tebd_bias, ) = self.param - def build_tf(self, obj: Any, suffix: str) -> Tuple[list, dict]: + def build_tf(self, obj: Any, suffix: str) -> tuple[list, dict]: return self.build_tf_descriptor( obj, self.natoms, @@ -314,7 +389,27 @@ def eval_pt(self, pt_obj: Any) -> Any: mixed_types=True, ) - def extract_ret(self, ret: Any, backend) -> Tuple[np.ndarray, ...]: + def eval_jax(self, jax_obj: Any) -> Any: + return self.eval_jax_descriptor( + jax_obj, + self.natoms, + self.coords, + self.atype, + self.box, + mixed_types=True, + ) + + def eval_array_api_strict(self, array_api_strict_obj: Any) -> Any: + return self.eval_array_api_strict_descriptor( + array_api_strict_obj, + self.natoms, + self.coords, + self.atype, + self.box, + mixed_types=True, + ) + + def extract_ret(self, ret: Any, backend) -> tuple[np.ndarray, ...]: return (ret[0],) @property diff --git a/source/tests/consistent/descriptor/test_dpa2.py b/source/tests/consistent/descriptor/test_dpa2.py index 144567ae58..53f9ce4200 100644 --- a/source/tests/consistent/descriptor/test_dpa2.py +++ b/source/tests/consistent/descriptor/test_dpa2.py @@ -2,7 +2,6 @@ import unittest from typing import ( Any, - Tuple, ) import numpy as np @@ -338,7 +337,7 @@ def setUp(self): use_tebd_bias, ) = self.param - def build_tf(self, obj: Any, suffix: str) -> Tuple[list, dict]: + def build_tf(self, obj: Any, suffix: str) -> tuple[list, dict]: return self.build_tf_descriptor( obj, self.natoms, @@ -368,7 +367,7 @@ def eval_pt(self, pt_obj: Any) -> Any: mixed_types=True, ) - def extract_ret(self, ret: Any, backend) -> Tuple[np.ndarray, ...]: + def extract_ret(self, ret: Any, backend) -> tuple[np.ndarray, ...]: return (ret[0],) @property diff --git a/source/tests/consistent/descriptor/test_hybrid.py b/source/tests/consistent/descriptor/test_hybrid.py index 7cfb627d54..cd52eea5be 100644 --- a/source/tests/consistent/descriptor/test_hybrid.py +++ b/source/tests/consistent/descriptor/test_hybrid.py @@ -2,7 +2,6 @@ import unittest from typing import ( Any, - Tuple, ) import numpy as np @@ -105,7 +104,7 @@ def setUp(self): ) self.natoms = np.array([6, 6, 2, 4], dtype=np.int32) - def build_tf(self, obj: Any, suffix: str) -> Tuple[list, dict]: + def build_tf(self, obj: Any, suffix: str) -> tuple[list, dict]: return self.build_tf_descriptor( obj, self.natoms, @@ -133,5 +132,5 @@ def eval_pt(self, pt_obj: Any) -> Any: self.box, ) - def extract_ret(self, 
ret: Any, backend) -> Tuple[np.ndarray, ...]: + def extract_ret(self, ret: Any, backend) -> tuple[np.ndarray, ...]: return (ret[0],) diff --git a/source/tests/consistent/descriptor/test_se_atten_v2.py b/source/tests/consistent/descriptor/test_se_atten_v2.py index 989fdc16e7..a3fe4e98b4 100644 --- a/source/tests/consistent/descriptor/test_se_atten_v2.py +++ b/source/tests/consistent/descriptor/test_se_atten_v2.py @@ -3,7 +3,6 @@ from typing import ( Any, Optional, - Tuple, ) import numpy as np @@ -215,7 +214,7 @@ def setUp(self): ) self.natoms = np.array([6, 6, 2, 4], dtype=np.int32) - def build_tf(self, obj: Any, suffix: str) -> Tuple[list, dict]: + def build_tf(self, obj: Any, suffix: str) -> tuple[list, dict]: return self.build_tf_descriptor( obj, self.natoms, @@ -245,7 +244,7 @@ def eval_pt(self, pt_obj: Any) -> Any: mixed_types=True, ) - def extract_ret(self, ret: Any, backend) -> Tuple[np.ndarray, ...]: + def extract_ret(self, ret: Any, backend) -> tuple[np.ndarray, ...]: return (ret[0],) @property diff --git a/source/tests/consistent/descriptor/test_se_e2_a.py b/source/tests/consistent/descriptor/test_se_e2_a.py index 1e3e5ae86d..2563ee1d6d 100644 --- a/source/tests/consistent/descriptor/test_se_e2_a.py +++ b/source/tests/consistent/descriptor/test_se_e2_a.py @@ -2,7 +2,6 @@ import unittest from typing import ( Any, - Tuple, ) import numpy as np @@ -150,7 +149,7 @@ def setUp(self): self.atype = self.atype[idx] self.coords = self.coords.reshape(-1, 3)[idx].ravel() - def build_tf(self, obj: Any, suffix: str) -> Tuple[list, dict]: + def build_tf(self, obj: Any, suffix: str) -> tuple[list, dict]: return self.build_tf_descriptor( obj, self.natoms, @@ -178,7 +177,7 @@ def eval_pt(self, pt_obj: Any) -> Any: self.box, ) - def extract_ret(self, ret: Any, backend) -> Tuple[np.ndarray, ...]: + def extract_ret(self, ret: Any, backend) -> tuple[np.ndarray, ...]: return (ret[0],) @property diff --git a/source/tests/consistent/descriptor/test_se_r.py b/source/tests/consistent/descriptor/test_se_r.py index 8b835f3b5c..7103f60aa7 100644 --- a/source/tests/consistent/descriptor/test_se_r.py +++ b/source/tests/consistent/descriptor/test_se_r.py @@ -2,7 +2,6 @@ import unittest from typing import ( Any, - Tuple, ) import numpy as np @@ -121,7 +120,7 @@ def setUp(self): ) self.natoms = np.array([6, 6, 2, 4], dtype=np.int32) - def build_tf(self, obj: Any, suffix: str) -> Tuple[list, dict]: + def build_tf(self, obj: Any, suffix: str) -> tuple[list, dict]: return self.build_tf_descriptor( obj, self.natoms, @@ -149,7 +148,7 @@ def eval_pt(self, pt_obj: Any) -> Any: self.box, ) - def extract_ret(self, ret: Any, backend) -> Tuple[np.ndarray, ...]: + def extract_ret(self, ret: Any, backend) -> tuple[np.ndarray, ...]: return (ret[0],) @property diff --git a/source/tests/consistent/descriptor/test_se_t.py b/source/tests/consistent/descriptor/test_se_t.py index 7579344012..833b76f6e1 100644 --- a/source/tests/consistent/descriptor/test_se_t.py +++ b/source/tests/consistent/descriptor/test_se_t.py @@ -2,7 +2,6 @@ import unittest from typing import ( Any, - Tuple, ) import numpy as np @@ -141,7 +140,7 @@ def setUp(self): self.atype = self.atype[idx] self.coords = self.coords.reshape(-1, 3)[idx].ravel() - def build_tf(self, obj: Any, suffix: str) -> Tuple[list, dict]: + def build_tf(self, obj: Any, suffix: str) -> tuple[list, dict]: return self.build_tf_descriptor( obj, self.natoms, @@ -169,7 +168,7 @@ def eval_pt(self, pt_obj: Any) -> Any: self.box, ) - def extract_ret(self, ret: Any, backend) -> Tuple[np.ndarray, 
...]: + def extract_ret(self, ret: Any, backend) -> tuple[np.ndarray, ...]: return (ret[0],) @property diff --git a/source/tests/consistent/descriptor/test_se_t_tebd.py b/source/tests/consistent/descriptor/test_se_t_tebd.py index d9bd00aad3..3299a04c78 100644 --- a/source/tests/consistent/descriptor/test_se_t_tebd.py +++ b/source/tests/consistent/descriptor/test_se_t_tebd.py @@ -2,7 +2,6 @@ import unittest from typing import ( Any, - Tuple, ) import numpy as np @@ -187,7 +186,7 @@ def setUp(self): use_tebd_bias, ) = self.param - def build_tf(self, obj: Any, suffix: str) -> Tuple[list, dict]: + def build_tf(self, obj: Any, suffix: str) -> tuple[list, dict]: return self.build_tf_descriptor( obj, self.natoms, @@ -217,7 +216,7 @@ def eval_pt(self, pt_obj: Any) -> Any: mixed_types=True, ) - def extract_ret(self, ret: Any, backend) -> Tuple[np.ndarray, ...]: + def extract_ret(self, ret: Any, backend) -> tuple[np.ndarray, ...]: return (ret[0],) @property diff --git a/source/tests/consistent/fitting/test_dipole.py b/source/tests/consistent/fitting/test_dipole.py index 4f33d58c10..5d7be1b0e5 100644 --- a/source/tests/consistent/fitting/test_dipole.py +++ b/source/tests/consistent/fitting/test_dipole.py @@ -2,7 +2,6 @@ import unittest from typing import ( Any, - Tuple, ) import numpy as np @@ -97,7 +96,7 @@ def addtional_data(self) -> dict: "embedding_width": 30, } - def build_tf(self, obj: Any, suffix: str) -> Tuple[list, dict]: + def build_tf(self, obj: Any, suffix: str) -> tuple[list, dict]: ( resnet_dt, precision, @@ -144,7 +143,7 @@ def eval_dp(self, dp_obj: Any) -> Any: None, )["dipole"] - def extract_ret(self, ret: Any, backend) -> Tuple[np.ndarray, ...]: + def extract_ret(self, ret: Any, backend) -> tuple[np.ndarray, ...]: if backend == self.RefBackend.TF: # shape is not same ret = ret[0].reshape(-1, self.natoms[0], 1) diff --git a/source/tests/consistent/fitting/test_dos.py b/source/tests/consistent/fitting/test_dos.py index bfdf76c8ff..ada65c8ac5 100644 --- a/source/tests/consistent/fitting/test_dos.py +++ b/source/tests/consistent/fitting/test_dos.py @@ -2,7 +2,6 @@ import unittest from typing import ( Any, - Tuple, ) import numpy as np @@ -106,7 +105,7 @@ def addtional_data(self) -> dict: "mixed_types": mixed_types, } - def build_tf(self, obj: Any, suffix: str) -> Tuple[list, dict]: + def build_tf(self, obj: Any, suffix: str) -> tuple[list, dict]: ( resnet_dt, precision, @@ -158,7 +157,7 @@ def eval_dp(self, dp_obj: Any) -> Any: fparam=self.fparam if numb_fparam else None, )["dos"] - def extract_ret(self, ret: Any, backend) -> Tuple[np.ndarray, ...]: + def extract_ret(self, ret: Any, backend) -> tuple[np.ndarray, ...]: if backend == self.RefBackend.TF: # shape is not same ret = ret[0].reshape(-1, self.natoms[0], 1) diff --git a/source/tests/consistent/fitting/test_ener.py b/source/tests/consistent/fitting/test_ener.py index 157b1bab8a..ba2be1d86b 100644 --- a/source/tests/consistent/fitting/test_ener.py +++ b/source/tests/consistent/fitting/test_ener.py @@ -2,7 +2,6 @@ import unittest from typing import ( Any, - Tuple, ) import numpy as np @@ -13,6 +12,8 @@ ) from ..common import ( + INSTALLED_ARRAY_API_STRICT, + INSTALLED_JAX, INSTALLED_PT, INSTALLED_TF, CommonTest, @@ -37,6 +38,22 @@ fitting_ener, ) +if INSTALLED_JAX: + from deepmd.jax.env import ( + jnp, + ) + from deepmd.jax.fitting.fitting import EnergyFittingNet as EnerFittingJAX +else: + EnerFittingJAX = object +if INSTALLED_ARRAY_API_STRICT: + import array_api_strict + + from ...array_api_strict.fitting.fitting import ( + 
EnergyFittingNet as EnerFittingStrict, + ) +else: + EnerFittingStrict = None + @parameterized( (True, False), # resnet_dt @@ -75,9 +92,25 @@ def skip_pt(self) -> bool: ) = self.param return CommonTest.skip_pt + skip_jax = not INSTALLED_JAX + + @property + def skip_array_api_strict(self) -> bool: + ( + resnet_dt, + precision, + mixed_types, + numb_fparam, + atom_ener, + ) = self.param + # TypeError: The array_api_strict namespace does not support the dtype 'bfloat16' + return not INSTALLED_ARRAY_API_STRICT or precision == "bfloat16" + tf_class = EnerFittingTF dp_class = EnerFittingDP pt_class = EnerFittingPT + jax_class = EnerFittingJAX + array_api_strict_class = EnerFittingStrict args = fitting_ener() def setUp(self): @@ -106,7 +139,7 @@ def addtional_data(self) -> dict: "mixed_types": mixed_types, } - def build_tf(self, obj: Any, suffix: str) -> Tuple[list, dict]: + def build_tf(self, obj: Any, suffix: str) -> tuple[list, dict]: ( resnet_dt, precision, @@ -158,7 +191,40 @@ def eval_dp(self, dp_obj: Any) -> Any: fparam=self.fparam if numb_fparam else None, )["energy"] - def extract_ret(self, ret: Any, backend) -> Tuple[np.ndarray, ...]: + def eval_jax(self, jax_obj: Any) -> Any: + ( + resnet_dt, + precision, + mixed_types, + numb_fparam, + atom_ener, + ) = self.param + return np.asarray( + jax_obj( + jnp.asarray(self.inputs), + jnp.asarray(self.atype.reshape(1, -1)), + fparam=jnp.asarray(self.fparam) if numb_fparam else None, + )["energy"] + ) + + def eval_array_api_strict(self, array_api_strict_obj: Any) -> Any: + array_api_strict.set_array_api_strict_flags(api_version="2023.12") + ( + resnet_dt, + precision, + mixed_types, + numb_fparam, + atom_ener, + ) = self.param + return np.asarray( + array_api_strict_obj( + array_api_strict.asarray(self.inputs), + array_api_strict.asarray(self.atype.reshape(1, -1)), + fparam=array_api_strict.asarray(self.fparam) if numb_fparam else None, + )["energy"] + ) + + def extract_ret(self, ret: Any, backend) -> tuple[np.ndarray, ...]: if backend == self.RefBackend.TF: # shape is not same ret = ret[0].reshape(-1, self.natoms[0], 1) diff --git a/source/tests/consistent/fitting/test_polar.py b/source/tests/consistent/fitting/test_polar.py index 808514ade4..6a3465ba24 100644 --- a/source/tests/consistent/fitting/test_polar.py +++ b/source/tests/consistent/fitting/test_polar.py @@ -2,7 +2,6 @@ import unittest from typing import ( Any, - Tuple, ) import numpy as np @@ -97,7 +96,7 @@ def addtional_data(self) -> dict: "embedding_width": 30, } - def build_tf(self, obj: Any, suffix: str) -> Tuple[list, dict]: + def build_tf(self, obj: Any, suffix: str) -> tuple[list, dict]: ( resnet_dt, precision, @@ -144,7 +143,7 @@ def eval_dp(self, dp_obj: Any) -> Any: None, )["polarizability"] - def extract_ret(self, ret: Any, backend) -> Tuple[np.ndarray, ...]: + def extract_ret(self, ret: Any, backend) -> tuple[np.ndarray, ...]: if backend == self.RefBackend.TF: # shape is not same ret = ret[0].reshape(-1, self.natoms[0], 1) diff --git a/source/tests/consistent/fitting/test_property.py b/source/tests/consistent/fitting/test_property.py index 3f406d3a6b..a9fb6b694a 100644 --- a/source/tests/consistent/fitting/test_property.py +++ b/source/tests/consistent/fitting/test_property.py @@ -2,7 +2,6 @@ import unittest from typing import ( Any, - Tuple, ) import numpy as np @@ -113,7 +112,7 @@ def addtional_data(self) -> dict: "mixed_types": mixed_types, } - def build_tf(self, obj: Any, suffix: str) -> Tuple[list, dict]: + def build_tf(self, obj: Any, suffix: str) -> tuple[list, dict]: 
( resnet_dt, precision, @@ -168,7 +167,7 @@ def eval_dp(self, dp_obj: Any) -> Any: fparam=self.fparam if numb_fparam else None, )["property"] - def extract_ret(self, ret: Any, backend) -> Tuple[np.ndarray, ...]: + def extract_ret(self, ret: Any, backend) -> tuple[np.ndarray, ...]: if backend == self.RefBackend.TF: # shape is not same ret = ret[0].reshape(-1, self.natoms[0], 1) diff --git a/source/tests/consistent/model/test_ener.py b/source/tests/consistent/model/test_ener.py index c8ff9e4dcf..692e1287dc 100644 --- a/source/tests/consistent/model/test_ener.py +++ b/source/tests/consistent/model/test_ener.py @@ -2,7 +2,6 @@ import unittest from typing import ( Any, - Tuple, ) import numpy as np @@ -141,7 +140,7 @@ def setUp(self): self.atype = self.atype[:, idx_map] self.coords = self.coords[:, idx_map] - def build_tf(self, obj: Any, suffix: str) -> Tuple[list, dict]: + def build_tf(self, obj: Any, suffix: str) -> tuple[list, dict]: return self.build_tf_model( obj, self.natoms, @@ -169,7 +168,7 @@ def eval_pt(self, pt_obj: Any) -> Any: self.box, ) - def extract_ret(self, ret: Any, backend) -> Tuple[np.ndarray, ...]: + def extract_ret(self, ret: Any, backend) -> tuple[np.ndarray, ...]: # shape not matched. ravel... if backend is self.RefBackend.DP: return (ret["energy_redu"].ravel(), ret["energy"].ravel()) diff --git a/source/tests/consistent/model/test_frozen.py b/source/tests/consistent/model/test_frozen.py index e362aed511..f11a11914b 100644 --- a/source/tests/consistent/model/test_frozen.py +++ b/source/tests/consistent/model/test_frozen.py @@ -3,7 +3,6 @@ import unittest from typing import ( Any, - Tuple, ) import numpy as np @@ -121,7 +120,7 @@ def setUp(self): self.atype = self.atype[:, idx_map] self.coords = self.coords[:, idx_map] - def build_tf(self, obj: Any, suffix: str) -> Tuple[list, dict]: + def build_tf(self, obj: Any, suffix: str) -> tuple[list, dict]: return self.build_tf_model( obj, self.natoms, @@ -149,7 +148,7 @@ def eval_pt(self, pt_obj: Any) -> Any: self.box, ) - def extract_ret(self, ret: Any, backend) -> Tuple[np.ndarray, ...]: + def extract_ret(self, ret: Any, backend) -> tuple[np.ndarray, ...]: # shape not matched. ravel... 
diff --git a/source/tests/consistent/test_type_embedding.py b/source/tests/consistent/test_type_embedding.py
index c66ef0fbaa..e2836c7a6c 100644
--- a/source/tests/consistent/test_type_embedding.py
+++ b/source/tests/consistent/test_type_embedding.py
@@ -2,7 +2,6 @@
 import unittest
 from typing import (
     Any,
-    Tuple,
 )
 
 import numpy as np
@@ -13,6 +12,7 @@
 )
 
 from .common import (
+    INSTALLED_ARRAY_API_STRICT,
     INSTALLED_JAX,
     INSTALLED_PT,
     INSTALLED_TF,
@@ -38,6 +38,10 @@
     from deepmd.jax.utils.type_embed import TypeEmbedNet as TypeEmbedNetJAX
 else:
     TypeEmbedNetJAX = object
+if INSTALLED_ARRAY_API_STRICT:
+    from ..array_api_strict.utils.type_embed import TypeEmbedNet as TypeEmbedNetStrict
+else:
+    TypeEmbedNetStrict = None
 
 
 @parameterized(
@@ -72,8 +76,10 @@ def data(self) -> dict:
     dp_class = TypeEmbedNetDP
     pt_class = TypeEmbedNetPT
     jax_class = TypeEmbedNetJAX
+    array_api_strict_class = TypeEmbedNetStrict
     args = type_embedding_args()
     skip_jax = not INSTALLED_JAX
+    skip_array_api_strict = not INSTALLED_ARRAY_API_STRICT
 
     @property
     def addtional_data(self) -> dict:
@@ -96,7 +102,7 @@ def setUp(self):
 
         self.ntypes = 2
 
-    def build_tf(self, obj: Any, suffix: str) -> Tuple[list, dict]:
+    def build_tf(self, obj: Any, suffix: str) -> tuple[list, dict]:
         return [
             obj.build(
                 obj.ntypes,
@@ -121,7 +127,13 @@ def eval_jax(self, jax_obj: Any) -> Any:
             raise ValueError("Output is numpy array")
         return [np.array(x) if isinstance(x, jnp.ndarray) else x for x in (out,)]
 
-    def extract_ret(self, ret: Any, backend) -> Tuple[np.ndarray, ...]:
+    def eval_array_api_strict(self, array_api_strict_obj: Any) -> Any:
+        out = array_api_strict_obj()
+        return [
+            np.asarray(x) if hasattr(x, "__array_namespace__") else x for x in (out,)
+        ]
+
+    def extract_ret(self, ret: Any, backend) -> tuple[np.ndarray, ...]:
         return (ret[0],)
 
     @property
diff --git a/source/tests/infer/case.py b/source/tests/infer/case.py
index c1bce424c4..4a5ce638d6 100644
--- a/source/tests/infer/case.py
+++ b/source/tests/infer/case.py
@@ -27,7 +27,6 @@
     Path,
 )
 from typing import (
-    Dict,
     Optional,
 )
 
@@ -175,12 +174,12 @@ def get_model(self, suffix: str, out_file: Optional[str] = None) -> str:
 
 
 @lru_cache
-def get_cases() -> Dict[str, Case]:
+def get_cases() -> dict[str, Case]:
     """Get all test cases.
 
     Returns
     -------
-    Dict[str, Case]
+    dict[str, Case]
         A dictionary containing all test cases.
 
     Examples
diff --git a/source/tests/infer/test_models.py b/source/tests/infer/test_models.py
index 6b62e994aa..2b0f292046 100644
--- a/source/tests/infer/test_models.py
+++ b/source/tests/infer/test_models.py
@@ -153,8 +153,6 @@ def test_1frame_atm(self):
 
     def test_descriptor(self):
         _, extension = self.param
-        if extension == ".pth":
-            self.skipTest("eval_descriptor not supported for PyTorch models")
         for ii, result in enumerate(self.case.results):
             if result.descriptor is None:
                 continue
diff --git a/source/tests/pt/common.py b/source/tests/pt/common.py
index 16b343be8a..173e9d52dc 100644
--- a/source/tests/pt/common.py
+++ b/source/tests/pt/common.py
@@ -1,6 +1,5 @@
 # SPDX-License-Identifier: LGPL-3.0-or-later
 from typing import (
-    List,
     Optional,
     Union,
 )
@@ -46,7 +45,7 @@ def eval_model(
     model,
     coords: Union[np.ndarray, torch.Tensor],
     cells: Optional[Union[np.ndarray, torch.Tensor]],
-    atom_types: Union[np.ndarray, torch.Tensor, List[int]],
+    atom_types: Union[np.ndarray, torch.Tensor, list[int]],
     spins: Optional[Union[np.ndarray, torch.Tensor]] = None,
     atomic: bool = False,
     infer_batch_size: int = 2,
diff --git a/source/tests/pt/model/test_atomic_model_atomic_stat.py b/source/tests/pt/model/test_atomic_model_atomic_stat.py
index 470b01b507..6a21fc6e5a 100644
--- a/source/tests/pt/model/test_atomic_model_atomic_stat.py
+++ b/source/tests/pt/model/test_atomic_model_atomic_stat.py
@@ -5,7 +5,6 @@
     Path,
 )
 from typing import (
-    List,
     Optional,
 )
 
@@ -70,11 +69,11 @@ def serialize(self) -> dict:
         raise NotImplementedError
 
     def change_type_map(
-        self, type_map: List[str], model_with_new_type_stat=None
+        self, type_map: list[str], model_with_new_type_stat=None
     ) -> None:
         raise NotImplementedError
 
-    def get_type_map(self) -> List[str]:
+    def get_type_map(self) -> list[str]:
         raise NotImplementedError
 
     def forward(
diff --git a/source/tests/pt/model/test_atomic_model_global_stat.py b/source/tests/pt/model/test_atomic_model_global_stat.py
index 11752278e4..9ce5784bfa 100644
--- a/source/tests/pt/model/test_atomic_model_global_stat.py
+++ b/source/tests/pt/model/test_atomic_model_global_stat.py
@@ -5,7 +5,6 @@
     Path,
 )
 from typing import (
-    List,
     Optional,
 )
 
@@ -82,11 +81,11 @@ def serialize(self) -> dict:
         raise NotImplementedError
 
     def change_type_map(
-        self, type_map: List[str], model_with_new_type_stat=None
+        self, type_map: list[str], model_with_new_type_stat=None
     ) -> None:
         raise NotImplementedError
 
-    def get_type_map(self) -> List[str]:
+    def get_type_map(self) -> list[str]:
         raise NotImplementedError
 
     def forward(
diff --git a/source/tests/pt/model/test_force_grad.py b/source/tests/pt/model/test_force_grad.py
index ddc3c0bccf..d3cd11f71d 100644
--- a/source/tests/pt/model/test_force_grad.py
+++ b/source/tests/pt/model/test_force_grad.py
@@ -6,7 +6,6 @@
     Path,
 )
 from typing import (
-    List,
     Optional,
 )
 
@@ -32,7 +31,7 @@ class CheckSymmetry(DeepmdData):
     def __init__(
         self,
         sys_path: str,
-        type_map: Optional[List[str]] = None,
+        type_map: Optional[list[str]] = None,
     ):
         super().__init__(sys_path=sys_path, type_map=type_map)
         self.add("energy", 1, atomic=False, must=False, high_prec=True)
diff --git a/source/tests/pt/model/test_linear_atomic_model_stat.py b/source/tests/pt/model/test_linear_atomic_model_stat.py
index 604c82f513..49b7a3821f 100644
--- a/source/tests/pt/model/test_linear_atomic_model_stat.py
+++ b/source/tests/pt/model/test_linear_atomic_model_stat.py
@@ -5,7 +5,6 @@
     Path,
 )
 from typing import (
-    List,
     Optional,
 )
 
@@ -63,11 +62,11 @@ def serialize(self) -> dict:
         raise NotImplementedError
 
     def change_type_map(
-        self, type_map: List[str], model_with_new_type_stat=None
+        self, type_map: list[str], model_with_new_type_stat=None
     ) -> None:
         raise NotImplementedError
 
-    def get_type_map(self) -> List[str]:
+    def get_type_map(self) -> list[str]:
         raise NotImplementedError
 
     def forward(
@@ -115,11 +114,11 @@ def serialize(self) -> dict:
         raise NotImplementedError
 
     def change_type_map(
-        self, type_map: List[str], model_with_new_type_stat=None
+        self, type_map: list[str], model_with_new_type_stat=None
     ) -> None:
         raise NotImplementedError
 
-    def get_type_map(self) -> List[str]:
+    def get_type_map(self) -> list[str]:
         raise NotImplementedError
 
     def forward(
diff --git a/source/tests/pt/model/test_permutation.py b/source/tests/pt/model/test_permutation.py
index 6aec895041..2d391c7115 100644
--- a/source/tests/pt/model/test_permutation.py
+++ b/source/tests/pt/model/test_permutation.py
@@ -98,6 +98,7 @@
     "data_stat_nbatch": 20,
 }
 
+
 model_spin = {
     "type_map": ["O", "H", "B"],
     "descriptor": {
diff --git a/source/tests/pt/model/test_rotation.py b/source/tests/pt/model/test_rotation.py
index caa6385c80..cf947c30b2 100644
--- a/source/tests/pt/model/test_rotation.py
+++ b/source/tests/pt/model/test_rotation.py
@@ -5,7 +5,6 @@
     Path,
 )
 from typing import (
-    List,
     Optional,
 )
 
@@ -30,7 +29,7 @@ class CheckSymmetry(DeepmdData):
     def __init__(
         self,
         sys_path: str,
-        type_map: Optional[List[str]] = None,
+        type_map: Optional[list[str]] = None,
     ):
         super().__init__(sys_path=sys_path, type_map=type_map)
         self.add("energy", 1, atomic=False, must=False, high_prec=True)
diff --git a/source/tests/universal/common/cases/atomic_model/utils.py b/source/tests/universal/common/cases/atomic_model/utils.py
index b63563e237..bfd2e2cd5f 100644
--- a/source/tests/universal/common/cases/atomic_model/utils.py
+++ b/source/tests/universal/common/cases/atomic_model/utils.py
@@ -2,8 +2,6 @@
 from typing import (
     Any,
     Callable,
-    Dict,
-    List,
     Optional,
 )
 
@@ -21,7 +19,7 @@
 class AtomicModelTestCase:
     """Common test case for atomic model."""
 
-    expected_type_map: List[str]
+    expected_type_map: list[str]
     """Expected type map."""
     expected_rcut: float
     """Expected cut-off radius."""
@@ -29,25 +27,25 @@ class AtomicModelTestCase:
     """Expected number (dimension) of frame parameters."""
     expected_dim_aparam: int
     """Expected number (dimension) of atomic parameters."""
-    expected_sel_type: List[int]
+    expected_sel_type: list[int]
     """Expected selected atom types."""
     expected_aparam_nall: bool
     """Expected shape of atomic parameters."""
-    expected_model_output_type: List[str]
+    expected_model_output_type: list[str]
     """Expected output type for the model."""
-    model_output_equivariant: List[str]
+    model_output_equivariant: list[str]
     """Outputs that are equivariant to the input rotation."""
-    expected_sel: List[int]
+    expected_sel: list[int]
     """Expected number of neighbors."""
     expected_has_message_passing: bool
     """Expected whether having message passing."""
     forward_wrapper: Callable[[Any], Any]
     """Calss wrapper for forward method."""
-    aprec_dict: Dict[str, Optional[float]]
+    aprec_dict: dict[str, Optional[float]]
     """Dictionary of absolute precision in each test."""
-    rprec_dict: Dict[str, Optional[float]]
+    rprec_dict: dict[str, Optional[float]]
     """Dictionary of relative precision in each test."""
-    epsilon_dict: Dict[str, Optional[float]]
+    epsilon_dict: dict[str, Optional[float]]
     """Dictionary of epsilons in each test."""
 
     def test_get_type_map(self):
diff --git a/source/tests/universal/common/cases/model/model.py b/source/tests/universal/common/cases/model/model.py
index c31f5cd889..cee69d9d6c 100644
--- a/source/tests/universal/common/cases/model/model.py
+++ b/source/tests/universal/common/cases/model/model.py
@@ -28,6 +28,25 @@ def setUpClass(cls) -> None:
         cls.epsilon_dict = {}
 
 
+class LinearEnerModelTest(ModelTestCase):
+    @classmethod
+    def setUpClass(cls) -> None:
+        cls.expected_rcut = 5.0
+        cls.expected_type_map = ["O", "H"]
+        cls.expected_dim_fparam = 0
+        cls.expected_dim_aparam = 0
+        cls.expected_sel_type = [0, 1]
+        cls.expected_aparam_nall = False
+        cls.expected_model_output_type = ["energy", "mask"]
+        cls.model_output_equivariant = []
+        cls.expected_sel = [46, 92]
+        cls.expected_sel_mix = sum(cls.expected_sel)
+        cls.expected_has_message_passing = False
+        cls.aprec_dict = {}
+        cls.rprec_dict = {}
+        cls.epsilon_dict = {}
+
+
 class DipoleModelTest(ModelTestCase):
     @classmethod
     def setUpClass(cls) -> None:
diff --git a/source/tests/universal/common/cases/model/utils.py b/source/tests/universal/common/cases/model/utils.py
index 66b2e64fd3..628c415eb2 100644
--- a/source/tests/universal/common/cases/model/utils.py
+++ b/source/tests/universal/common/cases/model/utils.py
@@ -6,8 +6,6 @@
 from typing import (
     Any,
     Callable,
-    Dict,
-    List,
     Optional,
 )
 
@@ -24,6 +22,7 @@
     GLOBAL_SEED,
 )
 from .....utils import (
+    CI,
     TEST_DEVICE,
 )
 
@@ -31,7 +30,7 @@
 class ModelTestCase:
     """Common test case for model."""
 
-    expected_type_map: List[str]
+    expected_type_map: list[str]
     """Expected type map."""
     expected_rcut: float
     """Expected cut-off radius."""
@@ -39,15 +38,15 @@
     """Expected number (dimension) of frame parameters."""
     expected_dim_aparam: int
     """Expected number (dimension) of atomic parameters."""
-    expected_sel_type: List[int]
+    expected_sel_type: list[int]
     """Expected selected atom types."""
     expected_aparam_nall: bool
     """Expected shape of atomic parameters."""
-    expected_model_output_type: List[str]
+    expected_model_output_type: list[str]
     """Expected output type for the model."""
-    model_output_equivariant: List[str]
+    model_output_equivariant: list[str]
     """Outputs that are equivariant to the input rotation."""
-    expected_sel: List[int]
+    expected_sel: list[int]
     """Expected number of neighbors."""
     expected_has_message_passing: bool
     """Expected whether having message passing."""
@@ -55,11 +54,11 @@
     """Class wrapper for forward method."""
     forward_wrapper_cpu_ref: Callable[[Any], Any]
     """Convert model to CPU method."""
-    aprec_dict: Dict[str, Optional[float]]
+    aprec_dict: dict[str, Optional[float]]
     """Dictionary of absolute precision in each test."""
-    rprec_dict: Dict[str, Optional[float]]
+    rprec_dict: dict[str, Optional[float]]
     """Dictionary of relative precision in each test."""
-    epsilon_dict: Dict[str, Optional[float]]
+    epsilon_dict: dict[str, Optional[float]]
     """Dictionary of epsilons in each test."""
 
     def test_get_type_map(self):
@@ -329,7 +328,7 @@ def test_zero_forward(self):
                 continue
             np.testing.assert_allclose(rr1, rr2, atol=aprec)
 
-    @unittest.skipIf(TEST_DEVICE != "cpu", "Only test on CPU.")
+    @unittest.skipIf(TEST_DEVICE != "cpu" and CI, "Only test on CPU.")
     def test_permutation(self):
         """Test permutation."""
         if getattr(self, "skip_test_permutation", False):
@@ -415,7 +414,7 @@ def test_permutation(self):
             else:
                 raise RuntimeError(f"Unknown output key: {kk}")
 
-    @unittest.skipIf(TEST_DEVICE != "cpu", "Only test on CPU.")
+    @unittest.skipIf(TEST_DEVICE != "cpu" and CI, "Only test on CPU.")
     def test_trans(self):
         """Test translation."""
         if getattr(self, "skip_test_trans", False):
@@ -484,7 +483,7 @@ def test_trans(self):
             else:
                 raise RuntimeError(f"Unknown output key: {kk}")
 
-    @unittest.skipIf(TEST_DEVICE != "cpu", "Only test on CPU.")
+    @unittest.skipIf(TEST_DEVICE != "cpu" and CI, "Only test on CPU.")
     def test_rot(self):
         """Test rotation."""
         if getattr(self, "skip_test_rot", False):
@@ -674,7 +673,7 @@ def test_rot(self):
             else:
                 raise RuntimeError(f"Unknown output key: {kk}")
 
-    @unittest.skipIf(TEST_DEVICE != "cpu", "Only test on CPU.")
+    @unittest.skipIf(TEST_DEVICE != "cpu" and CI, "Only test on CPU.")
     def test_smooth(self):
        """Test smooth."""
         if getattr(self, "skip_test_smooth", False):
@@ -781,7 +780,7 @@ def test_smooth(self):
             else:
                 raise RuntimeError(f"Unknown output key: {kk}")
 
-    @unittest.skipIf(TEST_DEVICE != "cpu", "Only test on CPU.")
+    @unittest.skipIf(TEST_DEVICE != "cpu" and CI, "Only test on CPU.")
     def test_autodiff(self):
         """Test autodiff."""
         if getattr(self, "skip_test_autodiff", False):
@@ -921,7 +920,7 @@ def ff_cell(bb):
             # not support virial by far
             pass
 
-    @unittest.skipIf(TEST_DEVICE == "cpu", "Skip test on CPU.")
+    @unittest.skipIf(TEST_DEVICE == "cpu" and CI, "Skip test on CPU.")
     def test_device_consistence(self):
         """Test forward consistency between devices."""
         test_spin = getattr(self, "test_spin", False)
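The `skipIf` rewrites above turn an unconditional device gate into a CI-only gate: `TEST_DEVICE != "cpu" and CI` skips non-CPU runs on GitHub Actions, while a developer running the suite locally on a GPU still executes the test. A minimal sketch of the pattern — the environment handling here is a simplified stand-in for what `source/tests/utils.py` provides:

```python
import os
import unittest

# Simplified stand-ins; GitHub Actions always sets CI=true.
TEST_DEVICE = os.environ.get("TEST_DEVICE", "cpu")
CI = os.environ.get("CI") == "true"


class ExampleCase(unittest.TestCase):
    # Skips only when both conditions hold: non-CPU device *and* running in CI.
    @unittest.skipIf(TEST_DEVICE != "cpu" and CI, "Only test on CPU.")
    def test_runs_locally_on_gpu(self):
        self.assertTrue(True)
```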
"skip_test_trans", False): @@ -484,7 +483,7 @@ def test_trans(self): else: raise RuntimeError(f"Unknown output key: {kk}") - @unittest.skipIf(TEST_DEVICE != "cpu", "Only test on CPU.") + @unittest.skipIf(TEST_DEVICE != "cpu" and CI, "Only test on CPU.") def test_rot(self): """Test rotation.""" if getattr(self, "skip_test_rot", False): @@ -674,7 +673,7 @@ def test_rot(self): else: raise RuntimeError(f"Unknown output key: {kk}") - @unittest.skipIf(TEST_DEVICE != "cpu", "Only test on CPU.") + @unittest.skipIf(TEST_DEVICE != "cpu" and CI, "Only test on CPU.") def test_smooth(self): """Test smooth.""" if getattr(self, "skip_test_smooth", False): @@ -781,7 +780,7 @@ def test_smooth(self): else: raise RuntimeError(f"Unknown output key: {kk}") - @unittest.skipIf(TEST_DEVICE != "cpu", "Only test on CPU.") + @unittest.skipIf(TEST_DEVICE != "cpu" and CI, "Only test on CPU.") def test_autodiff(self): """Test autodiff.""" if getattr(self, "skip_test_autodiff", False): @@ -921,7 +920,7 @@ def ff_cell(bb): # not support virial by far pass - @unittest.skipIf(TEST_DEVICE == "cpu", "Skip test on CPU.") + @unittest.skipIf(TEST_DEVICE == "cpu" and CI, "Skip test on CPU.") def test_device_consistence(self): """Test forward consistency between devices.""" test_spin = getattr(self, "test_spin", False) diff --git a/source/tests/universal/dpmodel/atomc_model/test_atomic_model.py b/source/tests/universal/dpmodel/atomc_model/test_atomic_model.py index 4c5a2b291b..8e7324e2bc 100644 --- a/source/tests/universal/dpmodel/atomc_model/test_atomic_model.py +++ b/source/tests/universal/dpmodel/atomc_model/test_atomic_model.py @@ -26,6 +26,7 @@ parameterized, ) from ....utils import ( + CI, TEST_DEVICE, ) from ...common.cases.atomic_model.atomic_model import ( @@ -98,7 +99,7 @@ ), # fitting_class_param & class ), ) -@unittest.skipIf(TEST_DEVICE != "cpu", "Only test on CPU.") +@unittest.skipIf(TEST_DEVICE != "cpu" and CI, "Only test on CPU.") class TestEnergyAtomicModelDP(unittest.TestCase, EnerAtomicModelTest, DPTestCase): @classmethod def setUpClass(cls): @@ -165,7 +166,7 @@ def setUpClass(cls): ), # fitting_class_param & class ), ) -@unittest.skipIf(TEST_DEVICE != "cpu", "Only test on CPU.") +@unittest.skipIf(TEST_DEVICE != "cpu" and CI, "Only test on CPU.") class TestDosAtomicModelDP(unittest.TestCase, DosAtomicModelTest, DPTestCase): @classmethod def setUpClass(cls): @@ -227,7 +228,7 @@ def setUpClass(cls): ), # fitting_class_param & class ), ) -@unittest.skipIf(TEST_DEVICE != "cpu", "Only test on CPU.") +@unittest.skipIf(TEST_DEVICE != "cpu" and CI, "Only test on CPU.") class TestDipoleAtomicModelDP(unittest.TestCase, DipoleAtomicModelTest, DPTestCase): @classmethod def setUpClass(cls): @@ -290,7 +291,7 @@ def setUpClass(cls): ), # fitting_class_param & class ), ) -@unittest.skipIf(TEST_DEVICE != "cpu", "Only test on CPU.") +@unittest.skipIf(TEST_DEVICE != "cpu" and CI, "Only test on CPU.") class TestPolarAtomicModelDP(unittest.TestCase, PolarAtomicModelTest, DPTestCase): @classmethod def setUpClass(cls): @@ -351,7 +352,7 @@ def setUpClass(cls): ), # fitting_class_param & class ), ) -@unittest.skipIf(TEST_DEVICE != "cpu", "Only test on CPU.") +@unittest.skipIf(TEST_DEVICE != "cpu" and CI, "Only test on CPU.") class TestZBLAtomicModelDP(unittest.TestCase, ZBLAtomicModelTest, DPTestCase): @classmethod def setUpClass(cls): @@ -429,7 +430,7 @@ def setUpClass(cls): ), # fitting_class_param & class ), ) -@unittest.skipIf(TEST_DEVICE != "cpu", "Only test on CPU.") +@unittest.skipIf(TEST_DEVICE != "cpu" and CI, "Only test on 
CPU.") class TestPropertyAtomicModelDP(unittest.TestCase, PropertyAtomicModelTest, DPTestCase): @classmethod def setUpClass(cls): diff --git a/source/tests/universal/dpmodel/backend.py b/source/tests/universal/dpmodel/backend.py index 99170c20e1..4f624ae501 100644 --- a/source/tests/universal/dpmodel/backend.py +++ b/source/tests/universal/dpmodel/backend.py @@ -1,4 +1,8 @@ # SPDX-License-Identifier: LGPL-3.0-or-later +from functools import ( + lru_cache, +) + import numpy as np from deepmd.dpmodel.common import ( @@ -30,8 +34,15 @@ def convert_to_numpy(cls, xx: np.ndarray) -> np.ndarray: def convert_from_numpy(cls, xx: np.ndarray) -> np.ndarray: return xx + @classmethod + @lru_cache(maxsize=1) + def _get_deserialized_module(cls): + return cls.module.deserialize(cls.module.serialize()) + @property def deserialized_module(self): + if hasattr(self.__class__, "module"): + return self._get_deserialized_module() return self.module.deserialize(self.module.serialize()) @property @@ -41,3 +52,10 @@ def modules_to_test(self): self.deserialized_module, ] return modules + + @classmethod + def tearDownClass(cls): + super().tearDownClass() + if hasattr(cls, "module"): + del cls.module + cls._get_deserialized_module.cache_clear() diff --git a/source/tests/universal/dpmodel/descriptor/test_descriptor.py b/source/tests/universal/dpmodel/descriptor/test_descriptor.py index 256bea74f8..fc7ee8b075 100644 --- a/source/tests/universal/dpmodel/descriptor/test_descriptor.py +++ b/source/tests/universal/dpmodel/descriptor/test_descriptor.py @@ -26,6 +26,7 @@ GLOBAL_SEED, ) from ....utils import ( + CI, TEST_DEVICE, ) from ...common.cases.descriptor.descriptor import ( @@ -519,7 +520,7 @@ def DescriptorParamHybridMixedTTebd(ntypes, rcut, rcut_smth, sel, type_map, **kw (DescriptorParamHybridMixedTTebd, DescrptHybrid), ) # class_param & class ) -@unittest.skipIf(TEST_DEVICE != "cpu", "Only test on CPU.") +@unittest.skipIf(TEST_DEVICE != "cpu" and CI, "Only test on CPU.") class TestDescriptorDP(unittest.TestCase, DescriptorTest, DPTestCase): def setUp(self): DescriptorTest.setUp(self) diff --git a/source/tests/universal/dpmodel/fitting/test_fitting.py b/source/tests/universal/dpmodel/fitting/test_fitting.py index 393bab1707..f64faee76f 100644 --- a/source/tests/universal/dpmodel/fitting/test_fitting.py +++ b/source/tests/universal/dpmodel/fitting/test_fitting.py @@ -20,6 +20,7 @@ GLOBAL_SEED, ) from ....utils import ( + CI, TEST_DEVICE, ) from ...common.cases.fitting.fitting import ( @@ -236,7 +237,7 @@ def FittingParamProperty( ), # class_param & class (True, False), # mixed_types ) -@unittest.skipIf(TEST_DEVICE != "cpu", "Only test on CPU.") +@unittest.skipIf(TEST_DEVICE != "cpu" and CI, "Only test on CPU.") class TestFittingDP(unittest.TestCase, FittingTest, DPTestCase): def setUp(self): ((FittingParam, Fitting), self.mixed_types) = self.param diff --git a/source/tests/universal/dpmodel/model/test_model.py b/source/tests/universal/dpmodel/model/test_model.py index 66edc2d50e..265dc43c6c 100644 --- a/source/tests/universal/dpmodel/model/test_model.py +++ b/source/tests/universal/dpmodel/model/test_model.py @@ -25,6 +25,7 @@ parameterized, ) from ....utils import ( + CI, TEST_DEVICE, ) from ...common.cases.model.model import ( @@ -112,7 +113,7 @@ def skip_model_tests(test_obj): ), # fitting_class_param & class ), ) -@unittest.skipIf(TEST_DEVICE != "cpu", "Only test on CPU.") +@unittest.skipIf(TEST_DEVICE != "cpu" and CI, "Only test on CPU.") class TestEnergyModelDP(unittest.TestCase, EnerModelTest, DPTestCase): 
diff --git a/source/tests/universal/dpmodel/utils/test_type_embed.py b/source/tests/universal/dpmodel/utils/test_type_embed.py
index 67faef0a8d..ee3063af7d 100644
--- a/source/tests/universal/dpmodel/utils/test_type_embed.py
+++ b/source/tests/universal/dpmodel/utils/test_type_embed.py
@@ -6,6 +6,7 @@
 )
 
 from ....utils import (
+    CI,
     TEST_DEVICE,
 )
 from ...common.cases.utils.type_embed import (
@@ -16,7 +17,7 @@
 )
 
 
-@unittest.skipIf(TEST_DEVICE != "cpu", "Only test on CPU.")
+@unittest.skipIf(TEST_DEVICE != "cpu" and CI, "Only test on CPU.")
 class TestTypeEmbd(unittest.TestCase, TypeEmbdTest, DPTestCase):
     def setUp(self):
         TypeEmbdTest.setUp(self)
diff --git a/source/tests/universal/pt/backend.py b/source/tests/universal/pt/backend.py
index 951bf18262..5146fdc79b 100644
--- a/source/tests/universal/pt/backend.py
+++ b/source/tests/universal/pt/backend.py
@@ -1,4 +1,8 @@
 # SPDX-License-Identifier: LGPL-3.0-or-later
+from functools import (
+    lru_cache,
+)
+
 import numpy as np
 import torch
 
@@ -18,13 +22,28 @@ class PTTestCase(BackendTestCase):
     module: "torch.nn.Module"
     """PT module to test."""
 
+    @classmethod
+    @lru_cache(maxsize=1)
+    def _get_script_module(cls):
+        with torch.jit.optimized_execution(False):
+            return torch.jit.script(cls.module)
+
     @property
     def script_module(self):
+        if hasattr(self.__class__, "module"):
+            return self._get_script_module()
         with torch.jit.optimized_execution(False):
             return torch.jit.script(self.module)
 
+    @classmethod
+    @lru_cache(maxsize=1)
+    def _get_deserialized_module(cls):
+        return cls.module.deserialize(cls.module.serialize())
+
     @property
     def deserialized_module(self):
+        if hasattr(self.__class__, "module"):
+            return self._get_deserialized_module()
         return self.module.deserialize(self.module.serialize())
 
     @property
@@ -35,6 +54,14 @@ def modules_to_test(self):
         ]
         return modules
 
+    @classmethod
+    def tearDownClass(cls):
+        super().tearDownClass()
+        if hasattr(cls, "module"):
+            del cls.module
+        cls._get_deserialized_module.cache_clear()
+        cls._get_script_module.cache_clear()
+
     def test_jit(self):
         if getattr(self, "skip_test_jit", False):
             self.skipTest("Skip test jit.")
diff --git a/source/tests/universal/pt/model/test_model.py b/source/tests/universal/pt/model/test_model.py
index 41df0cf762..81c32eb94c 100644
--- a/source/tests/universal/pt/model/test_model.py
+++ b/source/tests/universal/pt/model/test_model.py
@@ -21,6 +21,7 @@
     DOSModel,
     DPZBLModel,
     EnergyModel,
+    LinearEnergyModel,
     PolarModel,
     PropertyModel,
     SpinEnergyModel,
@@ -43,6 +44,7 @@
     DipoleModelTest,
     DosModelTest,
     EnerModelTest,
+    LinearEnerModelTest,
     PolarModelTest,
     PropertyModelTest,
     SpinEnerModelTest,
@@ -803,3 +805,100 @@ def setUpClass(cls):
         cls.expected_sel_type = ft.get_sel_type()
         cls.expected_dim_fparam = ft.get_dim_fparam()
         cls.expected_dim_aparam = ft.get_dim_aparam()
+
+
+@parameterized(
+    des_parameterized=(
+        (
+            *[(param_func, DescrptDPA1) for param_func in DescriptorParamDPA1List],
+            *[(param_func, DescrptDPA2) for param_func in DescriptorParamDPA2List],
+            (DescriptorParamHybridMixed, DescrptHybrid),
+            (DescriptorParamHybridMixedTTebd, DescrptHybrid),
+        ),  # descrpt_class_param & class
+        ((FittingParamEnergy, EnergyFittingNet),),  # fitting_class_param & class
+    ),
+    fit_parameterized=(
+        (
+            (DescriptorParamDPA1, DescrptDPA1),
+            (DescriptorParamDPA2, DescrptDPA2),
+        ),  # descrpt_class_param & class
+        (
+            *[(param_func, EnergyFittingNet) for param_func in FittingParamEnergyList],
+        ),  # fitting_class_param & class
+    ),
+)
+class TestLinearEnergyModelPT(unittest.TestCase, LinearEnerModelTest, PTTestCase):
+    @property
+    def modules_to_test(self):
+        skip_test_jit = getattr(self, "skip_test_jit", False)
+        modules = PTTestCase.modules_to_test.fget(self)
+        if not skip_test_jit:
+            # for Model, we can test script module API
+            modules += [
+                self._script_module
+                if hasattr(self, "_script_module")
+                else self.script_module
+            ]
+        return modules
+
+    @classmethod
+    def setUpClass(cls):
+        LinearEnerModelTest.setUpClass()
+        (DescriptorParam, Descrpt) = cls.param[0]
+        (FittingParam, Fitting) = cls.param[1]
+        # set special precision
+        cls.aprec_dict["test_smooth"] = 1e-5
+        cls.input_dict_ds = DescriptorParam(
+            len(cls.expected_type_map),
+            cls.expected_rcut,
+            cls.expected_rcut / 2,
+            cls.expected_sel,
+            cls.expected_type_map,
+        )
+
+        # set skip tests
+        skiptest, skip_reason = skip_model_tests(cls)
+        if skiptest:
+            raise cls.skipTest(cls, skip_reason)
+
+        ds1, ds2 = Descrpt(**cls.input_dict_ds), Descrpt(**cls.input_dict_ds)
+        cls.input_dict_ft = FittingParam(
+            ntypes=len(cls.expected_type_map),
+            dim_descrpt=ds1.get_dim_out(),
+            mixed_types=ds1.mixed_types(),
+            type_map=cls.expected_type_map,
+        )
+        ft1 = Fitting(
+            **cls.input_dict_ft,
+        )
+        ft2 = Fitting(
+            **cls.input_dict_ft,
+        )
+        dp_model1 = DPAtomicModel(
+            ds1,
+            ft1,
+            type_map=cls.expected_type_map,
+        )
+        dp_model2 = DPAtomicModel(
+            ds2,
+            ft2,
+            type_map=cls.expected_type_map,
+        )
+        cls.module = LinearEnergyModel(
+            [dp_model1, dp_model2],
+            type_map=cls.expected_type_map,
+        )
+        # only test jit API once for different models
+        if (
+            DescriptorParam not in defalut_des_param
+            or FittingParam not in defalut_fit_param
+        ):
+            cls.skip_test_jit = True
+        else:
+            with torch.jit.optimized_execution(False):
+                cls._script_module = torch.jit.script(cls.module)
+        cls.output_def = cls.module.translated_output_def()
+        cls.expected_has_message_passing = ds1.has_message_passing()
+        cls.expected_dim_fparam = ft1.get_dim_fparam()
+        cls.expected_dim_aparam = ft1.get_dim_aparam()
+        cls.expected_sel_type = ft1.get_sel_type()
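`PTTestCase` applies the same per-class memoization to TorchScript, where compilation is the expensive step: `_get_script_module` scripts the module once per class, inside `torch.jit.optimized_execution(False)` so the comparisons run with graph-executor optimizations disabled. A standalone sketch with a toy module (the `Toy` class is illustrative only):

```python
import torch


class Toy(torch.nn.Module):
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return x * 2.0


module = Toy()
# Script once with the optimizing executor disabled, mirroring
# PTTestCase._get_script_module above.
with torch.jit.optimized_execution(False):
    script_module = torch.jit.script(module)

print(script_module(torch.ones(3)))  # tensor([2., 2., 2.])
```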
diff --git a/source/tests/utils.py b/source/tests/utils.py
index 694f55186e..bfb3d445af 100644
--- a/source/tests/utils.py
+++ b/source/tests/utils.py
@@ -5,3 +5,6 @@
     TEST_DEVICE = "cpu"
 else:
     TEST_DEVICE = "cuda"
+
+# see https://docs.github.com/en/actions/writing-workflows/choosing-what-your-workflow-does/store-information-in-variables#default-environment-variables
+CI = os.environ.get("CI") == "true"